From ec008a71410da8e9699d4570e47704e9fbcacfb5 Mon Sep 17 00:00:00 2001
From: Roc <30228238+sljlp@users.noreply.github.com>
Date: Mon, 10 Apr 2023 10:59:00 +0800
Subject: [PATCH 001/156] [AMP OP & Test] Tril & Triu (#52411)

---
 .../tests/unittests/test_tril_triu_op.py      | 95 ++++++++++++++++---
 1 file changed, 82 insertions(+), 13 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index aef72df282600d..c2b80f1c4a9cf8 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -14,10 +14,11 @@

 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

 import paddle
 from paddle import fluid, tensor
+from paddle.fluid import core
 from paddle.fluid.framework import Program, program_guard


@@ -49,20 +50,58 @@ def test_check_output(self):
     def test_check_grad_normal(self):
         self.check_grad(['X'], 'Out')

+    def init_dtype(self):
+        self.dtype = np.float64
+
+    def initTestCase(self):
+        self.init_dtype()
+        self.real_op_type = np.random.choice(['triu', 'tril'])
+        self.diagonal = None
+        self.X = np.arange(1, 101, dtype=self.dtype).reshape([10, -1])
+
+
+class TrilTriuOpDefaultTestFP16(TrilTriuOpDefaultTest):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    'not supported bf16',
+)
+class TrilTriuOpDefaultTestBF16(TrilTriuOpDefaultTest):
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def setUp(self):
+        super().setUp()
+        self.outputs["Out"] = convert_float_to_uint16(self.outputs["Out"])
+        self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+
     def initTestCase(self):
+        self.init_dtype()
         self.real_op_type = np.random.choice(['triu', 'tril'])
         self.diagonal = None
-        self.X = np.arange(1, 101, dtype="float64").reshape([10, -1])
+        self.X = np.arange(1, 101, dtype="float32").reshape([10, -1])
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0))
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(
+            core.CUDAPlace(0), ['X'], 'Out', numeric_grad_delta=0.05
+        )


-def case_generator(op_type, Xshape, diagonal, expected):
+def case_generator(op_type, Xshape, diagonal, expected, dtype):
     """
     Generate testcases with the params shape of X, diagonal and op_type.
     If arg`expected` is 'success', it will register an OpTest case and expect to pass.
     Otherwise, it will register an API case and check the expected failure.
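    The new arg `dtype` selects the data type the generated case runs with;
    per the registration loop below it may be 'float64', 'float16' or
    'bfloat16', and any other value raises a ValueError.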
""" - cls_name = "{}_{}_shape_{}_diag_{}".format( - expected, op_type, Xshape, diagonal + cls_name = "{}_{}_shape_{}_diag_{}_dtype_{}".format( + expected, op_type, Xshape, diagonal, dtype ) errmsg = { "diagonal: TypeError": "diagonal in {} must be a python Int".format( @@ -93,7 +132,34 @@ def initTestCase(self): self.diagonal = diagonal self.X = np.random.random(Xshape).astype("float64") - CLASS = locals()['SuccessCase' if expected == "success" else 'FailureCase'] + class SuccessCaseFP16(TrilTriuOpDefaultTestFP16): + def initTestCase(self): + self.init_dtype() + self.real_op_type = op_type + self.diagonal = diagonal + self.X = np.random.random(Xshape).astype("float16") + + class SuccessCaseBF16(TrilTriuOpDefaultTestBF16): + def initTestCase(self): + self.init_dtype() + self.real_op_type = op_type + self.diagonal = diagonal + self.X = np.random.random(Xshape).astype("float32") + + if dtype == "float64": + CLASS = locals()[ + 'SuccessCase' if expected == "success" else 'FailureCase' + ] + elif dtype == "float16": + CLASS = locals()[ + 'SuccessCaseFP16' if expected == "success" else 'FailureCase' + ] + elif dtype == "bfloat16": + CLASS = locals()[ + 'SuccessCaseBF16' if expected == "success" else 'FailureCase' + ] + else: + raise ValueError(f"Not supported dtype {dtype}") CLASS.__name__ = cls_name globals()[cls_name] = CLASS @@ -119,13 +185,16 @@ def initTestCase(self): (2020,): [None], }, } -for _op_type in ['tril', 'triu']: - for _expected, _params in cases.items(): - for _Xshape, _diaglist in _params.items(): - [ - case_generator(_op_type, _Xshape, _diagonal, _expected) - for _diagonal in _diaglist - ] +for dtype in ["float64", "float16", "bfloat16"]: + for _op_type in ['tril', 'triu']: + for _expected, _params in cases.items(): + for _Xshape, _diaglist in _params.items(): + [ + case_generator( + _op_type, _Xshape, _diagonal, _expected, dtype + ) + for _diagonal in _diaglist + ] class TestTrilTriuOpAPI(unittest.TestCase): From 90280542efee8be098f057b0487b33cc9bdded40 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 10 Apr 2023 11:08:10 +0800 Subject: [PATCH 002/156] add autogen code support for affine_grid op (#52560) * add autogen code support for affine_grid op * update op_compat.yaml for affine_grid * update op_compat.yaml for affine_grid * fix AffineGridGradInferMeta * fix CI error * update AffineGridInferMeta --- paddle/fluid/operators/affine_grid_op.cc | 288 ----------------------- paddle/phi/api/yaml/backward.yaml | 11 + paddle/phi/api/yaml/legacy_backward.yaml | 12 - paddle/phi/api/yaml/legacy_ops.yaml | 12 - paddle/phi/api/yaml/op_compat.yaml | 8 + paddle/phi/api/yaml/op_version.yaml | 8 + paddle/phi/api/yaml/ops.yaml | 12 + paddle/phi/infermeta/backward.cc | 6 +- paddle/phi/infermeta/unary.cc | 50 ++-- paddle/phi/ops/compat/affine_grid_sig.cc | 49 ---- 10 files changed, 70 insertions(+), 386 deletions(-) delete mode 100644 paddle/fluid/operators/affine_grid_op.cc delete mode 100644 paddle/phi/ops/compat/affine_grid_sig.cc diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc deleted file mode 100644 index a0cb5480d51b18..00000000000000 --- a/paddle/fluid/operators/affine_grid_op.cc +++ /dev/null @@ -1,288 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class AffineGridOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Theta"), - true, - platform::errors::NotFound( - "The input 'Theta' of AffineGridOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), - true, - platform::errors::NotFound( - "The output 'Output' of AffineGridOp is not found.")); - auto theta_dims = ctx->GetInputDim("Theta"); - PADDLE_ENFORCE_EQ( - theta_dims.size(), - 3, - platform::errors::InvalidArgument( - "The input Theta's dimensions size should be 3. But received " - "Theta's demensions size=[%d], Theta's dimensions=[%s].", - theta_dims.size(), - theta_dims)); - - auto output_shape = ctx->Attrs().Get>("output_shape"); - if (output_shape.size() == 0) { - PADDLE_ENFORCE_EQ( - ctx->HasInput("OutputShape"), - true, - platform::errors::NotFound( - "The input 'OutputShape' of AffineGridOp should not be null if " - "'output_shape' is not configured.")); - auto output_shape_dims = ctx->GetInputDim("OutputShape"); - PADDLE_ENFORCE_EQ( - output_shape_dims.size(), - 1, - platform::errors::InvalidArgument( - "The dimesions size of input OutputShape in AffineGridOp should " - "be 1. But received OutputShape's dimesions size=[%d], " - "OutputShape's dimesions=[%s]", - output_shape_dims.size(), - output_shape_dims)); - } else { - PADDLE_ENFORCE_GE(output_shape.size(), - 4, - platform::errors::InvalidArgument( - "The size of attribute 'output_shape' in " - "AffineGridOp should be >= " - "4. But received output_shape's size=[%d].", - output_shape.size())); - PADDLE_ENFORCE_LE(output_shape.size(), - 5, - platform::errors::InvalidArgument( - "The size of attribute 'output_shape' in " - "AffineGridOp should be <= " - "5. But received output_shape's size=[%d].", - output_shape.size())); - } - - PADDLE_ENFORCE_GE(theta_dims[1], - 2, - platform::errors::InvalidArgument( - "The second dimesion of input 'theta' in " - "AffineGridOp should be >= 2. " - "But received second dimesion=[%d], dimesions=[%s]", - theta_dims[1], - theta_dims)); - PADDLE_ENFORCE_LE(theta_dims[1], - 3, - platform::errors::InvalidArgument( - "The second dimesion of input 'theta' in " - "AffineGridOp should be <= 3. " - "But received second dimesion=[%d], dimesions=[%s]", - theta_dims[1], - theta_dims)); - PADDLE_ENFORCE_GE(theta_dims[2], - 3, - platform::errors::InvalidArgument( - "The third dimesion of input 'theta' in AffineGridOp " - "should be >= 3. 
" - "But received third dimesion=[%d], dimesions=[%s]", - theta_dims[2], - theta_dims)); - PADDLE_ENFORCE_LE(theta_dims[2], - 4, - platform::errors::InvalidArgument( - "The third dimesion of input 'theta' in AffineGridOp " - "should be <= 4. " - "But received third dimesion=[%d], dimesions=[%s]", - theta_dims[2], - theta_dims)); - - if (output_shape.size() == 4) { - // N * H * W * 2 - ctx->SetOutputDim("Output", phi::make_ddim({theta_dims[0], -1, -1, 2})); - } else { - // N * D * H * W * 3 - ctx->SetOutputDim("Output", - phi::make_ddim({theta_dims[0], -1, -1, -1, 3})); - } - ctx->ShareLoD("Theta", "Output"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Theta"); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "Theta", - "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. " - "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, " - "y_1)."); - AddInput("OutputShape", - "(Tensor) The shape of target image with format [N, C, H, W].") - .AsDispensable(); - AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2]."); - AddAttr("align_corners", - "(bool, default false) Whether to align the corners of input" - "and output.") - .SetDefault(true); - AddAttr>( - "output_shape", - "The target output image shape with format [N, C, H, W].") - .SetDefault(std::vector()); - - AddComment(R"DOC( - It generates a grid of (x,y) coordinates using the parameters of the - affine transformation that correspond to a set of points where the input - feature map should be sampled to produce the transformed output feature map. - - Given: - Theta = [[[x_11, x_12, x_13] - [x_14, x_15, x_16]] - [[x_21, x_22, x_23] - [x_24, x_25, x_26]]] - - OutputShape = [2, 3, 5, 5] - - Step 1: - - Generate relative coordinates according to OutputShape. - The values of relative coordinates are in the interval between -1 and 1. - The shape of the relative coordinates is [2, H, W] as below: - - C = [[[-1. -1. -1. -1. -1. ] - [-0.5 -0.5 -0.5 -0.5 -0.5] - [ 0. 0. 0. 0. 0. ] - [ 0.5 0.5 0.5 0.5 0.5] - [ 1. 1. 1. 1. 1. ]] - [[-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ] - [-1. -0.5 0. 0.5 1. ]]] - C[0] is the coordinates in height axis and C[1] is the coordinates in - width axis. - - Step2: - Tanspose and reshape C to shape [H * W, 2] and append ones to last - dimension. The we get: - C_ = [[-1. -1. 1. ] - [-0.5 -1. 1. ] - [ 0. -1. 1. ] - [ 0.5 -1. 1. ] - [ 1. -1. 1. ] - [-1. -0.5 1. ] - [-0.5 -0.5 1. ] - [ 0. -0.5 1. ] - [ 0.5 -0.5 1. ] - [ 1. -0.5 1. ] - [-1. 0. 1. ] - [-0.5 0. 1. ] - [ 0. 0. 1. ] - [ 0.5 0. 1. ] - [ 1. 0. 1. ] - [-1. 0.5 1. ] - [-0.5 0.5 1. ] - [ 0. 0.5 1. ] - [ 0.5 0.5 1. ] - [ 1. 0.5 1. ] - [-1. 1. 1. ] - [-0.5 1. 1. ] - [ 0. 1. 1. ] - [ 0.5 1. 1. ] - [ 1. 1. 1. 
]] - Step3: - Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ - )DOC"); - } -}; - -class AffineGridOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Theta"))) { - auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); - if (output_dims.size() == 4) { - ctx->SetOutputDim(framework::GradVarName("Theta"), - {output_dims[0], 2, 3}); - } else { - ctx->SetOutputDim(framework::GradVarName("Theta"), - {output_dims[0], 3, 4}); - } - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Output")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -template -class AffineGridGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("affine_grid_grad"); - op->SetInput("OutputShape", this->Input("OutputShape")); - op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output")); - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("Theta"), this->InputGrad("Theta")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(affine_grid, - ops::AffineGridOp, - ops::AffineGridOpMaker, - ops::AffineGridGradMaker, - ops::AffineGridGradMaker); - -REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); - -REGISTER_OP_VERSION(affine_grid) - .AddCheckpoint( - R"ROC( - Compatible upgrade of affine_grid, add a new attribute [align_corners])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "align_corners", - "Whether to align the corners of input and output.", - true)); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index b8244915369557..b046f5830ad4c8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -33,6 +33,17 @@ kernel : func : addmm_grad +- backward_op : affine_grid_grad + forward : affine_grid (Tensor input, IntArray output_shape={}, bool align_corners=true) -> Tensor(output) + args : (Tensor input, Tensor output_grad, IntArray output_shape, bool align_corners=true) + output : Tensor(input_grad) + infer_meta : + func : AffineGridGradInferMeta + param : [output_grad, output_shape, align_corners] + kernel : + func : affine_grid_grad + param : [output_grad, output_shape, align_corners] + - backward_op : angle_grad forward : angle (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index d60f2b70597de1..037a2f94862bce 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -58,18 +58,6 @@ func : add_triple_grad inplace : (grad_grad_out_grad -> grad_grad_x_grad) -- backward_op : affine_grid_grad - forward : affine_grid (Tensor input, IntArray outputShape, bool align_corners=true) -> Tensor(output) - args : (Tensor input, Tensor output_grad, IntArray outputShape, bool align_corners=true) - output : Tensor(input_grad) - infer_meta : - func : AffineGridGradInferMeta - param : [output_grad, outputShape, align_corners] - kernel : - func : affine_grid_grad - param : [output_grad, 
outputShape, align_corners] - no_need_buffer : input - - backward_op : amax_grad forward: amax (Tensor x, int64_t[] axis={}, bool keepdim=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis={}, bool keepdim=false, bool reduce_all=false) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 92f522ca886735..aa942970c8b17e 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -83,18 +83,6 @@ invoke : add_n_impl(inputs) backward : add_n_grad -- op : affine_grid - args : (Tensor input, IntArray outputShape, bool align_corners=true) - output : Tensor - infer_meta : - func : AffineGridInferMeta - param : [input, outputShape, align_corners] - kernel : - func : affine_grid - param : [input, outputShape, align_corners] - data_type : input - backward : affine_grid_grad - - op : all args : (Tensor x, int64_t[] axis={}, bool keepdim=false) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index d8f7ec7cc5a0f7..63cca1670fe9bc 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -63,6 +63,14 @@ - op : affine_grid backward : affine_grid_grad + inputs : + input : Theta + outputs : + out : Output + int_array: + output_shape : + data_type : int + tensor_name : OutputShape extra : attrs : [bool use_cudnn = true] diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index 66a8bb9e6bb646..8014103fad92af 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -1,3 +1,11 @@ +- op : affine_grid + version : + - checkpoint : Compatible upgrade of affine_grid, add a new attribute [align_corners]. + action : + - add_attr : align_corners + comment : Whether to align the corners of input and output. + default : "true" + - op : allclose version : - checkpoint : Upgrade allclose, add two new inputs [Rtol] and [Atol]. 
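# Note: the affine_grid entry added above migrates the REGISTER_OP_VERSION
# checkpoint (the align_corners attribute upgrade) out of the deleted
# affine_grid_op.cc into this yaml-based op-version registry.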
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 3bbd9ac4f4c183..1870de32b49efd 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -42,6 +42,18 @@ data_type : x backward : addmm_grad +- op : affine_grid + args : (Tensor input, IntArray output_shape={}, bool align_corners=true) + output : Tensor + infer_meta : + func : AffineGridInferMeta + param : [input, output_shape, align_corners] + kernel : + func : affine_grid + param : [input, output_shape, align_corners] + data_type : input + backward : affine_grid_grad + - op : allclose args : (Tensor x, Tensor y, Scalar rtol="1e-5", Scalar atol="1e-8", bool equal_nan=false) output : Tensor(out) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 185cc09260d3f2..427bc51ab0dd8f 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -25,7 +25,11 @@ void AffineGridGradInferMeta(const MetaTensor& output_grad, MetaTensor* input_grad) { if (input_grad) { auto output_dims = output_grad.dims(); - input_grad->set_dims(phi::make_ddim({output_dims[0], 2, 3})); + if (output_dims.size() == 4) { + input_grad->set_dims(phi::make_ddim({output_dims[0], 2, 3})); + } else { + input_grad->set_dims(phi::make_ddim({output_dims[0], 3, 4})); + } } } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 077fc851bd88a6..48fa20f0fb2299 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -49,31 +49,33 @@ void AffineGridInferMeta(const MetaTensor& input, bool align_corners, MetaTensor* output) { auto theta_dims = input.dims(); - PADDLE_ENFORCE_EQ( - theta_dims.size(), - 3, - phi::errors::InvalidArgument( - "The input Theta's dimensions size should be 3. But received " - "Theta's demensions size=[%d], Theta's dimensions=[%s].", - theta_dims.size(), - theta_dims)); - - PADDLE_ENFORCE_GE( - outputShape.GetData().size(), - 4, - phi::errors::InvalidArgument( - "The size of attribute 'output_shape' in AffineGridOp should be >= " - "4. But received output_shape's size=[%d].", - outputShape.GetData().size())); + bool is_from_tensor = outputShape.FromTensor(); + if (!is_from_tensor) { + PADDLE_ENFORCE_EQ( + theta_dims.size(), + 3, + phi::errors::InvalidArgument( + "The input Theta's dimensions size should be 3. But received " + "Theta's demensions size=[%d], Theta's dimensions=[%s].", + theta_dims.size(), + theta_dims)); - PADDLE_ENFORCE_LE( - outputShape.GetData().size(), - 5, - phi::errors::InvalidArgument( - "The size of attribute 'output_shape' in AffineGridOp should be <= " - "5. But received output_shape's size=[%d].", - outputShape.GetData().size())); + PADDLE_ENFORCE_GE( + outputShape.GetData().size(), + 4, + phi::errors::InvalidArgument( + "The size of attribute 'output_shape' in AffineGridOp should be >= " + "4. But received output_shape's size=[%d].", + outputShape.GetData().size())); + PADDLE_ENFORCE_LE( + outputShape.GetData().size(), + 5, + phi::errors::InvalidArgument( + "The size of attribute 'output_shape' in AffineGridOp should be <= " + "5. 
But received output_shape's size=[%d].", + outputShape.GetData().size())); + } PADDLE_ENFORCE_GE(theta_dims[1], 2, phi::errors::InvalidArgument( @@ -109,7 +111,7 @@ void AffineGridInferMeta(const MetaTensor& input, "But received third dimesion=[%d], dimesions=[%s]", theta_dims[2], theta_dims)); - if (outputShape.GetData().size() == 4) { + if (outputShape.GetData().size() == 4 && !is_from_tensor) { // N * H * W * 2 output->set_dims(phi::make_ddim({theta_dims[0], -1, -1, 2})); } else { diff --git a/paddle/phi/ops/compat/affine_grid_sig.cc b/paddle/phi/ops/compat/affine_grid_sig.cc deleted file mode 100644 index 2506b4b7557189..00000000000000 --- a/paddle/phi/ops/compat/affine_grid_sig.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" -namespace phi { - -KernelSignature AffineGridOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.HasInput("OutputShape")) { - return KernelSignature( - "affine_grid", {"Theta"}, {"OutputShape", "align_corners"}, {"Output"}); - } else { - return KernelSignature("affine_grid", - {"Theta"}, - {"output_shape", "align_corners"}, - {"Output"}); - } -} - -KernelSignature AffineGridGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - if (ctx.HasInput("OutputShape")) { - return KernelSignature("affine_grid_grad", - {"Output@GRAD"}, - {"OutputShape", "align_corners"}, - {"Theta@GRAD"}); - } else { - return KernelSignature("affine_grid_grad", - {"Output@GRAD"}, - {"output_shape", "align_corners"}, - {"Theta@GRAD"}); - } -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(affine_grid, phi::AffineGridOpArgumentMapping); - -PD_REGISTER_ARG_MAPPING_FN(affine_grid_grad, - phi::AffineGridGradOpArgumentMapping); From 3cbcaf1afd10ce75422f5494d07e760603270cab Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 10 Apr 2023 11:17:19 +0800 Subject: [PATCH 003/156] add tensor_utils.h into all.h (#52600) --- paddle/phi/api/all.h | 1 + paddle/phi/api/yaml/fused_backward.yaml | 2 +- paddle/phi/api/yaml/fused_ops.yaml | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 1dd908991f4ab8..b70119e1e4e5df 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -30,6 +30,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/fused_api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/include/tensor_utils.h" // phi common headers #include "paddle/phi/common/backend.h" diff --git a/paddle/phi/api/yaml/fused_backward.yaml b/paddle/phi/api/yaml/fused_backward.yaml index bdea08a1dd6a80..5bddb16ae9ba24 100644 --- a/paddle/phi/api/yaml/fused_backward.yaml +++ b/paddle/phi/api/yaml/fused_backward.yaml @@ -1,6 +1,6 @@ # This file is designed for fusion C++ backward operators, which manages the # generated code for static mode and dynamic mode (when `support_dygraph_mode` is true). -# "support_dygraph_mode" is and extra configuration item in this file, +# "support_dygraph_mode" is an extra configuration item in this file, # if one operator have "support_dygraph_mode : true", it supports dygraph mode, # otherwise the operator only could be used in static mode. diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 452983351daddb..c9fae2a81e3b74 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -1,6 +1,6 @@ # This file is designed for fusion C++ farward operators, which manages the # generated code for static mode and dynamic mode (when `support_dygraph_mode` is true). -# "support_dygraph_mode" is and extra configuration item in this file, +# "support_dygraph_mode" is an extra configuration item in this file, # if one operator have "support_dygraph_mode : true", it supports dygraph mode, # otherwise the operator only could be used in static mode. From 575cafb44b2e5dd10048a0639a927446f47a66e3 Mon Sep 17 00:00:00 2001 From: lishicheng1996 <43111799+lishicheng1996@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:17:34 +0800 Subject: [PATCH 004/156] support custom device on macos (#52620) --- paddle/fluid/distributed/collective/custom_ccl_tools.h | 5 +++++ paddle/phi/backends/device_ext.h | 2 +- paddle/phi/backends/device_manager.cc | 4 ++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/custom_ccl_tools.h b/paddle/fluid/distributed/collective/custom_ccl_tools.h index 9431c8d3e52756..d3ebc639a3c11c 100644 --- a/paddle/fluid/distributed/collective/custom_ccl_tools.h +++ b/paddle/fluid/distributed/collective/custom_ccl_tools.h @@ -14,7 +14,12 @@ #pragma once +#if defined(__APPLE__) +#include +#else #include +#endif + #include #include "paddle/fluid/distributed/collective/types.h" diff --git a/paddle/phi/backends/device_ext.h b/paddle/phi/backends/device_ext.h index e383dde1a2e191..4563160e335a1f 100644 --- a/paddle/phi/backends/device_ext.h +++ b/paddle/phi/backends/device_ext.h @@ -13,7 +13,7 @@ // limitations under the License. 
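// Note: dropping the __APPLE__ exclusion in the guard below (together with
// the .dylib search added in device_manager.cc) is what enables custom-device
// plugins to build and load on macOS.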
#pragma once -#if !defined(_WIN32) && !defined(__APPLE__) +#if !defined(_WIN32) #include #include diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index cb7e77bab3fdf5..c95616150d3a63 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -655,7 +655,11 @@ void DeviceManager::Clear() { std::vector ListAllLibraries(const std::string& library_dir) { std::vector libraries; +#if defined(__APPLE__) + std::regex express(".*\\.dylib"); +#else std::regex express(".*\\.so"); +#endif std::match_results results; #if !defined(_WIN32) From 0e776965e5a821364204bfeddc85c6fa093c6dc4 Mon Sep 17 00:00:00 2001 From: Asthestarsfalll <72954905+Asthestarsfalll@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:23:49 +0800 Subject: [PATCH 005/156] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204?= =?UTF-8?q?=20No.44=E3=80=91=E4=B8=BA=20Paddle=20=E4=BC=98=E5=8C=96=20logs?= =?UTF-8?q?umexp=20op=20=E5=9C=A8=20GPU=20=E4=B8=8A=E7=9A=84=E8=AE=A1?= =?UTF-8?q?=E7=AE=97=E6=80=A7=E8=83=BD=20(#52509)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Optimize the performance of logsumexp * Support zero-dim tensor --- paddle/phi/kernels/gpu/logsumexp_kernel.cu | 97 +++++++++++++++++++++- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/logsumexp_kernel.cu b/paddle/phi/kernels/gpu/logsumexp_kernel.cu index 7963808476dedb..4806593469b035 100644 --- a/paddle/phi/kernels/gpu/logsumexp_kernel.cu +++ b/paddle/phi/kernels/gpu/logsumexp_kernel.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,12 +14,103 @@ #include "paddle/phi/kernels/logsumexp_kernel.h" -#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/logsumexp_kernel_impl.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/elementwise_subtract_kernel.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" using float16 = phi::dtype::float16; +namespace phi { + +template +struct LogCUDAFunctor { + HOSTDEVICE inline T operator()(const T x) const { return std::log(x); } +}; + +template <> +struct LogCUDAFunctor { + HOSTDEVICE inline float16 operator()(const float16 x) const { + auto x_ = static_cast(x); + return static_cast(std::log(x_)); + } +}; + +template +void LogsumexpKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + bool keepdim, + bool reduce_all, + DenseTensor* out) { + auto* in_x = &x; + auto* out_y = out; + auto xdim = in_x->dims(); + for (size_t i = 0; i < xdim.size(); i++) + PADDLE_ENFORCE_LT(0, + xdim[i], + errors::InvalidArgument( + "The dims of Input(X) should be greater than 0.")); + + reduce_all = recompute_reduce_all(x, axis, reduce_all); + std::vector outdim_vec, keeped_outdim_vec; + std::vector axis_vec; + for (auto i : axis) { + auto v = i >= 0 ? 
i : i + xdim.size(); + axis_vec.push_back(v); + } + if (axis.size() == 0 || reduce_all) { + for (size_t i = 0; i < xdim.size(); i++) { + axis_vec.push_back(i); + } + } + for (size_t i = 0; i < xdim.size(); i++) { + bool flag = false; + for (auto v : axis_vec) { + if (v == i) { + flag = true; + break; + } + } + if (flag) { + keeped_outdim_vec.push_back(1); + if (keepdim) outdim_vec.push_back(1); + } else { + outdim_vec.push_back(xdim[i]); + keeped_outdim_vec.push_back(xdim[i]); + } + } + + auto outdim = phi::make_ddim(outdim_vec); + auto keeped_outdim = phi::make_ddim(keeped_outdim_vec); + out->Resize(outdim); + dev_ctx.template Alloc(out_y); + + DenseTensor max_x; + max_x.Resize(outdim); + dev_ctx.template Alloc(&max_x); + + phi::funcs::ReduceKernel>( + dev_ctx, *in_x, &max_x, kps::IdentityFunctor(), axis_vec); + + max_x.Resize(keeped_outdim); + DenseTensor temp_x = Subtract(dev_ctx, *in_x, max_x); + phi::funcs::ReduceKernel>( + dev_ctx, temp_x, out_y, kps::ExpFunctor(), axis_vec); + + const std::vector inputs = {out_y}; + std::vector outputs = {&temp_x}; + phi::funcs::ElementwiseKernel( + dev_ctx, inputs, &outputs, LogCUDAFunctor()); + temp_x.Resize(outdim); + out->Resize(outdim); + phi::AddKernel(dev_ctx, temp_x, max_x, out); +} + +} // namespace phi + PD_REGISTER_KERNEL( logsumexp, GPU, ALL_LAYOUT, phi::LogsumexpKernel, float, double, float16) {} From 93404a6105fe4e191c9ca6325ff2e2c3d68347e3 Mon Sep 17 00:00:00 2001 From: cyberslack_lee Date: Mon, 10 Apr 2023 11:24:22 +0800 Subject: [PATCH 006/156] support auto generate for eigvalsh (#52687) --- paddle/fluid/operators/eigvalsh_op.cc | 113 ------------------ paddle/phi/api/yaml/backward.yaml | 10 ++ paddle/phi/api/yaml/legacy_backward.yaml | 12 -- paddle/phi/api/yaml/legacy_ops.yaml | 9 -- paddle/phi/api/yaml/op_compat.yaml | 9 ++ paddle/phi/api/yaml/ops.yaml | 10 ++ .../phi/kernels/cpu/eigvalsh_grad_kernel.cc | 4 +- .../phi/kernels/gpu/eigvalsh_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/eigvalsh_kernel.cu | 4 +- paddle/phi/ops/compat/eigvalsh_sig.cc | 34 ------ 10 files changed, 38 insertions(+), 171 deletions(-) delete mode 100644 paddle/fluid/operators/eigvalsh_op.cc delete mode 100644 paddle/phi/ops/compat/eigvalsh_sig.cc diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc deleted file mode 100644 index 27c70f1e9b9a9a..00000000000000 --- a/paddle/fluid/operators/eigvalsh_op.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class EigvalshOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class EigvalshOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor), Hermitian or real symmetric matrices." - "Its shape should be [*, N, N] where * is zero or" - "more batch dimensions. The data type is float32 ," - "float64, complex64, complex128."); - AddOutput("Eigenvalues", - "(Tensor), The eigenvalues in ascending order." - "The data type is float32 or float64."); - AddOutput( - "Eigenvectors", - "(Tensor), The column is the normalized eigenvector " - "corresponding to the eigenvalue. The data type is the same as ``X``." - "Eigenvectors are required to calculate gradient when backward."); - AddAttr( - "UPLO", - "(string, default 'L'), 'L' represents the lower triangular matrix," - "'U' represents the upper triangular matrix.") - .SetDefault("L"); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training.") - .SetDefault(false); - AddComment(R"DOC( -Eigvalsh Operator. - -Computes the eigenvalues of a complex Hermitian - (conjugate symmetric) or a real symmetric matrix. - -)DOC"); - } -}; - -class EigvalshGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Eigenvectors"), - ctx.device_context().GetPlace()); - } -}; - -template -class EigvalshGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType(this->ForwardOpType() + "_grad"); - op->SetInput("Eigenvectors", this->Output("Eigenvectors")); - op->SetInput(framework::GradVarName("Eigenvalues"), - this->OutputGrad("Eigenvalues")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(eigvalsh, - EigvalshInferShapeFunctor, - PD_INFER_META(phi::EigvalshInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(eigvalsh_grad, - EigvalshGradInferShapeFunctor, - PD_INFER_META(phi::EigvalshGradInferMeta)); - -REGISTER_OPERATOR(eigvalsh, - ops::EigvalshOp, - ops::EigvalshOpMaker, - ops::EigvalshGradOpMaker, - ops::EigvalshGradOpMaker, - EigvalshInferShapeFunctor); -REGISTER_OPERATOR(eigvalsh_grad, - ops::EigvalshGradOp, - EigvalshGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index b046f5830ad4c8..e0a12e13fb4e94 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -456,6 +456,16 @@ func : eigh_grad data_type : out_v +- backward_op : eigvalsh_grad + forward : eigvalsh (Tensor x, str uplo = "L", bool is_test = false) -> Tensor(eigenvalues), Tensor(eigenvectors) + args : (Tensor eigenvectors, Tensor eigenvalues_grad, str uplo, bool is_test) + output : Tensor(x_grad) + infer_meta : + func : 
EigvalshGradInferMeta + kernel : + func : eigvalsh_grad + data_type : eigenvectors + - backward_op : elu_double_grad forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 037a2f94862bce..1e11bc54b3f51c 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -340,18 +340,6 @@ kernel : func : dropout_grad -- backward_op : eigvalsh_grad - forward : eigvalsh (Tensor x, str uplo, bool is_test) -> Tensor(eigenvalues), Tensor(eigenvectors) - args : (Tensor eigenvectors, Tensor eigenvalues_grad, str uplo, bool is_test) - output : Tensor(x_grad) - infer_meta : - func : EigvalshGradInferMeta - kernel : - func : eigvalsh_grad - data_type : eigenvectors - data_transform : - skip_transform : eigenvalues_grad - - backward_op : einsum_grad forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape) args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index aa942970c8b17e..a689fbc17dfaf4 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -424,15 +424,6 @@ data_type: DataType::FLOAT32 optional : hypslength, refslength -- op : eigvalsh - args : (Tensor x, str uplo, bool is_test) - output : Tensor(eigenvalues), Tensor(eigenvectors) - infer_meta : - func : EigvalshInferMeta - kernel : - func : eigvalsh - backward : eigvalsh_grad - - op : einsum args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 63cca1670fe9bc..8a2ce29511f06c 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -614,6 +614,15 @@ outputs : out : Out +- op : eigvalsh + backward : eigvalsh_grad + inputs : + {x : X} + outputs : + {eigenvalues : Eigenvalues, eigenvectors : Eigenvectors} + attrs : + uplo : UPLO + - op : elementwise_pow backward : elementwise_pow_grad extra : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 1870de32b49efd..110fc1838aba10 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -486,6 +486,16 @@ kernel : func : eigvals +- op : eigvalsh + args : (Tensor x, str uplo = "L", bool is_test = false) + output : Tensor(eigenvalues), Tensor(eigenvectors) + infer_meta : + func : EigvalshInferMeta + kernel : + func : eigvalsh + data_type : x + backward : eigvalsh_grad + - op : elu args : (Tensor x, float alpha = 1.0f) output : Tensor(out) diff --git a/paddle/phi/kernels/cpu/eigvalsh_grad_kernel.cc b/paddle/phi/kernels/cpu/eigvalsh_grad_kernel.cc index b7b5927740e093..2489cbc825b22f 100644 --- a/paddle/phi/kernels/cpu/eigvalsh_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvalsh_grad_kernel.cc @@ -26,4 +26,6 @@ PD_REGISTER_KERNEL(eigvalsh_grad, float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/eigvalsh_grad_kernel.cu b/paddle/phi/kernels/gpu/eigvalsh_grad_kernel.cu index de26617d80f1b8..bf62c2736e87c1 100644 --- a/paddle/phi/kernels/gpu/eigvalsh_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/eigvalsh_grad_kernel.cu @@ -26,4 
+26,6 @@ PD_REGISTER_KERNEL(eigvalsh_grad, float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/eigvalsh_kernel.cu b/paddle/phi/kernels/gpu/eigvalsh_kernel.cu index 383f036c98cf9d..a075dad6cddb30 100644 --- a/paddle/phi/kernels/gpu/eigvalsh_kernel.cu +++ b/paddle/phi/kernels/gpu/eigvalsh_kernel.cu @@ -26,4 +26,6 @@ PD_REGISTER_KERNEL(eigvalsh, // cuda_only float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(1).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/ops/compat/eigvalsh_sig.cc b/paddle/phi/ops/compat/eigvalsh_sig.cc deleted file mode 100644 index b0635403355f7b..00000000000000 --- a/paddle/phi/ops/compat/eigvalsh_sig.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature EigvalshOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "eigvalsh", {"X"}, {"UPLO", "is_test"}, {"Eigenvalues", "Eigenvectors"}); -} - -KernelSignature EigvalshGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("eigvalsh_grad", - {"Eigenvectors", "Eigenvalues@GRAD"}, - {"UPLO", "is_test"}, - {"X@GRAD"}); -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(eigvalsh, phi::EigvalshOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(eigvalsh_grad, phi::EigvalshGradOpArgumentMapping); From 61fe2198185bb92e43eb846617b5faf5d0e40eb9 Mon Sep 17 00:00:00 2001 From: Zero Rains Date: Mon, 10 Apr 2023 11:25:20 +0800 Subject: [PATCH 007/156] =?UTF-8?q?=E3=80=90PaddlePaddle=20Hackathon=204?= =?UTF-8?q?=20No.36=E3=80=91=E4=B8=BA=20Paddle=20=E4=BC=98=E5=8C=96=20tile?= =?UTF-8?q?=20op=20=E5=9C=A8=20GPU=20=E4=B8=8A=E7=9A=84=E8=AE=A1=E7=AE=97?= =?UTF-8?q?=E6=80=A7=E8=83=BD=20(#52482)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix divide zero bug for softmax_with_cross_entropy * change the single test way * can run but slow. the most important is that I do not know why it slow * remove some useless commet * change the copyright to correct * remove some useless change * if repeat_times == 1, we will not use BroadcastKernel --- paddle/phi/kernels/gpu/tile_kernel.cu | 84 ++++++++++++++++++++++++++- 1 file changed, 82 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/tile_kernel.cu b/paddle/phi/kernels/gpu/tile_kernel.cu index ba598862f5910e..be825eea499c84 100644 --- a/paddle/phi/kernels/gpu/tile_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_kernel.cu @@ -13,10 +13,90 @@ // limitations under the License. 
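// The rewritten GPU TileKernel below drops the shared Eigen-based impl header
// and instead expands the input axis by axis with funcs::BroadcastKernel,
// skipping intermediate axes whose repeat factor is 1 and copying the input
// directly for zero-dim tensors.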
#include "paddle/phi/kernels/tile_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/tile_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" + +namespace phi { + +template +void TileKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& repeat_times, + DenseTensor* out) { + auto x_dims = x.dims(); + auto rank = x_dims.size(); + auto repeat_times_data = repeat_times.GetData(); + int repeat_times_size = repeat_times_data.size(); + rank = std::max(rank, repeat_times_size); + + if (rank == 0) { + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + return; + } + + for (size_t i = 0; i < repeat_times_data.size(); ++i) { + PADDLE_ENFORCE_GT( + repeat_times_data[i], + 0, + errors::InvalidArgument( + "All elements of the input 'repeat_times' for tile op must " + "be positive integers, but the value received is %d.", + repeat_times_data[i])); + } + + auto vec_x_dims = phi::vectorize(x_dims); + if (repeat_times_data.size() < vec_x_dims.size()) { + int diff = vec_x_dims.size() - repeat_times_data.size(); + repeat_times_data.insert(repeat_times_data.begin(), diff, 1); + } else { + int diff = repeat_times_data.size() - vec_x_dims.size(); + vec_x_dims.insert(vec_x_dims.begin(), diff, 1); + } + + PADDLE_ENFORCE_EQ( + repeat_times_data.size(), + vec_x_dims.size(), + errors::InvalidArgument( + "The rank (%d) of the input 'x' and the rank (%d) of the input " + "'repeat_times' for tile op must match after promotion.", + vec_x_dims.size(), + repeat_times_data.size())); + + DDim new_x_dims = make_ddim(vec_x_dims); + DDim out_dims(new_x_dims); + DenseTensor new_x = x; + vec_x_dims.insert(vec_x_dims.begin(), 1, 1); + for (size_t i = 0; i < repeat_times_data.size(); ++i) { + out_dims[i] *= repeat_times_data[i]; + new_x.Resize(make_ddim(vec_x_dims)); + std::vector ins = {&new_x}; + vec_x_dims[i] *= repeat_times_data[i]; + if (i != repeat_times_data.size() - 1) { + if (repeat_times_data[i] != 1) { + DenseTensor tmp_out; + tmp_out.Resize(make_ddim(vec_x_dims)); + dev_ctx.template Alloc(&tmp_out); + std::vector outs = {&tmp_out}; + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, i, kps::IdentityFunctor()); + tmp_out.Resize(out_dims); + new_x = tmp_out; + } + vec_x_dims[i] *= vec_x_dims[i + 1]; + vec_x_dims[i + 1] = 1; + } else { + out->Resize(make_ddim(vec_x_dims)); + dev_ctx.template Alloc(out); + std::vector outs = {out}; + phi::funcs::BroadcastKernel( + dev_ctx, ins, &outs, i, kps::IdentityFunctor()); + out->Resize(out_dims); + } + } +} + +} // namespace phi PD_REGISTER_KERNEL(tile, GPU, From 6913feb0c963e862b00d05fd3e61735a51b156a6 Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:39:18 +0800 Subject: [PATCH 008/156] remove infrt V1.1 (#52672) --- .gitignore | 9 - CMakeLists.txt | 1 - cmake/external/llvm.cmake | 132 --------------- cmake/infrt_lib.cmake | 85 ---------- cmake/third_party.cmake | 5 - paddle/phi/README.md | 345 ++++++++++++++++++++------------------ tools/get_pr_ut.py | 4 +- 7 files changed, 179 insertions(+), 402 deletions(-) delete mode 100644 cmake/external/llvm.cmake delete mode 100644 cmake/infrt_lib.cmake diff --git a/.gitignore b/.gitignore index 56d34af5d531f4..c0bdf7e4bf5cbc 100644 --- a/.gitignore +++ b/.gitignore @@ -73,16 +73,7 @@ tools/nvcc_lazy # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. 
-paddle/infrt/dialect/pd/ir/pd_ops.td -paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td -paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td -tools/infrt/kernels.json -tools/infrt/kernel_signature.json -paddle/infrt/dialect/pd/common/pd_ops_info.h .lit_test_times.txt -paddle/infrt/tests/dialect/Output -paddle/infrt/tests/lit.cfg.py -paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc paddle/fluid/pybind/eager_op_function.cc tools/nvcc_lazy diff --git a/CMakeLists.txt b/CMakeLists.txt index bc337fa8e27c2e..aa6ec5e55c07fc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -269,7 +269,6 @@ option( OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) -option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake deleted file mode 100644 index 8b33a73e24c8d1..00000000000000 --- a/cmake/external/llvm.cmake +++ /dev/null @@ -1,132 +0,0 @@ -include(FetchContent) - -set(LLVM_DOWNLOAD_URL - https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz -) -set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) - -set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) -set(FETCHCONTENT_QUIET OFF) -FetchContent_Declare( - external_llvm - URL ${LLVM_DOWNLOAD_URL} - URL_MD5 ${LLVM_MD5} - PREFIX ${THIRD_PARTY_PATH}/llvm SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm) -if(NOT LLVM_PATH) - FetchContent_GetProperties(external_llvm) - if(NOT external_llvm_POPULATED) - FetchContent_Populate(external_llvm) - endif() - set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm) - set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm) - set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir) -else() - set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm) - set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir) -endif() - -if(${CMAKE_CXX_COMPILER} STREQUAL "clang++") - set(CMAKE_EXE_LINKER_FLAGS - "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi") -endif() - -message(STATUS "set LLVM_DIR: ${LLVM_DIR}") -message(STATUS "set MLIR_DIR: ${MLIR_DIR}") -find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR}) -find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR}) -find_package(ZLIB REQUIRED) - -list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") -include(AddLLVM) - -include_directories(${LLVM_INCLUDE_DIRS}) -list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}") -list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") -include(AddLLVM) -include(TableGen) -include(AddMLIR) - -message(STATUS "Found MLIR: ${MLIR_DIR}") -message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") -message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") - -# To build with MLIR, the LLVM is build from source code using the following flags: - -#[==[ -cmake ../llvm -G "Unix Makefiles" \ - -DLLVM_ENABLE_PROJECTS="mlir;clang" \ - -DLLVM_BUILD_EXAMPLES=OFF \ - -DLLVM_TARGETS_TO_BUILD="X86" \ - -DCMAKE_BUILD_TYPE=Release \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - -DLLVM_ENABLE_ZLIB=OFF \ - -DLLVM_ENABLE_RTTI=ON \ - -DLLVM_INSTALL_UTILS=ON \ - -DCMAKE_INSTALL_PREFIX=./install -#]==] -# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit) - -add_definitions(${LLVM_DEFINITIONS}) - -llvm_map_components_to_libnames( - llvm_libs - Support - Core - irreader - X86 - executionengine - orcjit 
- mcjit - all - codegen) - -message(STATUS "LLVM libs: ${llvm_libs}") - -get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS) -message(STATUS "MLIR libs: ${mlir_libs}") -add_definitions(${LLVM_DEFINITIONS}) - -# The minimum needed libraries for MLIR IR parse and transform. -set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) - -# tb_base is the name of a xxx.td file (without the .td suffix) -function(mlir_tablegen_on td_base) - set(options) - set(oneValueArgs DIALECT) - cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.hpp.inc -gen-op-decls) - mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) - if(mlir_tablegen_on_DIALECT) - mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls - -dialect=${mlir_tablegen_on_DIALECT}) - mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs - -dialect=${mlir_tablegen_on_DIALECT}) - endif() - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) -endfunction() - -function(mlir_add_rewriter td_base) - set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - set(LLVM_TARGET_DEPENDS - ${LLVM_TARGET_DEPENDS} - ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) - add_public_tablegen_target(MLIR${td_base}IncGen) - add_dependencies(mlir-headers MLIR${td_base}IncGen) -endfunction() - -# Execute the mlir script with infrt-exec program. -# @name: name of the test -# @script: path to the mlir script file -function(infrt_exec_check name script) - add_test( - NAME ${name} - COMMAND - sh -c - "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck ${CMAKE_CURRENT_SOURCE_DIR}/${script}" - ) -endfunction() diff --git a/cmake/infrt_lib.cmake b/cmake/infrt_lib.cmake deleted file mode 100644 index 21dcd0ef36d166..00000000000000 --- a/cmake/infrt_lib.cmake +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -set(INFRT_INSTALL_DIR - "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir" - CACHE STRING "A path setting paddle infrt shared and static libraries") - -function(copy TARGET) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DSTS) - cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - - list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) - list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) - if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) - message( - FATAL_ERROR - "${TARGET} source numbers are not equal to destination numbers") - endif() - math(EXPR len "${copy_lib_SRCS_len} - 1") - foreach(index RANGE ${len}) - list(GET copy_lib_SRCS ${index} src) - list(GET copy_lib_DSTS ${index} dst) - add_custom_command( - TARGET ${TARGET} - POST_BUILD - COMMAND mkdir -p "${dst}" - COMMAND cp -r "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - endforeach() -endfunction() - -function(copy_part_of_thrid_party TARGET DST) - set(dst_dir "${DST}/third_party/install/glog") - copy( - ${TARGET} - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib) -endfunction() - -# inference library for only inference -set(infrt_lib_deps third_party infrt infrt_static) -add_custom_target(infrt_lib_dist DEPENDS ${infrt_lib_deps}) - -# CMakeCache Info -copy( - infrt_lib_dist - SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt - DSTS ${INFRT_INSTALL_DIR}) - -set(infrt_lib ${INFRT_BINARY_DIR}/libinfrt.*) -copy( - infrt_lib_dist - SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib} - DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib) - -copy( - infrt_lib_dist - SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h - DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal) - -# paddle fluid version -function(version version_file) - execute_process( - COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_GIT_COMMIT) - file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n") - file(APPEND ${version_file} - "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") -endfunction() -version(${INFRT_INSTALL_DIR}/version.txt) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 90973ecfba5289..f8cb9c85c5202d 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -493,11 +493,6 @@ if(WIN32) list(APPEND third_party_deps extern_dirent) endif() -if(WITH_INFRT) - include(external/llvm) - list(APPEND third_party_deps ${llvm_libs}) -endif() - if(WITH_IPU) include(external/poplar) list(APPEND third_party_deps extern_poplar) diff --git a/paddle/phi/README.md b/paddle/phi/README.md index e9cb953bc0510c..f1a669f1f461fa 100644 --- a/paddle/phi/README.md +++ b/paddle/phi/README.md @@ -22,11 +22,11 @@ The root cause of poor reusability is the inflexibility of the original Op archi 1. When an Op reuses the `Opkernel::Compute` method of another Op, an `ExecutionContext` needs to be constructed first, and the reuse method is relatively cumbersome - > It will be much more convenient if you can directly call the Kernel in the form of a function + > It will be much more convenient if you can directly call the Kernel in the form of a function 2. 
Due to the overhead introduced by additional data structure construction and independent Op scheduling, from the perspective of computing performance, it is better to copy the calculation code directly when reusing Op, which leads us to gradually abandon the earlier principle of backward Op reusing forward Op, and began to implement Kernel separately for each backward Op, so that Paddle maintains a large number of backward OpKernel implementation codes internally. - > Only when the overhead of reusing Ops is small enough, reusing existing Ops to implement new Ops can be widely promoted + > Only when the overhead of reusing Ops is small enough, reusing existing Ops to implement new Ops can be widely promoted ### 1.2 Conciseness and fine-grained execution scheduling @@ -54,11 +54,7 @@ For a long time, because the Paddle and Paddle-Lite operators are maintained sep Therefore, this functional operator library will be jointly constructed by training and inference team, and will serve as an independent compilation component and underlying infrastructure (not yet independently split), which can serve training, server-inference, and -inference execution systems at the same time. -### 1.5 The adaptation of the new inference Runtime design 'infrt' - -Inference team designed a new runtime `infrt`. It is expected to unify the execution system of Paddle-Inference and Paddle-Lite. It is necessary to directly call the operators in the PHI operator library jointly built this time. Therefore, the adaptation to `infrt` needs to be considered in the design. (Currently the `infrt` project is temporarily on hold). - -### 1.6 Op and Kernel parameter normalization +### 1.5 Op and Kernel parameter normalization The Python 2.0 API project in 2020 standardized the argument list of the Paddle Python-side API, making it concise, easy to use, and standard. However, due to cost considerations, the argument list at the Op level was not standardized, so there will be many early developed operators that differ greatly in arguments from the Python API. For example, `conv` op, the Python API has only 8 arguments, but the corresponding C++ `Conv` Op has 29 arguments. API and Op are essentially the same layer of concepts, both are descriptions of an operation, and the arguments should be consistent. In order to solve this problem, 'the operator definition enhancement project' was launched, and the declarations of 'AsExtra' and 'AsQuant' were added to some unnecessary arguments, but the problem was not fundamentally solved, which is what the construction of the PHI operator library hopes to solve. @@ -68,7 +64,7 @@ We hope to be able to achieve the same three-layer arguments of Python API -> Op ### 2.1 Location -The PHI code directory is inside the paddle directory, which is at the same level as fluid, rather than inside the fluid directory. PHI is a basic component that is called by various upper-layer runtime such as fluid, lite, and infrt, and it will be used later as a separately compiled dynamic library, therefore PHI is not suitable as the submodule of fluid. +The PHI code directory is inside the paddle directory, which is at the same level as fluid, rather than inside the fluid directory. PHI is a basic component that is called by various upper-layer runtime such as fluid, lite, and it will be used later as a separately compiled dynamic library, therefore PHI is not suitable as the submodule of fluid. 
### 2.2 Directory Structure

@@ -78,27 +74,31 @@ Training and inference require a clear operator library directory structure:

- The directory design should support various split compilation requirements of the operator library, which includes:

-  - Split and compile according to the computing device.
-    - For example, compile for cpu only, or compile for gpu only.
-  - Split and compile according to the training and inference scenarios.
-    - For example, the inference scenario does not compile backward-relevant kernels (xxx_grad_kernel.cc|cu)
-  - Precisely crop and compile according to the operators actually used by the mobile device (not supported yet)
-    - For example, a model uses `add` and `multiply` only, ideally it could be cropped to only 2 kernels.
+  - Split and compile according to the computing device.
+    - For example, compile for cpu only, or compile for gpu only.
+  - Split and compile according to the training and inference scenarios.
+    - For example, the inference scenario does not compile backward-relevant kernels (xxx_grad_kernel.cc|cu)
+  - Precisely crop and compile according to the operators actually used by the mobile device (not supported yet)
+    - For example, a model uses `add` and `multiply` only, ideally it could be cropped to only 2 kernels.

- In the long run, support the requirement of easily reusing kernel implementation.
-  - Explanation: When reusing the kernel, the corresponding function implementation should be introduced through `include` easily, rather than cannot find the kernel because of the complex directory structure.
+
+  - Explanation: When reusing a kernel, the corresponding function implementation should be easy to introduce through `include`, rather than being impossible to find because of a complex directory structure.

- In the long run, support the requirement of the unified writing method among cross-device kernels, and the writing method is intuitive and easy to use, without introducing unnecessary template parameters.
-  - Explanation: Kernel Primitive API module is at the lower layer of the operator library. Its long-term vision is that each operation uses only one kernel to adapt to various devices, the code that truly distinguishes the device is only in the implementation of the Kernel Primitive API. In the future, the template parameters should be limited to as concise as possible when passing complex parameters into the reused kernel.
+
+  - Explanation: The Kernel Primitive API module is at the lower layer of the operator library. Its long-term vision is that each operation uses only one kernel to adapt to various devices; the code that truly distinguishes the device lives only in the implementation of the Kernel Primitive API. In the future, the template parameters should be kept as concise as possible when passing complex parameters into the reused kernel.

- In terms of ease of use, developers can accurately understand where the newly added kernel should be placed, without ambiguity.
+
+  - Explanation: When developers add an API, they will not be confused about which directory they should put the corresponding kernel in. Moreover, different people should have no ambiguous understanding of where the same kernel should be placed.

- Do not introduce a lot of duplicate directory design.
- - Explanation: Concept splitting is needed, but also with boundaries. Avoid subdirectories with the same name occurring in multiple directories. For example, if `eigen`, `funcs`, `math` directories are placed under the cpu directory, then they shouldn't be placed under the gpu directory. The directory design of the new operator library is mainly divided according to the device, and the directory splitting at other levels should be weakened as much as possible. For example, try not to split based on functions, try not to split based on fields, etc. + + - Explanation: Concept splitting is needed, but also with boundaries. Avoid subdirectories with the same name occurring in multiple directories. For example, if `eigen`, `funcs`, `math` directories are placed under the cpu directory, then they shouldn't be placed under the gpu directory. The directory design of the new operator library is mainly divided according to the device, and the directory splitting at other levels should be weakened as much as possible. For example, try not to split based on functions, try not to split based on fields, etc. - Do not introduce too deep directory design. - - Explanation: The directory level should not be too deep, otherwise it will lead to higher understanding and maintenance costs. + - Explanation: The directory level should not be too deep, otherwise it will lead to higher understanding and maintenance costs. #### 2.2.2 Directory design details @@ -129,13 +129,12 @@ Some directory structure description: - `common`: Data structures to be used both inside PHI `core` and PHI `api` directory. These data structures neither belong to the `core` nor the `api` directory. - `core`: PHI has some public module implementations that it needs, such as `DenseTensor`, kernel registration and management modules. - `backends`: The backends include data structures that need to be added for each backend, such as `CPUContext`, `GPUContext`, etc. - - The basic data structures are placed in the `core`, while the dedicated data structures of specific backends are not placed in the `core`, and the dependencies strictly ensure that the `backends` depend on the `core`, but the `core` cannot depend on the `backends`. - - Example 1: If Context is a base class, then put it in `core`, inherited `CPUContext` is in `backends/cpu` and `GPUContext` is in `backends/gpu`. - - Example 2: TensorBase is in `core`, `DenseTensor` is used by most devices so that it is also in the `core`. If there is `OneDNNTensor`, which is only used for `OneDNN`, then it should be placed in `backends/onednn`. + - The basic data structures are placed in the `core`, while the dedicated data structures of specific backends are not placed in the `core`, and the dependencies strictly ensure that the `backends` depend on the `core`, but the `core` cannot depend on the `backends`. + - Example 1: If Context is a base class, then put it in `core`, inherited `CPUContext` is in `backends/cpu` and `GPUContext` is in `backends/gpu`. + - Example 2: TensorBase is in `core`, `DenseTensor` is used by most devices so that it is also in the `core`. If there is `OneDNNTensor`, which is only used for `OneDNN`, then it should be placed in `backends/onednn`. - `infermeta`: The location of the infermeta function, the infermeta function is equivalent to `infershape + inferdtype + inferlayout`, etc. - `kernels`: Kernels related to each device. 
- - `cpu, gpu, ...` - + - `cpu, gpu, ...` ##### 2.2.2.2 Kernels directory @@ -156,25 +155,26 @@ paddle/phi/kernels The directory structure is described as follows: - The root directory under kernels includes device-independent `kernel.h` and `kernel.cc`. In principle, each kernel has one .h and .cc - - For example, if a kernel is implemented using Primitive api, or is implemented by reusing other basic kernels, there should be only one implementation for all devices, so its declaration and implementation can be placed directly in the kernels directory. (This is the ideal state in the future.) - - At present, most of our kernels do not have the feature of unity implementation across devices, but the input parameters and return values of the kernel should be consistent except for `DeviceContext`, so the kernel parameter declaration header file is also placed in the current directory (consistent with the original design, `DeviceContext` and `T` are used as template parameters), The functions implementation of each device are placed in the corresponding device folder. - - Note that the unity implementation across devices here does not mean that the CPU and GPU implementations of a kernel are unified, but the implementations of all devices are the same. Currently, it includes at least `CPU`, `GPU`, `XPU`, `ONEDNN`, `GPUDNN`, etc. - - If the backward kernel does not need to support cropping, it can be merged appropriately (but if you want to leave the possibility of supporting end-to-side training, the backward kernel may also be a potential target for cropping) + - For example, if a kernel is implemented using Primitive api, or is implemented by reusing other basic kernels, there should be only one implementation for all devices, so its declaration and implementation can be placed directly in the kernels directory. (This is the ideal state in the future.) + - At present, most of our kernels do not have the feature of unity implementation across devices, but the input parameters and return values of the kernel should be consistent except for `DeviceContext`, so the kernel parameter declaration header file is also placed in the current directory (consistent with the original design, `DeviceContext` and `T` are used as template parameters), The functions implementation of each device are placed in the corresponding device folder. + - Note that the unity implementation across devices here does not mean that the CPU and GPU implementations of a kernel are unified, but the implementations of all devices are the same. Currently, it includes at least `CPU`, `GPU`, `XPU`, `ONEDNN`, `GPUDNN`, etc. + - If the backward kernel does not need to support cropping, it can be merged appropriately (but if you want to leave the possibility of supporting end-to-side training, the backward kernel may also be a potential target for cropping) - The next-level subdirectory of kernels, in principle, is created according to the backend classification, and only two special directories are reserved: - - `funcs`: In order to be compatible with the directories of functor and function in the original fluid/operators directory, when placing functions and functor that support multiple backends, we organize them according to the original design that one header file corresponding to multiple .cc(u) (This part of the code may be removed in the future, because it will be gradually replaced by Kernel Primitive API and reuse between Kernels, so no over-design here.) 
    - Example 1: A common function `XXXFunction` is called in both the reduce CPU and reduce GPU kernel implementations, and the two implementations are different; then `XXXFunction` should be in the `funcs` directory.
  - `primitive`: Kernel Primitive API, some basic tools for multi-device unified kernel implementation.
  - `impl`: Many of Paddle's original op kernel implementations reuse the same code for CPU and GPU, and they live in a large number of `xx_op.h` files. This part of the code is not suitable to be placed in the `cpu` or `gpu` directory, nor in the `funcs` directory (putting it in the `funcs` directory would cause a considerable part of the kernel implementation to land there, which is too bloated and confusing; the `funcs` directory is created to place the `functor` and `function` tools as in the original operators/math directory). This part of the code is also not suitable for the root directory of `kernels` (it is not a device-independent implementation, only an implementation shared by cpu and gpu). Therefore, in order not to overthink this part of the code when migrating, and because the location is then relatively consistent with its implementation nature, the `impl` directory was created.
    - In the `impl` directory, only the kernel functions that are consistent across some devices are placed.
They are all header files, and the names are all suffixed with `xxx_kernel_impl.h` + - For example: `scale`, `fill_constant`, `fill_any_like` kernels are all such cases. - The auxiliary functions that are only used by the current kernel, they are always placed in the same backend folder as the kernel implementation, and the .h file is used to manage the code. Auxiliary function codes are no longer placed elsewhere, unless their implementations are used in multiple places. - - Even if there are multiple calls, if it is still limited to the same device, directly build the header file and put it in the same directory. + - Even if there are multiple calls, if it is still limited to the same device, directly build the header file and put it in the same directory. - The implementation of the backward kernel and the forward kernel are placed in different files, and the file suffix is `*_grad_kernel.*`, which is convenient for cmake to separate and compile. - - No more directories are created for the backward kernel, otherwise directories such as cpu/gpu will also be created under the backward kernel directory. - - The implementation of the second-order derivative and the third-order derivative is also placed in the grad kernel implementation file. + + - No more directories are created for the backward kernel, otherwise directories such as cpu/gpu will also be created under the backward kernel directory. + - The implementation of the second-order derivative and the third-order derivative is also placed in the grad kernel implementation file. - Why is the directory named `gpu` instead of `cuda` and `hip`? - - The code of `cuda` and `hip` is very repetitive, and the unified implementation is easier to maintain. + - The code of `cuda` and `hip` is very repetitive, and the unified implementation is easier to maintain. #### 2.2.3 Namespace @@ -230,26 +230,28 @@ void FullKernel(const Context& dev_ctx, ##### 2.3.2.1 API Tensor interface - The top-layer is the API-level Tensor interface, which contains two pointer members, `TensorBase` and `AbstractAutogradMeta`. - - Both members are designed as Interface and do not depend on real Tensor and `Autograd` implementations. - - `AutogradMeta` is only meaningful in the dynamic graph API-level Tensor, it will not be used in the specific kernel calculation, so put it in the top-layer Tensor interface. - - In addition, such a design facilitates data sharing and reduces copy overhead. - - When a Tensor is assigned to another Tensor, or Tensor is used as a function return value, only the pointer is actually copied, and no real data copy is performed. + + - Both members are designed as Interface and do not depend on real Tensor and `Autograd` implementations. + - `AutogradMeta` is only meaningful in the dynamic graph API-level Tensor, it will not be used in the specific kernel calculation, so put it in the top-layer Tensor interface. + - In addition, such a design facilitates data sharing and reduces copy overhead. + - When a Tensor is assigned to another Tensor, or Tensor is used as a function return value, only the pointer is actually copied, and no real data copy is performed. - The top-layer C++ Tensor plays a similar role as the Python-side Tensor, and the interface design is as consistent as possible with the Python-side. - - Contain basic property access and data access methods of Tensor. - - `shape`, `place`, `dtype`, `data`. - - Contain the `autograd` methods required by the dynamic graph Tensor. - - `gradient`, `backward`. 
- - Contain conversion methods between Tensors. - - cpu, gpu, xpu etc. - - Contain calculation methods related to Tensor (not added yet). - - All methods of the `paddle.tensor` module. + + - Contain basic property access and data access methods of Tensor. + - `shape`, `place`, `dtype`, `data`. + - Contain the `autograd` methods required by the dynamic graph Tensor. + - `gradient`, `backward`. + - Contain conversion methods between Tensors. + - cpu, gpu, xpu etc. + - Contain calculation methods related to Tensor (not added yet). + - All methods of the `paddle.tensor` module. - Compilation decoupling: - - The `autograd` information here is just a pointer index, which is empty by default. - - `std::unique_ptr autograd_meta_ = nullptr;` - - `AbstractAutogradMeta` is an abstract class interface that does not depend on any module of `autograd`, so it will not affect the independent compilation of PHI, and at the same time takes into account the need for dynamic graph Tensor to hold backward information. + - The `autograd` information here is just a pointer index, which is empty by default. + - `std::unique_ptr autograd_meta_ = nullptr;` + - `AbstractAutogradMeta` is an abstract class interface that does not depend on any module of `autograd`, so it will not affect the independent compilation of PHI, and at the same time takes into account the need for dynamic graph Tensor to hold backward information. - `AutogradMeta` is only set in the dynamic graph scenario. For unneeded scenarios, such as in static graphs, `AutogradMeta` is just a null pointer. @@ -277,22 +279,23 @@ Tensor ondnn() const; ``` - This conversion process may be `cast` or `copy`: - - `cast` if no data copy required. - - `copy` if data copy required. - - Transformations are implemented by functional kernels. + + - `cast` if no data copy required. + - `copy` if data copy required. + - Transformations are implemented by functional kernels. - Usage in API Scenarios - - In a complete training scenario, when a user uses an API, such as `DataLoader`, the data is generally read from the disk, put it into the CPU, and then converted to the specific execution device. + - In a complete training scenario, when a user uses an API, such as `DataLoader`, the data is generally read from the disk, put it into the CPU, and then converted to the specific execution device. ##### 2.3.2.2 TensorBase - The interface implemented by Tensor only contains the necessary pure virtual Tensor methods, and does not contain members with real meaning. The methods here should also be strictly monitored during the development process. - Why use abstract class design at this level? - - On the one hand, it is to isolate the Tensor API from the specific implementation of Tensor without generating too many dependencies. If the Tensor API needs to be redesigned in the future, or the `autograd` information needs to be abandoned, only the Tensor API needs to be redesigned, which has little effect on the implementation of the underlying Tensor. - - On the other hand, in order to reserve sufficient expansion space for heterogeneous Tensors, the framework-level API only needs one Tensor data structure, and there is no need to expose multiple data structures. In fact, a large-scale definition is made here: all data structures in the framework are Tensors. - - For a basically consistent memory layout, or a basically consistent implementation of Tensor descriptions, it can be inherited based on an implementation of `DenseTensor`. 
- - For Tensors with a high degree of heterogeneity, new Tensor classes (such as Tensors with only one Object) can be directly inherited from Interface. This ensures that Tensor has no bottlenecks in scaling flexibility. + - On the one hand, it is to isolate the Tensor API from the specific implementation of Tensor without generating too many dependencies. If the Tensor API needs to be redesigned in the future, or the `autograd` information needs to be abandoned, only the Tensor API needs to be redesigned, which has little effect on the implementation of the underlying Tensor. + - On the other hand, in order to reserve sufficient expansion space for heterogeneous Tensors, the framework-level API only needs one Tensor data structure, and there is no need to expose multiple data structures. In fact, a large-scale definition is made here: all data structures in the framework are Tensors. + - For a basically consistent memory layout, or a basically consistent implementation of Tensor descriptions, it can be inherited based on an implementation of `DenseTensor`. + - For Tensors with a high degree of heterogeneity, new Tensor classes (such as Tensors with only one Object) can be directly inherited from Interface. This ensures that Tensor has no bottlenecks in scaling flexibility. ##### 2.3.3.3 DenseTensor、SparseTensor @@ -334,12 +337,12 @@ Inherit other Tensors with high degrees of freedom: directly inherit `TensorBase - `TensorBase` is an abstract class, which leaves a lot of room for the description of specific Tensor. If the description of traditional Tensor cannot meet the requirements, a specialized Tensor implementation can be designed. - #### 2.3.3 C++ API ##### 2.3.3.1 C++ API form > Highlights of this section: +> > 1. The C++ API corresponds to the Python 2.0 API: the function name, parameter name, parameter order, and return value are the same. After investigation, we found that very few framework products are designed with the ease of use of the C++ API in mind. For the long-term consideration, if we want to attract more developers to build the paddle ecology, it is also very important to provide a standardized and easy-to-use C++ API architecture. At the same time, the Python 2.0 API project has laid a good reference foundation for the C++ API, and we can directly inherit its achievements. @@ -362,11 +365,11 @@ Described as follows: **What scenarios is this new C++ API architecture mainly used for?** 1. C++ API that can be called when developing custom operators, it improves ease of use. - - For example, the user needs to initialize a Tensor in a custom operator, loop through the Tensor data and assign values, then you can directly call `paddle::ones`, `paddle::full` APIs. + - For example, the user needs to initialize a Tensor in a custom operator, loop through the Tensor data and assign values, then you can directly call `paddle::ones`, `paddle::full` APIs. 2. The architecture serves as the basic calling unit of the new dynamic graph. - - The new dynamic graph will use the API as the scheduling calculation unit, and will no longer call the Op architecture, thus improving the scheduling performance. + - The new dynamic graph will use the API as the scheduling calculation unit, and will no longer call the Op architecture, thus improving the scheduling performance. 3. As a basis for the development of backward Op reuse forward Op. - - Now the backward op kernel needs to be implemented separately. 
After the API architecture is completed, it is hoped that the backward op implementation can be completed by reusing the forward API.

##### 2.3.3.2 C++ API auto-generate

@@ -386,24 +389,24 @@ The key to C++ API generation lies in the configuration of the YAML file. Taking

 ```yaml
 ## Forward API configuration
-- api : matmul
-  args : (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false)
-  output : Tensor
-  infer_meta :
-    func : MatmulInferMeta
-  kernel :
-    func : matmul
-  backward : matmul_grad
+- api: matmul
+  args: (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false)
+  output: Tensor
+  infer_meta:
+    func: MatmulInferMeta
+  kernel:
+    func: matmul
+  backward: matmul_grad

 ## Backward API configuration
-- backward_api : matmul_grad
-  forward : matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)
-  args : (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false)
-  output : Tensor(x_grad), Tensor(y_grad)
-  infer_meta :
-    func : MatmulGradInferMeta
-  kernel :
-    func : matmul_grad
+- backward_api: matmul_grad
+  forward: matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)
+  args: (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false)
+  output: Tensor(x_grad), Tensor(y_grad)
+  infer_meta:
+    func: MatmulGradInferMeta
+  kernel:
+    func: matmul_grad
 ```

The meaning of each configuration parameter:

@@ -412,9 +415,9 @@ The meaning of each configuration parameter:
- args: the function parameters. Their order and data type must be exactly the same as the PHI Kernel function of the same name, and the `Attributes` type must be ranked after the `Tensor` type.
- output: the output type. If there are multiple outputs, separate them by commas (","). You can optionally mark the name of each output with "()" after the type (e.g. `Tensor(out)`). If there is no mark, the default names are `out0`, `out1`, ...
- infer_meta: calculate the dimension and type of the returned Tensor (see the introduction of the `InferMeta` function for details).
  - func: the called `InferMeta` function. Its default input is all the parameters of the args item plus the output parameter of the api; the Tensor type variables in it will be automatically replaced with `MetaTensor`.
- kernel: the specific Kernel function called by the API.
  - func: the registered name of the kernel function (the name used by `REGISTER`, not the function name). Its default input is all the parameters of the args item plus the output parameter of the api.
- backward: (optional). The corresponding backward function name; if not set, only the forward API will be generated.

The YAML parsing script will automatically generate the corresponding C++ API according to the above configuration items.
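As an illustration, the `matmul` entry above would produce a C++ API declaration along the following lines. This is a sketch of the generated form under the stated YAML, not the verbatim script output, and the `PADDLE_API` export macro is an assumption:

```c++
// Hypothetical sketch of the auto-generated forward API declaration.
// The function name, parameter names, order, and defaults mirror the
// `api` and `args` items of the YAML entry above.
PADDLE_API Tensor matmul(const Tensor& x,
                         const Tensor& y,
                         bool transpose_x = false,
                         bool transpose_y = false);
```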
The generated code includes the relevant processing logic such as Kernel automatic selection, Tensor transformation, Data Transform, `InferMeta` and Kernel calling. For details, please refer to the generated code in `api.cc` . @@ -426,10 +429,11 @@ Due to the large number of C++ APIs and their various forms and functions, some ##### 2.3.4.1 Kernel form > Highlights of this section: +> > 1. Notes on Kernel function form: -> (1) Data type `T` and `DeviceContext` (abbreviated as `Context`) as template parameters; -> (2) `Context` is the first parameter of Kernel; -> (3) The return value Tensor takes the form of a pointer as an input parameter, and the return value of Kernel itself is void. +> (1) Data type `T` and `DeviceContext` (abbreviated as `Context`) as template parameters; +> (2) `Context` is the first parameter of Kernel; +> (3) The return value Tensor takes the form of a pointer as an input parameter, and the return value of Kernel itself is void. This part includes the specific Kernel. The functions implemented in this part will be registered in the framework as Kernel for unified search and scheduling by the framework. @@ -451,33 +455,37 @@ Described as follows: - The kernels of different devices must have different function implementations. The function names are named in **camel case**. Except for the capitalization of the first letter, the naming should be as consistent as possible with the API function name. The function names of the same calculation are kept the same, and the functions of different devices are managed through different files or directories. - There are generally two template parameters, `T` and `Context`, which are used to determine the data type and device type at runtime. - - According to our current architecture, the vast majority of Kernels reduce the code in the way of **specialized DeviceContext and data type**, which is consistent with the original `OpKernel` form. - - The form should be unified. If the Kernel level is also exposed as a fine-grained API in the future, the ease of use is guaranteed. + - According to our current architecture, the vast majority of Kernels reduce the code in the way of **specialized DeviceContext and data type**, which is consistent with the original `OpKernel` form. + - The form should be unified. If the Kernel level is also exposed as a fine-grained API in the future, the ease of use is guaranteed. - Specification of function input parameters: - - Take a specific `DeviceContext` (such as `CPUContext`, `GPUContext`) as the first input parameter to meet the needs of specific context information required at runtime. Pass the stream in if there are multiple streams. - - Currently, it is not supported to pass multiple `DeviceContext` parameters to one Kernel. At present, such a requirement is considered unreasonable. - - The parameter list is consistent with the API. If there is other special information that needs to be passed into the Kernel, pass it through the `Context`. - - Then all input Tensors and input Attributes are passed in with const &, and POD types are passed in directly by value. - - The input Tensor is a specific Tensor type, such as `DenseTensor` or `SelectedRows`, not the Tensor of the external interface API. - - Finally, the Tensor return value of the function, passed in as a pointer. 
- - In order to make the mechanism more flexible and allow the kernel to adapt to more scenarios, the declaration of flexible types of input, output and parameters will be allowed subsequently to adapt to non-Tensor input, output and Tensor Attribute. + - Take a specific `DeviceContext` (such as `CPUContext`, `GPUContext`) as the first input parameter to meet the needs of specific context information required at runtime. Pass the stream in if there are multiple streams. + - Currently, it is not supported to pass multiple `DeviceContext` parameters to one Kernel. At present, such a requirement is considered unreasonable. + - The parameter list is consistent with the API. If there is other special information that needs to be passed into the Kernel, pass it through the `Context`. + - Then all input Tensors and input Attributes are passed in with const &, and POD types are passed in directly by value. + - The input Tensor is a specific Tensor type, such as `DenseTensor` or `SelectedRows`, not the Tensor of the external interface API. + - Finally, the Tensor return value of the function, passed in as a pointer. + - In order to make the mechanism more flexible and allow the kernel to adapt to more scenarios, the declaration of flexible types of input, output and parameters will be allowed subsequently to adapt to non-Tensor input, output and Tensor Attribute. - The internal implementation of the function is determined on demand: - - Short term: - - Migrate the implementation of the existing `OpKernel` to the specific device Kernel. - - Abstract the implementation of `OpKernel` with common devices into functions, which are called by multiple device Kernels. - - Long term: - - The complex kernel directly calls the basic kernel to complete the calculation, encourages kernel reuse, thus simplifies the code. + - Short term: + - Migrate the implementation of the existing `OpKernel` to the specific device Kernel. + - Abstract the implementation of `OpKernel` with common devices into functions, which are called by multiple device Kernels. + - Long term: + - The complex kernel directly calls the basic kernel to complete the calculation, encourages kernel reuse, thus simplifies the code. > FAQ: ->- Why does the first parameter need to be `DeviceContext`? Why must this parameter be passed in? +> - Why does the first parameter need to be `DeviceContext`? Why must this parameter be passed in? + - The PHI kernel requires a pure function format. The variables used in the function are passed in through parameters or created inside the function, global singletons are not allowed inside the function. In order to adapt to various kernel requirements, the `DeviceContext` parameter that stores context information is necessary. ->- Why are two template parameters needed? + +> - Why are two template parameters needed? + - In order to efficiently support the reusing of device-independent kernels. If we want to implement a Fourier transform `fft` kernel, assuming that the kernel can be derived by combining the basic kernels, the form of `Xxx()` can avoid dynamically redistributing devices. ##### 2.3.4.3 Kernel implementation > Highlights of this section: +> > 1. Kernel focuses on computing logic without mixing scheduling logic. > 2. Kernel is fine-grained enough, with clear boundaries, no optional parameters, easy to reuse. @@ -531,13 +539,14 @@ In addition to the change of kernel form from structure format to functional for 2. 
In the PHI kernel, memory allocation for the output Tensor must use the `ctx.Alloc` or `ctx.HostAlloc` method; the original `mutable_data` is no longer used to apply for memory.

> FAQ
+>
> 1. Why is `mutable_data` replaced by `ctx.Alloc`?
> Answer: Because the global method `memory::AllocShared` called in the original `mutable_data` method uses a global singleton for memory allocation, which does not conform to the pure function design principle mentioned above. In terms of business requirements, if a singleton is used in the kernel to determine the way of memory allocation, then in the multi-threaded environment of inference, different threads will not be able to flexibly specify different memory allocation ways.
-

##### 2.3.4.4 Kernel registration

> Highlights of this section:
+>
> 1. Kernel needs to expose all its key information to the framework and record its input, output and attribute information, otherwise it will lead to unclear boundaries between framework scheduling and Kernel calculation.

When a fluid Kernel is registered, only the `place`, `layout`, `dtype`, `input` and `output` of the Kernel are recorded and managed by `ExecutionContext`, and there is no corresponding information record. Now the kernel needs to be changed to a functional type. The input, output and attributes of each function are clear. We hope to record the information of each input and output here, which is also compatible with paddle-lite scheduling.

Meanwhile, we need to simplify the writing method of Kernel registration. The existing writing methods have the following problems:

1. There is a lot of redundant information in the Kernel registration method of fluid. Taking `scale` as an example, you can see that in addition to the last data type of each kernel, the preceding function names and `DeviceContext` specialization information are redundant.

   ```c++
   REGISTER_OP_CPU_KERNEL(
       scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext,
                        paddle::platform::bfloat16>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
       ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
   ```

2. Paddle-Lite's kernel registration method declares input and output information for each Kernel, but since the kernel of each data type is different, it will also cause redundancy in the writing method. As you can see in the following code, except for the data type, other information is basically redundant.

   ```c++
   #ifdef LITE_BUILD_EXTRA
   using scale_int32_f =
       paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kFloat)>;
   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int32_f, int32)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
       .Finalize();

   using scale_int64_f =
       paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kFloat)>;
   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int64_f, int64)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
       .Finalize();
   #endif  // LITE_BUILD_EXTRA

   #ifdef ENABLE_ARM_FP16
   using scale_float16 =
       paddle::lite::kernels::arm::ScaleCompute<float16_t, PRECISION(kFP16)>;
   REGISTER_LITE_KERNEL(scale, kARM, kFP16, kNCHW, scale_float16, def)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
       .Finalize();

   #endif  // ENABLE_ARM_FP16

   using scale_float =
       paddle::lite::kernels::arm::ScaleCompute<float, PRECISION(kFloat)>;
   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
       .Finalize();

   using scale_int32 =
       paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kInt32)>;
   REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
       .Finalize();

   using scale_int64 =
       paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kInt64)>;
   REGISTER_LITE_KERNEL(scale, kARM, kInt64, kNCHW, scale_int64, def)
       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
       .Finalize();
   ```

Therefore, in this design, we do not want to continue to maintain this redundant writing method. We hope that the writing method of kernel registration is concise enough, and at the same time, it can flexibly meet the requirements of Kernel input and output information configuration.

@@ -655,6 +664,7 @@ In addition, only basic template adaptation has been implemented at present, and

##### 2.3.4.5 Kernel management

> Highlights of this section:
+>
> 1. Introduce the design of the current Kernel management components

For the management of the new form of Kernel, described as follows:

- `KernelKey` is similar to the original `OpKernelType`, but the `place` and `library_type` fields are combined into one field called `Backend`, because the original `LibraryType` is a limited enumeration class that is strongly related to place, so keeping them split would only increase the cost of understanding.
- `Kernel` holds more information than the original `OpKernel`. In addition to the Function used during execution, it also holds information about the specific parameters, namely `KernelArgsDef`. For Tensor-type inputs and outputs, it saves the Tensor type information, device, data type, and data layout. For Attribute-type inputs and outputs, it saves the type information.

-
#### 2.3.5 Kernel Compilation and Dependencies

> Highlights of this section:
+>
> 1. Introduce the compilation design of the kernel.
> 2. Introduce the establishment of kernel dependencies.

@@ -714,8 +724,9 @@ The original `InferShape` of fluid Op is the same as `OpKernel`, has the problem
 We also rewrite `InferShape` into a functional form, which supports different Ops calling the same `InferShape` function; this improves ease of use and reduces maintenance costs.

> FAQ:
+>
> 1. Why call it `InferMeta` instead of continuing to call it `InferShape`?
> Answer: The `Meta` of `InferMeta` comes from the `meta` member in `DenseTensor`. In PHI, an op has two components, `InferMeta` and `Kernel`. `InferMeta` covers the functions of `InferShape`, but it is not limited to `InferShape`. In addition to the inference of dims and lod, `InferMeta` also infers dtype and layout, which is different from the original.

##### 2.3.6.1 InferMeta related design

@@ -757,8 +768,8 @@ The purpose of using `MetaTensor` is to mask multiple Tensor types, and to be co

 For the basic design of `MetaTensor`, see `paddle/phi/core/meta_tensor.h`. There is a pointer member `TensorBase` in the base class `MetaTensor`, so it can be compatible with `DenseTensor`, `SelectedRows`, `SparseCsrTensor` and other types in PHI.

-
> Note:
> Only the content related to the design of PHI itself is covered in this README.
If you want to know more about the design of how phi and fluid are compatible, please refer to: +> > 1. [Paddle HIgh reusability operator library (PHI) Design Document (CN Version)](https://github.com/PaddlePaddle/docs/blob/develop/docs/design/phi/design_cn.md) > 2. [Paddle HIgh reusability operator library (PHI) Design Document (EN Version)](https://github.com/PaddlePaddle/docs/blob/develop/docs/design/phi/design_en.md) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index ddbacc0884a2a3..c6f190189a67f4 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -330,9 +330,7 @@ def get_pr_ut(self): if filename.startswith(PADDLE_ROOT + 'python/'): file_list.append(filename) elif filename.startswith(PADDLE_ROOT + 'paddle/'): - if filename.startswith(PADDLE_ROOT + 'paddle/infrt'): - filterFiles.append(filename) - elif filename.startswith(PADDLE_ROOT + 'paddle/scripts'): + if filename.startswith(PADDLE_ROOT + 'paddle/scripts'): if filename.startswith( ( PADDLE_ROOT + 'paddle/scripts/paddle_build.sh', From aa35331f11b8ecb1d9c285fafdb9ed239a4d98c9 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Mon, 10 Apr 2023 11:39:55 +0800 Subject: [PATCH 009/156] register fluid kerenls to phi [part 7] (#52577) * update * fix bug * fix ci-windows-openblas * fix test_partial_sum_op * fix codestyle --- .../collective/partial_allgather_op.cc | 15 +++-- .../collective/partial_allgather_op.cu.cc | 20 ++++--- .../collective/partial_allgather_op.h | 2 +- .../operators/collective/partial_recv_op.cc | 15 +++-- .../collective/partial_recv_op.cu.cc | 20 ++++--- .../operators/collective/partial_recv_op.h | 2 +- .../operators/collective/partial_send_op.cc | 15 +++-- .../collective/partial_send_op.cu.cc | 20 ++++--- .../operators/collective/partial_send_op.h | 2 +- .../detection/polygon_box_transform_op.cc | 13 +++-- .../detection/polygon_box_transform_op.cu | 13 +++-- .../operators/metrics/precision_recall_op.cc | 10 ++-- .../operators/metrics/precision_recall_op.h | 2 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 15 +++-- .../fluid/operators/nccl/nccl_op_test.cu.cc | 9 ++- paddle/fluid/operators/nce_op.cc | 11 ++-- paddle/fluid/operators/nce_op.h | 4 +- paddle/fluid/operators/nop_op.cc | 10 ++-- paddle/fluid/operators/number_count_op.cc | 7 +-- paddle/fluid/operators/number_count_op.cu | 5 +- paddle/fluid/operators/number_count_op.h | 2 +- .../optimizers/proximal_adagrad_op.cc | 4 +- .../optimizers/proximal_adagrad_op.cu | 4 +- .../optimizers/proximal_adagrad_op.h | 2 +- .../operators/optimizers/proximal_gd_op.cc | 5 +- .../operators/optimizers/proximal_gd_op.cu | 4 +- .../operators/optimizers/proximal_gd_op.h | 2 +- paddle/fluid/operators/pad2d_op.cc | 17 +++--- paddle/fluid/operators/pad2d_op.cu | 30 ++++++---- .../fluid/operators/pad_constant_like_op.cc | 58 +++++++++++-------- paddle/fluid/operators/pad_constant_like_op.h | 4 +- paddle/fluid/operators/partial_concat_op.cc | 27 +++++---- paddle/fluid/operators/partial_concat_op.cu | 36 +++++++----- paddle/fluid/operators/partial_concat_op.h | 4 +- paddle/fluid/operators/partial_sum_op.cc | 27 +++++---- paddle/fluid/operators/partial_sum_op.cu | 19 +----- paddle/fluid/operators/partial_sum_op.h | 4 +- .../operators/positive_negative_pair_op.cc | 11 ++-- .../operators/positive_negative_pair_op.h | 2 +- paddle/fluid/operators/prroi_pool_op.cc | 27 +++++---- paddle/fluid/operators/prroi_pool_op.cu | 19 +++--- paddle/fluid/operators/prroi_pool_op.h | 4 +- .../operators/prune_gate_by_capacity_op.cc | 10 ++-- 
.../operators/prune_gate_by_capacity_op.cu | 10 ++-- .../operators/prune_gate_by_capacity_op.h | 2 +- .../operators/pull_box_extended_sparse_op.cc | 19 +++--- .../operators/pull_box_extended_sparse_op.cu | 23 +++++--- .../operators/pull_box_extended_sparse_op.h | 4 +- paddle/fluid/operators/pull_box_sparse_op.cc | 7 ++- paddle/fluid/operators/pull_box_sparse_op.h | 4 +- paddle/fluid/operators/pull_box_sparse_op.kps | 17 ++---- .../fluid/operators/pull_gpups_sparse_op.cc | 19 ++++-- .../fluid/operators/pull_gpups_sparse_op.cu | 22 ++++--- paddle/fluid/operators/pull_gpups_sparse_op.h | 4 +- paddle/fluid/operators/pull_sparse_op.cc | 6 +- paddle/fluid/operators/pull_sparse_op.h | 4 +- paddle/fluid/operators/pull_sparse_v2_op.cc | 6 +- paddle/fluid/operators/pull_sparse_v2_op.h | 4 +- paddle/fluid/operators/unity_build_rule.cmake | 1 - 59 files changed, 387 insertions(+), 297 deletions(-) diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc index 00610768059230..7f9e5f3f3e37f0 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -85,9 +85,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, ops::PartialAllGatherOpInplaceInferer) -REGISTER_OP_CPU_KERNEL(partial_allgather, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel, - ops::PartialAllGatherOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_allgather, + CPU, + ALL_LAYOUT, + ops::PartialAllGatherOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index ce5a5438eff555..2374f4a4aed823 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -102,12 +102,16 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_allgather, - ops::PartialAllGatherOpCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_allgather, + GPU, + ALL_LAYOUT, + ops::PartialAllGatherOpCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialAllGatherOpCUDAKernel, + plat::bfloat16, #endif - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel, - ops::PartialAllGatherOpCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h index 7e9c85214cf318..6b827a2656f29e 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.h +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PartialAllGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc index 14cca68cf16ab5..5cd4a72ea7ea9f 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -129,9 +129,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_recv, ops::PartialRecvOp, ops::PartialRecvOpMaker); -REGISTER_OP_CPU_KERNEL(partial_recv, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel, - ops::PartialRecvOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_recv, + CPU, + ALL_LAYOUT, + ops::PartialRecvOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index 306175d1ca7af8..b0df94194e4f87 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -23,7 +23,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -118,12 +118,16 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_recv, - ops::PartialRecvOpCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_recv, + GPU, + ALL_LAYOUT, + ops::PartialRecvOpCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialRecvOpCUDAKernel, + plat::bfloat16, #endif - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel, - ops::PartialRecvOpCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h index d64fa39939c2d6..fdf3f02b0d679f 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.h +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -24,7 +24,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PartialRecvOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc index a45cc6ddde6438..936336ce74ad52 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -94,9 +94,12 @@ REGISTER_OP_WITHOUT_GRADIENT(partial_send, ops::PartialSendOp, ops::PartialSendMaker); -REGISTER_OP_CPU_KERNEL(partial_send, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel, - ops::PartialSendOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(partial_send, + CPU, + ALL_LAYOUT, + ops::PartialSendOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index afac7f963fa0dc..dc24ea01fc98e9 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSendCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -117,12 +117,16 @@ class PartialSendCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(partial_send, - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel, +PD_REGISTER_STRUCT_KERNEL(partial_send, + GPU, + ALL_LAYOUT, + ops::PartialSendCUDAKernel, + float, + double, #if NCCL_VERSION_CODE >= 21000 - ops::PartialSendCUDAKernel, + plat::bfloat16, #endif - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel, - ops::PartialSendCUDAKernel); + int, + int64_t, + plat::float16) { +} diff --git a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h index 7550ac40078c40..773125be7d40f0 100644 --- a/paddle/fluid/operators/collective/partial_send_op.h +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -25,7 +25,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSendOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc index c331cdc97f0005..936480a9e23ddb 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cc +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PolygonBoxTransformCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -111,7 +111,10 @@ REGISTER_OPERATOR( ops::PolygonBoxTransformOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - polygon_box_transform, - ops::PolygonBoxTransformCPUKernel, - ops::PolygonBoxTransformCPUKernel); + +PD_REGISTER_STRUCT_KERNEL(polygon_box_transform, + CPU, + ALL_LAYOUT, + ops::PolygonBoxTransformCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index de43f2d62b4554..4f182464f77b50 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -38,7 +38,7 @@ __global__ void PolygonBoxTransformKernel( } } -template +template class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -73,7 +73,10 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - polygon_box_transform, - paddle::operators::PolygonBoxTransformOpCUDAKernel, - paddle::operators::PolygonBoxTransformOpCUDAKernel); +namespace ops = paddle::operators; +PD_REGISTER_STRUCT_KERNEL(polygon_box_transform, + GPU, + ALL_LAYOUT, + ops::PolygonBoxTransformOpCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc index 0652151320d819..413cd8546011be 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.cc +++ b/paddle/fluid/operators/metrics/precision_recall_op.cc @@ -242,7 +242,9 @@ REGISTER_OPERATOR( ops::PrecisionRecallOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - precision_recall, - ops::PrecisionRecallKernel, - ops::PrecisionRecallKernel); +PD_REGISTER_STRUCT_KERNEL(precision_recall, + CPU, + ALL_LAYOUT, + ops::PrecisionRecallKernel, + float, + double) {} diff --git a/paddle/fluid/operators/metrics/precision_recall_op.h b/paddle/fluid/operators/metrics/precision_recall_op.h index bec8bba09ad1a1..6eef5658c5c007 100644 --- a/paddle/fluid/operators/metrics/precision_recall_op.h +++ b/paddle/fluid/operators/metrics/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index d328329e1c24a8..7dae16afafdf11 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -52,7 +52,7 @@ static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { return it->second; } -template +template class NCCLAllReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -87,7 +87,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { } }; -template +template class NCCLReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { 
@@ -128,7 +128,7 @@ class NCCLReduceKernel : public framework::OpKernel { } }; -template +template class NCCLBcastKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -172,6 +172,9 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); +PD_REGISTER_STRUCT_KERNEL( + ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 8d5528716f4a92..87c0708e12d398 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -31,9 +31,12 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_CUDA_ONLY_OP(ncclAllReduce); -USE_CUDA_ONLY_OP(ncclReduce); -USE_CUDA_ONLY_OP(ncclBcast); +USE_OP_ITSELF(ncclAllReduce); +USE_OP_ITSELF(ncclReduce); +USE_OP_ITSELF(ncclBcast); +PD_DECLARE_KERNEL(ncclAllReduce, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(ncclReduce, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(ncclBcast, GPU, ALL_LAYOUT); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 286c8512781179..9c9055d1987e12 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -320,9 +320,8 @@ REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference, ops::NCEGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(nce, - ops::NCEKernel, - ops::NCEKernel); -REGISTER_OP_CPU_KERNEL(nce_grad, - ops::NCEGradKernel, - ops::NCEGradKernel); + +PD_REGISTER_STRUCT_KERNEL(nce, CPU, ALL_LAYOUT, ops::NCEKernel, float, double) { +} +PD_REGISTER_STRUCT_KERNEL( + nce_grad, CPU, ALL_LAYOUT, ops::NCEGradKernel, float, double) {} diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 4b9fe86b225653..188568ec323ba3 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -75,7 +75,7 @@ void PrepareSamples(const framework::ExecutionContext &context, } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -245,7 +245,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc index 709b1f4f1f0209..69f0bfb2abcd31 100644 --- a/paddle/fluid/operators/nop_op.cc +++ b/paddle/fluid/operators/nop_op.cc @@ -45,7 +45,7 @@ establish the dependency between input and output tensors. 
} }; -template +template class NopKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override {} @@ -58,8 +58,8 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); -REGISTER_OP_CPU_KERNEL(nop, ops::NopKernel); +PD_REGISTER_STRUCT_KERNEL(nop, CPU, ALL_LAYOUT, ops::NopKernel, float) {} -REGISTER_OP_CUDA_KERNEL(nop, ops::NopKernel); - -REGISTER_OP_NPU_KERNEL(nop, ops::NopKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_STRUCT_KERNEL(nop, GPU, ALL_LAYOUT, ops::NopKernel, float) {} +#endif diff --git a/paddle/fluid/operators/number_count_op.cc b/paddle/fluid/operators/number_count_op.cc index e636bc98bfca5a..bc566ca5fbfa75 100644 --- a/paddle/fluid/operators/number_count_op.cc +++ b/paddle/fluid/operators/number_count_op.cc @@ -58,10 +58,9 @@ class NumberCountOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL(number_count, - ops::NumberCountOpCPUKernel, - ops::NumberCountOpCPUKernel); - REGISTER_OP_WITHOUT_GRADIENT(number_count, ops::NumberCountOp, ops::NumberCountOpMaker); + +PD_REGISTER_STRUCT_KERNEL( + number_count, CPU, ALL_LAYOUT, ops::NumberCountOpCPUKernel, int, int64_t) {} diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index fdab03698711c3..b9afffd7887d49 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -79,7 +79,7 @@ __global__ void NumberCount(const T* numbers, } } -template +template class NumberCountOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -111,4 +111,5 @@ class NumberCountOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(number_count, ops::NumberCountOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL( + number_count, GPU, ALL_LAYOUT, ops::NumberCountOpCUDAKernel, int64_t) {} diff --git a/paddle/fluid/operators/number_count_op.h b/paddle/fluid/operators/number_count_op.h index ded7ea6eec54f7..e95336ae2a3a8e 100644 --- a/paddle/fluid/operators/number_count_op.h +++ b/paddle/fluid/operators/number_count_op.h @@ -24,7 +24,7 @@ namespace paddle { namespace operators { -template +template class NumberCountOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc index 076f5137cab92f..3261e96cbbeca4 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cc @@ -133,5 +133,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); -REGISTER_OP_CPU_KERNEL(proximal_adagrad, - ops::ProximalAdagradOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_adagrad, CPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu index c338f4cc717a57..0a79dcd425f128 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu @@ -13,5 +13,5 @@ specific language governing 
permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(proximal_adagrad, - ops::ProximalAdagradOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_adagrad, GPU, ALL_LAYOUT, ops::ProximalAdagradOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h index 72eccd17e4489e..973d870d14f31b 100644 --- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cc b/paddle/fluid/operators/optimizers/proximal_gd_op.cc index d7e01aa07109ea..08cc29ce9eb8db 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cc +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cc @@ -106,5 +106,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); -REGISTER_OP_CPU_KERNEL(proximal_gd, - ops::ProximalGDOpKernel); + +PD_REGISTER_STRUCT_KERNEL( + proximal_gd, CPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.cu b/paddle/fluid/operators/optimizers/proximal_gd_op.cu index edc911134c7293..ef1edfc2ee458f 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu @@ -13,5 +13,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/optimizers/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(proximal_gd, - ops::ProximalGDOpKernel); +PD_REGISTER_STRUCT_KERNEL( + proximal_gd, GPU, ALL_LAYOUT, ops::ProximalGDOpKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/proximal_gd_op.h b/paddle/fluid/operators/optimizers/proximal_gd_op.h index 49cf7b68bd32af..1945ef5bf6b778 100644 --- a/paddle/fluid/operators/optimizers/proximal_gd_op.h +++ b/paddle/fluid/operators/optimizers/proximal_gd_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 91eeed0e9008ec..e29981d35b41f9 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -402,7 +402,7 @@ static inline void GetPaddings(int* paddings, } } -template +template class Pad2dCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -520,7 +520,7 @@ class Pad2dCPUKernel : public framework::OpKernel { } }; -template +template class Pad2dGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -873,11 +873,8 @@ REGISTER_OPERATOR(pad2d, REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, ops::Pad2dOpGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(pad2d, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel, - ops::Pad2dCPUKernel); -REGISTER_OP_CPU_KERNEL(pad2d_grad, - ops::Pad2dGradCPUKernel, - ops::Pad2dGradCPUKernel); + +PD_REGISTER_STRUCT_KERNEL( + pad2d, CPU, ALL_LAYOUT, ops::Pad2dCPUKernel, float, double, int, int64_t) {} +PD_REGISTER_STRUCT_KERNEL( + pad2d_grad, CPU, ALL_LAYOUT, ops::Pad2dGradCPUKernel, float, double) {} diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 7b0dd2149dead5..b8263ea6bb1692 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -361,7 +361,7 @@ static inline void GetPaddings(int* paddings, } } -template +template class Pad2dCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -489,7 +489,7 @@ class Pad2dCUDAKernel : public framework::OpKernel { } }; -template +template class Pad2dGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -618,13 +618,19 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(pad2d, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel, - ops::Pad2dCUDAKernel); -REGISTER_OP_CUDA_KERNEL(pad2d_grad, - ops::Pad2dGradCUDAKernel, - ops::Pad2dGradCUDAKernel, - ops::Pad2dGradCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(pad2d, + GPU, + ALL_LAYOUT, + ops::Pad2dCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} +PD_REGISTER_STRUCT_KERNEL(pad2d_grad, + GPU, + ALL_LAYOUT, + ops::Pad2dGradCUDAKernel, + float, + double, + plat::float16) {} diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 9b08bb3fc1e1c6..d00cefab450454 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -243,26 +243,38 @@ REGISTER_OPERATOR(pad_constant_like, ops::PadConstantLikeOpGradMaker); REGISTER_OPERATOR(pad_constant_like_grad, ops::PadConstantLikeOpGrad); -REGISTER_OP_CPU_KERNEL(pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); -REGISTER_OP_CPU_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - 
ops::PadConstantLikeGradKernel); - -REGISTER_OP_CUDA_KERNEL(pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); +PD_REGISTER_STRUCT_KERNEL(pad_constant_like, + CPU, + ALL_LAYOUT, + ops::PadConstantLikeKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, + CPU, + ALL_LAYOUT, + ops::PadConstantLikeGradKernel, + float, + double, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_STRUCT_KERNEL(pad_constant_like, + GPU, + ALL_LAYOUT, + ops::PadConstantLikeKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(pad_constant_like_grad, + GPU, + ALL_LAYOUT, + ops::PadConstantLikeGradKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index ba87bd3ef18182..f6162037fbd56f 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PadConstantLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -61,7 +61,7 @@ class PadConstantLikeKernel : public framework::OpKernel { } }; -template +template class PadConstantLikeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index 1fb9dceb4150c0..f2f3da9f0511f1 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -202,14 +202,19 @@ REGISTER_OPERATOR(partial_concat, REGISTER_OPERATOR(partial_concat_grad, ops::PartialConcatGradOp); -REGISTER_OP_CPU_KERNEL(partial_concat, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel, - ops::PartialConcatKernel); - -REGISTER_OP_CPU_KERNEL(partial_concat_grad, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel, - ops::PartialConcatGradientOpKernel); +PD_REGISTER_STRUCT_KERNEL(partial_concat, + CPU, + ALL_LAYOUT, + ops::PartialConcatKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, + CPU, + ALL_LAYOUT, + ops::PartialConcatGradientOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index f4acf68dcbc708..ffef094fa96dd0 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -65,7 +65,7 @@ __global__ void ConcatPartialGradCUDAKernel(T **in, } } -template +template class PartialConcatOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -146,7 +146,7 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { } }; -template +template class PartialConcatGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -231,16 +231,22 @@ class 
PartialConcatGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(partial_concat, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel, - ops::PartialConcatOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(partial_concat_grad, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel, - ops::PartialConcatGradOpCUDAKernel); + +PD_REGISTER_STRUCT_KERNEL(partial_concat, + GPU, + ALL_LAYOUT, + ops::PartialConcatOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} +PD_REGISTER_STRUCT_KERNEL(partial_concat_grad, + GPU, + ALL_LAYOUT, + ops::PartialConcatGradOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index 407b57e3a82814..fb0d17aa97b842 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -39,7 +39,7 @@ static inline int64_t ComputeStartIndex(int64_t start_index, int64_t size) { return start_index; } -template +template class PartialConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -84,7 +84,7 @@ class PartialConcatKernel : public framework::OpKernel { } }; -template +template class PartialConcatGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index 9ef7ac0a21a481..4b130306825c67 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -204,14 +204,19 @@ REGISTER_OPERATOR(partial_sum, REGISTER_OPERATOR(partial_sum_grad, ops::PartialSumGradOp); -REGISTER_OP_CPU_KERNEL(partial_sum, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel, - ops::PartialSumKernel); - -REGISTER_OP_CPU_KERNEL(partial_sum_grad, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel, - ops::PartialSumGradientOpKernel); +PD_REGISTER_STRUCT_KERNEL(partial_sum, + CPU, + ALL_LAYOUT, + ops::PartialSumKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(partial_sum_grad, + CPU, + ALL_LAYOUT, + ops::PartialSumGradientOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 093e0032b3cb9b..a38ec4c8394691 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -70,7 +70,7 @@ __global__ void PartialSumGradCUDAKernel(T **res_grad, } } -template +template class PartialSumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -144,7 +144,7 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } }; -template +template class PartialSumGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -233,18 +233,3 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(partial_sum, - ops::PartialSumOpCUDAKernel, - 
ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel, - ops::PartialSumOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(partial_sum_grad, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel, - ops::PartialSumGradOpCUDAKernel); diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index fa4cc19d5e2c3f..1b88eafae77db8 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class PartialSumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -57,7 +57,7 @@ class PartialSumKernel : public framework::OpKernel { } }; -template +template class PartialSumGradientOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index 3f4d8125671e4b..72236c012c357c 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -253,7 +253,10 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair, ops::PositiveNegativePairOp, ops::PositiveNegativePairOpMaker); -REGISTER_OP_CPU_KERNEL( - positive_negative_pair, - ops::PositiveNegativePairKernel, - ops::PositiveNegativePairKernel); + +PD_REGISTER_STRUCT_KERNEL(positive_negative_pair, + CPU, + ALL_LAYOUT, + ops::PositiveNegativePairKernel, + float, + double) {} diff --git a/paddle/fluid/operators/positive_negative_pair_op.h b/paddle/fluid/operators/positive_negative_pair_op.h index 745b793f51147a..0cddbcc3abf853 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.h +++ b/paddle/fluid/operators/positive_negative_pair_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc index d1c455331b4e78..0f0dbf3c6888a8 100644 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -195,13 +195,20 @@ REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolGradMaker, ops::PRROIPoolGradMaker); REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); -REGISTER_OP_CPU_KERNEL(prroi_pool, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel, - ops::CPUPRROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL(prroi_pool_grad, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel, - ops::CPUPRROIPoolGradOpKernel); + +PD_REGISTER_STRUCT_KERNEL(prroi_pool, + CPU, + ALL_LAYOUT, + ops::CPUPRROIPoolOpKernel, + float, + double, + int, + int64_t) {} +PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, + CPU, + ALL_LAYOUT, + ops::CPUPRROIPoolGradOpKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index d1aa1d37d0479a..5d1243964279b0 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -211,7 +211,7 @@ __global__ void GPUPRROIPoolBackward(const int nthreads, } } -template +template class GPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -314,7 +314,7 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { } }; -template +template class GPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -428,9 +428,12 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(prroi_pool, - ops::GPUPRROIPoolOpKernel, - ops::GPUPRROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL(prroi_pool_grad, - ops::GPUPRROIPoolGradOpKernel, - ops::GPUPRROIPoolGradOpKernel); + +PD_REGISTER_STRUCT_KERNEL( + prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, + GPU, + ALL_LAYOUT, + ops::GPUPRROIPoolGradOpKernel, + float, + double) {} diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 07a2bde7e94e46..e2417a071ce886 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -327,7 +327,7 @@ inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w, (*this_out_grad)); } -template +template class CPUPRROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -481,7 +481,7 @@ class CPUPRROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUPRROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cc b/paddle/fluid/operators/prune_gate_by_capacity_op.cc index 388b65f3dd6743..c1112b13feb50c 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cc +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cc @@ -126,7 +126,9 @@ REGISTER_OP_WITHOUT_GRADIENT(prune_gate_by_capacity, ops::PruneGateByCapacityOp, 
ops::PruneGateByCapacityOpMaker); -REGISTER_OP_CPU_KERNEL( - prune_gate_by_capacity, - ops::PruneGateByCapacityCPUKernel, - ops::PruneGateByCapacityCPUKernel); +PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity, + CPU, + ALL_LAYOUT, + ops::PruneGateByCapacityCPUKernel, + int, + int64_t) {} diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 38baaeb809c11c..510de11029f0c0 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -105,7 +105,7 @@ static void VisitDataType(phi::DataType type, Visitor visitor) { } } -template +template class PruneGateByCapacityCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -127,6 +127,8 @@ class PruneGateByCapacityCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - prune_gate_by_capacity, - ops::PruneGateByCapacityCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(prune_gate_by_capacity, + GPU, + ALL_LAYOUT, + ops::PruneGateByCapacityCUDAKernel, + int64_t) {} diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.h b/paddle/fluid/operators/prune_gate_by_capacity_op.h index d7a00bd40d786f..4420fae6ef5e3e 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.h +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class PruneGateByCapacityCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc index 7b949fa4338c72..f0799f75862bc4 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc @@ -151,10 +151,15 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_extended_sparse, - ops::PullBoxExtendedSparseCPUKernel, - ops::PullBoxExtendedSparseCPUKernel); - -REGISTER_OP_CPU_KERNEL(push_box_extended_sparse, - ops::PushBoxExtendedSparseCPUKernel, - ops::PushBoxExtendedSparseCPUKernel); +PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, + CPU, + ALL_LAYOUT, + ops::PullBoxExtendedSparseCPUKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, + CPU, + ALL_LAYOUT, + ops::PushBoxExtendedSparseCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index cfa317a3d392fb..570c367c93182d 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -27,7 +27,7 @@ class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { } }; -template +template class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -38,9 +38,16 @@ class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; 
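// A minimal sketch of the registration pattern this series migrates to,
// assuming (as the surrounding hunks suggest) that PD_REGISTER_STRUCT_KERNEL
// takes (op_name, BACKEND, LAYOUT, kernel_class, dtypes...) and that the new
// kernel class template order is <typename T, typename DeviceContext>.
// Names here are illustrative only and not part of this patch:
//
//   template <typename T, typename DeviceContext>
//   class MyOpKernel : public framework::OpKernel<T> {
//    public:
//     void Compute(const framework::ExecutionContext& ctx) const override {}
//   };
//
//   // One variadic registration replaces one REGISTER_OP_*_KERNEL entry
//   // per dtype:
//   PD_REGISTER_STRUCT_KERNEL(my_op, GPU, ALL_LAYOUT, ops::MyOpKernel,
//                             float, double) {}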
-REGISTER_OP_CUDA_KERNEL(pull_box_extended_sparse, - ops::PullBoxExtendedSparseCUDAKernel, - ops::PullBoxExtendedSparseCUDAKernel); -REGISTER_OP_CUDA_KERNEL(push_box_extended_sparse, - ops::PushBoxExtendedSparseCUDAKernel, - ops::PushBoxExtendedSparseCUDAKernel); + +PD_REGISTER_STRUCT_KERNEL(pull_box_extended_sparse, + GPU, + ALL_LAYOUT, + ops::PullBoxExtendedSparseCUDAKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_box_extended_sparse, + GPU, + ALL_LAYOUT, + ops::PushBoxExtendedSparseCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h index eff3bfd2a5f3c3..b9508a279505ea 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.h +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -108,7 +108,7 @@ static void PushBoxExtendedSparseFunctor( #endif } -template +template class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -116,7 +116,7 @@ class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { } }; -template +template class PushBoxExtendedSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc index c58a176d526355..a8f91c85485c7c 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.cc +++ b/paddle/fluid/operators/pull_box_sparse_op.cc @@ -135,5 +135,8 @@ REGISTER_OPERATOR(pull_box_sparse, ops::PushBoxSparseOpMaker, ops::PushBoxSparseOpMaker); REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp); -REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); -REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseKernel); + +PD_REGISTER_STRUCT_KERNEL( + pull_box_sparse, CPU, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_box_sparse, CPU, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {} diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index dd41fd6ff0f4f2..1ebfa11a2b2e65 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -113,7 +113,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { #endif } -template +template class PullBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -121,7 +121,7 @@ class PullBoxSparseKernel : public framework::OpKernel { } }; -template +template class PushBoxSparseKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps index 4b0580c5e1ab5c..1e4a3640bdac3f 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.kps +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -45,16 +45,7 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plat = paddle::platform; -#ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(pull_box_sparse, - KP, - plat::XPUPlace, - ops::PullBoxSparseKernel); -REGISTER_OP_KERNEL(push_box_sparse, - KP, - plat::XPUPlace, - ops::PushBoxSparseKernel); -#else -REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); -REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseKernel); -#endif +PD_REGISTER_STRUCT_KERNEL( + pull_box_sparse, KPS, ALL_LAYOUT, ops::PullBoxSparseKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_box_sparse, KPS, ALL_LAYOUT, ops::PushBoxSparseKernel, float) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cc b/paddle/fluid/operators/pull_gpups_sparse_op.cc index 821cfdab6f10c1..afaa9af3fda20a 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cc +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cc @@ -145,9 +145,16 @@ REGISTER_OPERATOR(pull_gpups_sparse, ops::PushGpuPSSparseOpMaker, ops::PushGpuPSSparseOpMaker); REGISTER_OPERATOR(push_gpups_sparse, ops::PushGpuPSSparseOp); -REGISTER_OP_CPU_KERNEL(pull_gpups_sparse, - ops::PullGpuPSSparseCPUKernel, - ops::PullGpuPSSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_gpups_sparse, - ops::PushGpuPSSparseCPUKernel, - ops::PushGpuPSSparseCPUKernel) + +PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse, + CPU, + ALL_LAYOUT, + ops::PullGpuPSSparseCPUKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse, + CPU, + ALL_LAYOUT, + ops::PushGpuPSSparseCPUKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu index ff68c42c8eb1b1..a936d810216e61 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cu +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using phi::PADDLE_CUDA_NUM_THREADS; -template +template class PullGpuPSSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -28,7 +28,7 @@ class PullGpuPSSparseCUDAKernel : public framework::OpKernel { } }; -template +template class PushGpuPSSparseCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -39,9 +39,15 @@ class PushGpuPSSparseCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(pull_gpups_sparse, - ops::PullGpuPSSparseCUDAKernel, - ops::PullGpuPSSparseCUDAKernel) -REGISTER_OP_CUDA_KERNEL(push_gpups_sparse, - ops::PushGpuPSSparseCUDAKernel, - ops::PushGpuPSSparseCUDAKernel) +PD_REGISTER_STRUCT_KERNEL(pull_gpups_sparse, + GPU, + ALL_LAYOUT, + ops::PullGpuPSSparseCUDAKernel, + float, + double) {} +PD_REGISTER_STRUCT_KERNEL(push_gpups_sparse, + GPU, + ALL_LAYOUT, + ops::PushGpuPSSparseCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.h b/paddle/fluid/operators/pull_gpups_sparse_op.h index 2d844a4ce2bf09..d8fdadd99cbd46 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.h +++ b/paddle/fluid/operators/pull_gpups_sparse_op.h @@ -97,7 +97,7 @@ static void PushGpuPSSparseFunctor(const framework::ExecutionContext &ctx) { #endif } -template +template class PullGpuPSSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -105,7 +105,7 @@ class PullGpuPSSparseCPUKernel : public framework::OpKernel { } }; -template +template class 
PushGpuPSSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { diff --git a/paddle/fluid/operators/pull_sparse_op.cc b/paddle/fluid/operators/pull_sparse_op.cc index 7dc9ae98e0e41c..4850bf33ae89cd 100644 --- a/paddle/fluid/operators/pull_sparse_op.cc +++ b/paddle/fluid/operators/pull_sparse_op.cc @@ -143,5 +143,7 @@ REGISTER_OPERATOR(pull_sparse, ops::PushSparseOpMaker, ops::PushSparseOpMaker); REGISTER_OPERATOR(push_sparse, ops::PushSparseOp); -REGISTER_OP_CPU_KERNEL(pull_sparse, ops::PullSparseCPUKernel) -REGISTER_OP_CPU_KERNEL(push_sparse, ops::PushSparseCPUKernel) +PD_REGISTER_STRUCT_KERNEL( + pull_sparse, CPU, ALL_LAYOUT, ops::PullSparseCPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_sparse, CPU, ALL_LAYOUT, ops::PushSparseCPUKernel, float) {} diff --git a/paddle/fluid/operators/pull_sparse_op.h b/paddle/fluid/operators/pull_sparse_op.h index ecc3a5e1021dee..263511b65180da 100644 --- a/paddle/fluid/operators/pull_sparse_op.h +++ b/paddle/fluid/operators/pull_sparse_op.h @@ -66,7 +66,7 @@ void PushSparseFunctor(const framework::ExecutionContext& ctx) { &grads); } -template +template class PullSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -74,7 +74,7 @@ class PullSparseCPUKernel : public framework::OpKernel { } }; -template +template class PushSparseCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/pull_sparse_v2_op.cc b/paddle/fluid/operators/pull_sparse_v2_op.cc index 88a0ac86c2532d..993950c360c12c 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.cc +++ b/paddle/fluid/operators/pull_sparse_v2_op.cc @@ -135,5 +135,7 @@ REGISTER_OPERATOR(pull_sparse_v2, ops::PushSparseV2OpMaker, ops::PushSparseV2OpMaker); REGISTER_OPERATOR(push_sparse_v2, ops::PushSparseV2Op); -REGISTER_OP_CPU_KERNEL(pull_sparse_v2, ops::PullSparseV2CPUKernel) -REGISTER_OP_CPU_KERNEL(push_sparse_v2, ops::PushSparseV2CPUKernel) +PD_REGISTER_STRUCT_KERNEL( + pull_sparse_v2, CPU, ALL_LAYOUT, ops::PullSparseV2CPUKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + push_sparse_v2, CPU, ALL_LAYOUT, ops::PushSparseV2CPUKernel, float) {} diff --git a/paddle/fluid/operators/pull_sparse_v2_op.h b/paddle/fluid/operators/pull_sparse_v2_op.h index c24d0a4f338e7d..95ce7183857807 100644 --- a/paddle/fluid/operators/pull_sparse_v2_op.h +++ b/paddle/fluid/operators/pull_sparse_v2_op.h @@ -25,7 +25,7 @@ namespace paddle { namespace operators { -template +template class PullSparseV2CPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -33,7 +33,7 @@ class PullSparseV2CPUKernel : public framework::OpKernel { } }; -template +template class PushSparseV2CPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 8f9a2f92814d5d..7ca431e8ea5d10 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -202,7 +202,6 @@ register_unity_group( pad_op.cc) register_unity_group( cc - modified_huber_loss_op.cc partial_sum_op.cc pixel_shuffle_op.cc pool_op.cc From 891cf433e29455cd5740a6b5e59dad5109d826c8 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Mon, 10 Apr 2023 11:40:30 +0800 Subject: [PATCH 
010/156] add autogen code support for logcumsumexp op (#52682) --- paddle/fluid/operators/cum_op.cc | 78 ----------------------- paddle/phi/api/yaml/backward.yaml | 10 +++ paddle/phi/api/yaml/legacy_backward.yaml | 10 --- paddle/phi/api/yaml/legacy_ops.yaml | 9 --- paddle/phi/api/yaml/op_compat.yaml | 7 ++ paddle/phi/api/yaml/ops.yaml | 9 +++ paddle/phi/ops/compat/logcumsumexp_sig.cc | 39 ------------ 7 files changed, 26 insertions(+), 136 deletions(-) delete mode 100644 paddle/phi/ops/compat/logcumsumexp_sig.cc diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 2c42280c6d45c1..a886e0dbbe99b5 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -123,74 +123,6 @@ class CumsumCompositeGradOpMaker : public prim::CompositeGradOpMakerBase { this->RecoverOutputName(dx, dx_name); } }; - -class LogcumsumexpOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Input of logcumsumexp operator"); - AddOutput("Out", "Output of logcumsumexp operator"); - AddAttr("axis", - "The dimension to accumulate along. -1 means the last " - "dimension [default -1].") - .SetDefault(-1); - AddAttr( - "flatten", - "Whether to compute the logcumsumexp over the flattened array. " - "[default false].") - .SetDefault(false); - AddAttr("exclusive", - "Whether to perform exclusive logcumsumexp. [default false].") - .SetDefault(false); - AddAttr( - "reverse", - "If true, the logcumsumexp is performed in the reversed direction. " - "[default false].") - .SetDefault(false); - AddComment(R"DOC( -Returns the logarithm of the cumulative summation of the exponentiation of elements of input along the given axis. -By default, the first element of the result is the same of the first element of -the input. If exclusive is true, the first element of the result is the lowest finite value of the dtype of output tensor. 
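For example, with exclusive=false and reverse=false on a 1-D input x, out_i = log(sum_{j<=i} exp(x_j)); with exclusive=true the summation runs over j < i instead.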
-)DOC"); - } -}; - -class LogcumsumexpGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logcumsumexp"); - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logcumsumexp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "logcumsumexp"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } -}; - -template -class LogcumsumexpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("logcumsumexp_grad"); - grad_op->SetInput("X", this->Input("X")); - grad_op->SetInput("Out", this->Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttr("axis", PADDLE_GET_CONST(int, this->GetAttr("axis"))); - grad_op->SetAttr("flatten", - PADDLE_GET_CONST(bool, this->GetAttr("flatten"))); - grad_op->SetAttr("exclusive", - PADDLE_GET_CONST(bool, this->GetAttr("exclusive"))); - grad_op->SetAttr("reverse", - PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); - } -}; - } // namespace operators } // namespace paddle @@ -200,9 +132,6 @@ DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, PD_INFER_META(phi::CumScalarAxisInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR(logcumsumexp, - LogcumsumexpInferShapeFunctor, - PD_INFER_META(phi::CumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, @@ -210,13 +139,6 @@ REGISTER_OPERATOR(cumsum, ops::CumsumGradMaker, ops::CumsumGradMaker, CumsumInferShapeFunctor); -REGISTER_OPERATOR(logcumsumexp, - ops::CumOp, - ops::LogcumsumexpOpMaker, - ops::LogcumsumexpGradMaker, - ops::LogcumsumexpGradMaker, - LogcumsumexpInferShapeFunctor); -REGISTER_OPERATOR(logcumsumexp_grad, ops::LogcumsumexpGradOp); REGISTER_OPERATOR(cumsum_grad, ops::CumGradOp); REGISTER_OP_VERSION(cumsum).AddCheckpoint( diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index e0a12e13fb4e94..0a6062dd8294c5 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -966,6 +966,16 @@ func : log_softmax_grad data_type : out_grad +- backward_op : logcumsumexp_grad + forward : logcumsumexp(Tensor x, int axis=-1, bool flatten=false, bool exclusive=false, bool reverse=false) -> Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] + args : (Tensor x, Tensor out, Tensor out_grad, int axis, bool flatten, bool exclusive, bool reverse) + output : Tensor(x_grad) + kernel : + func : logcumsumexp_grad + - backward_op : logit_grad forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) args : (Tensor x, Tensor out_grad, float eps) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 1e11bc54b3f51c..b655f379cf71eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -556,16 +556,6 @@ no_need_buffer : bias optional : scale, bias -- backward_op : logcumsumexp_grad - forward : logcumsumexp(Tensor x, int axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [x] - args : (Tensor x, Tensor out, Tensor out_grad, int axis, bool flatten, bool exclusive, 
bool reverse) - output : Tensor(x_grad) - kernel : - func : logcumsumexp_grad - - backward_op : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index a689fbc17dfaf4..d827e7eabbfa7b 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -800,15 +800,6 @@ data_type : dtype backend : place -- op : logcumsumexp - args : (Tensor x, int axis, bool flatten, bool exclusive, bool reverse) - output : Tensor(out) - infer_meta : - func : CumInferMeta - kernel : - func : logcumsumexp - backward : logcumsumexp_grad - - op : logspace args : (Tensor start, Tensor stop, Tensor num, Tensor base, DataType dtype, Place place={}) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8a2ce29511f06c..552895cf25fde4 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1247,6 +1247,13 @@ extra : attrs : [bool use_mkldnn = false] +- op : logcumsumexp + backward : logcumsumexp_grad + inputs : + x : X + outputs : + out : Out + - op : logical_and inputs : {x : X, y : Y} diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 110fc1838aba10..40e47845fe9002 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1003,6 +1003,15 @@ data_type : x backward : log_softmax_grad +- op : logcumsumexp + args : (Tensor x, int axis=-1, bool flatten=false, bool exclusive=false, bool reverse=false) + output : Tensor(out) + infer_meta : + func : CumInferMeta + kernel : + func : logcumsumexp + backward : logcumsumexp_grad + - op : logical_and args : (Tensor x, Tensor y) output : Tensor(out) diff --git a/paddle/phi/ops/compat/logcumsumexp_sig.cc b/paddle/phi/ops/compat/logcumsumexp_sig.cc deleted file mode 100644 index 2c790903b63330..00000000000000 --- a/paddle/phi/ops/compat/logcumsumexp_sig.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature LogcumsumexpOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("logcumsumexp", - {"X"}, - {"axis", "flatten", "exclusive", "reverse"}, - {"Out"}); -} - -KernelSignature LogcumsumexpGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("logcumsumexp_grad", - {"X", "Out", "Out@GRAD"}, - {"axis", "flatten", "exclusive", "reverse"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(logcumsumexp, phi::LogcumsumexpOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(logcumsumexp_grad, - phi::LogcumsumexpGradOpArgumentMapping); From de44b3ac2c0387970fac139bd88dbd6a5db51265 Mon Sep 17 00:00:00 2001 From: chalsliu <45041955+chalsliu@users.noreply.github.com> Date: Mon, 10 Apr 2023 12:53:31 +0800 Subject: [PATCH 011/156] fix version message (#50318) --- cmake/experiments/cuda_module_loading_lazy.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake index f6da0771be0c08..9e1fa4fa4d961a 100644 --- a/cmake/experiments/cuda_module_loading_lazy.cmake +++ b/cmake/experiments/cuda_module_loading_lazy.cmake @@ -31,9 +31,9 @@ if(LINUX) message("cuda 11.7+ already support lazy module loading") return() endif() - if(${CUDA_VERSION} VERSION_LESS "11.2" AND ${CMAKE_CXX_COMPILER_VERSION} + if(${CUDA_VERSION} VERSION_LESS "12.0" AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0) - message("cuda less than 11.2 doesn't support gcc12") + message("cuda less than 12.0 doesn't support gcc12") return() endif() From 7c98abd96ec0fbbbf8fc8b36a216541df9f7a4da Mon Sep 17 00:00:00 2001 From: qizhaoaoe <10208099+qizhaoaoe@users.noreply.github.com> Date: Mon, 10 Apr 2023 12:54:23 +0800 Subject: [PATCH 012/156] =?UTF-8?q?=E3=80=90AMP=20OP&Test=E3=80=91instance?= =?UTF-8?q?=5Fnorm=20fp16=20and=20bf16=20support.=20(#52241)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add fp16 and bf16 support for instance_norm * fix /= operator, which does not support bf16 * fix instance_norm_grad kernel and unittests. * fix fp32 unittests. * fix instance_norm_kernel and unittests. * fix instance_norm_grad_kernel and unittest threshold. * add fp16/bf16 for instance_norm_grad_grad op. * add bf16 dtype check. * fix conflicts. * fix cpu support for fp32 op and fix type in instance_norm_grad_kernel. * fix type in instance_norm_kernel. * fix bf16 outputs in unittests and refine code. * fix dx computation. * delete unused params and header includes. * add fp16/bf16 for static graph. * fix device condition for instance_norm op. * fix instance_norm_grad_grad and bf16 op tests. * fix op_test so that the grad of bf16 can be compared with fp32. * remove updates. * add self-defined grad.
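
The kernel changes below all follow one mixed-precision pattern: tensor data stays in the low-precision type T (fp16/bf16), while means, variances, scale/bias and every reduction use the accumulator type AccT = phi::dtype::MPTypeTrait<T>::Type (float when T is fp16/bf16). A minimal sketch of that pattern, with illustrative names that are not part of this patch:

    template <typename T, typename AccT>
    void NormalizeChannel(const T* x, T* y, int n, AccT mean, AccT inv_std,
                          AccT scale, AccT bias) {
      // Widen each element once, do all arithmetic in AccT, and narrow back
      // to T only on the final store, so low-precision rounding happens
      // exactly once per output element.
      for (int i = 0; i < n; ++i) {
        AccT v = (static_cast<AccT>(x[i]) - mean) * inv_std;
        y[i] = static_cast<T>(v * scale + bias);
      }
    }

Keeping a single narrowing cast on the store is also what lets the bf16 unit tests compare against fp32 references with only loosened tolerances.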
--- .../kernels/gpu/instance_norm_grad_kernel.cu | 260 ++++++++++-------- .../phi/kernels/gpu/instance_norm_kernel.cu | 47 +++- paddle/phi/kernels/gpu/instance_norm_utils.h | 12 +- .../unittests/test_instance_norm_op_v2.py | 198 +++++++++++++ .../white_list/op_accuracy_white_list.py | 1 + python/paddle/nn/functional/norm.py | 5 +- python/paddle/static/nn/common.py | 5 +- 7 files changed, 396 insertions(+), 132 deletions(-) diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index 7ffb36d1129975..a121f9fb95b063 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/instance_norm_grad_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -62,12 +61,12 @@ static __global__ void GradComputeDX(const T *dy, } __syncthreads(); for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] = + dx[i] = static_cast( (static_cast>(dy[i]) - dy_sum_val / static_cast>(sample_size) - (static_cast>(x[i]) - mean_val) * dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * - scale[c] * inv_var_val; + scale[c] * inv_var_val); } } @@ -78,14 +77,14 @@ static __device__ __forceinline__ double real_sqrt(double x) { return 1. / sqrt(x); } -template +template __global__ void DoubleGradComputeDX(const T *x, - const T *mean, - const T *variance, + const AccT *mean, + const AccT *variance, const T *ddx, const T *dy, - const T *scale, - const T *ddscale, + const AccT *scale, + const AccT *ddscale, int C, int sample_size, const double epsilon, @@ -95,30 +94,30 @@ __global__ void DoubleGradComputeDX(const T *x, int ncid = blockIdx.x; int c = ncid % C; - T mean_val = mean[ncid]; - T var_val = variance[ncid]; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage ddx_storage; __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T dy_sum_val; - __shared__ T ddx_sum_val; - __shared__ T dy_mul_ddx_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; - - T dy_sum = 0; - T ddx_sum = 0; - T dy_mul_ddx_sum = 0; - T dy_mul_x_sub_mean_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; + __shared__ AccT dy_sum_val; + __shared__ AccT ddx_sum_val; + __shared__ AccT dy_mul_ddx_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT ddx_sum = 0; + AccT dy_mul_ddx_sum = 0; + AccT dy_mul_x_sub_mean_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; - T dy_i = dy[i]; - T tmp = x[i] - mean_val; + AccT ddx_i = static_cast(ddx[i]); + AccT dy_i = static_cast(dy[i]); + AccT tmp = static_cast(x[i]) - mean_val; dy_sum += dy_i; ddx_sum += ddx_i; @@ -148,37 +147,44 @@ __global__ void DoubleGradComputeDX(const T *x, if (ddx != nullptr) { for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += - ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + AccT tmp = static_cast(dx[i]); + tmp += + 
((static_cast<AccT>(x[i]) - mean_val) * var_val * var_val * var_val / + sample_size * (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + 3. * dy_mul_x_sub_mean_sum_val * var_val * ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (dy_sum_val / sample_size - dy[i]) + + var_val * (dy_sum_val / sample_size - static_cast<AccT>(dy[i])) + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * - var_val * (ddx_sum_val / sample_size - ddx[i])) * + var_val * + (ddx_sum_val / sample_size - static_cast<AccT>(ddx[i]))) * scale[c]; + dx[i] = static_cast<T>(tmp); } } __syncthreads(); if (ddscale != nullptr) { for (int i = beg_idx; i < end_idx; i += BlockDim) { - dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - - (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * - var_val / sample_size) * - ddscale[c]; + AccT tmp = static_cast<AccT>(dx[i]); + tmp += (static_cast<AccT>(dy[i]) * var_val - + dy_sum_val / sample_size * var_val - + (static_cast<AccT>(x[i]) - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / sample_size) * + ddscale[c]; + dx[i] = static_cast<T>(tmp); } } } -template <typename T, int BlockDim> +template <typename T, typename AccT, int BlockDim> __global__ void DoubleGradComputeDDY(const T *x, - const T *mean, - const T *variance, - const T *ddscale, - const T *ddbias, + const AccT *mean, + const AccT *variance, + const AccT *ddscale, + const AccT *ddbias, const T *ddx, - const T *scale, + const AccT *scale, int C, int sample_size, const double epsilon, @@ -187,20 +193,20 @@ __global__ void DoubleGradComputeDDY(const T *x, int end_idx = (blockIdx.x + 1) * sample_size; int ncid = blockIdx.x; int c = ncid % C; - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - typedef cub::BlockReduce<T, BlockDim> BlockReduce; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce<AccT, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage ddx_storage; __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; - __shared__ T ddx_sum_val; - __shared__ T ddx_mul_x_sub_mean_sum_val; + __shared__ AccT ddx_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; - T ddx_sum = 0; - T ddx_mul_x_sub_mean_sum = 0; + AccT ddx_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { - T ddx_i = ddx[i]; + AccT ddx_i = static_cast<AccT>(ddx[i]); ddx_sum += ddx_i; - ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast<AccT>(x[i]) - mean_val)); } ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) @@ -212,55 +218,59 @@ __global__ void DoubleGradComputeDDY(const T *x, __syncthreads(); if (ddx != nullptr) { for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += scale[c] * var_val * - (ddx[i] - ddx_sum_val / sample_size - - (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * - var_val / sample_size); + AccT tmp = static_cast<AccT>(ddy[i]); + tmp += scale[c] * var_val * + (static_cast<AccT>(ddx[i]) - ddx_sum_val / sample_size - + (static_cast<AccT>(x[i]) - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size); + ddy[i] = static_cast<T>(tmp); } } __syncthreads(); if (ddscale != nullptr) { for (int i = beg_idx; i < end_idx; i += BlockDim) { - ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + AccT tmp = static_cast<AccT>(ddy[i]); + tmp += (static_cast<AccT>(x[i]) - mean_val) * var_val * ddscale[c]; + ddy[i] = static_cast<T>(tmp); } } __syncthreads(); if (ddbias != nullptr) { for (int i = beg_idx; i < end_idx; i +=
BlockDim) { - ddy[i] += ddbias[c]; + ddy[i] = static_cast<T>(static_cast<AccT>(ddy[i]) + ddbias[c]); } } } -template <typename T, int BlockDim> +template <typename T, typename AccT, int BlockDim> __global__ void DoubleGradComputeDScale(const T *x, - const T *mean, - const T *variance, + const AccT *mean, + const AccT *variance, const T *ddx, const T *dy, int C, int sample_size, const double epsilon, - T *dscale) { + AccT *dscale) { int beg_idx = blockIdx.x * sample_size + threadIdx.x; int end_idx = (blockIdx.x + 1) * sample_size; int ncid = blockIdx.x; int c = ncid % C; - T mean_val = mean[ncid]; - T var_val = variance[ncid]; - typedef cub::BlockReduce<T, BlockDim> BlockReduce; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce<AccT, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage dy_storage; __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; - __shared__ T dy_sum_val; - __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ AccT dy_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; - T dy_sum = 0; - T dy_mul_x_sub_mean_sum = 0; + AccT dy_sum = 0; + AccT dy_mul_x_sub_mean_sum = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { - T dy_i = dy[i]; + AccT dy_i = static_cast<AccT>(dy[i]); dy_sum += dy_i; - dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + dy_mul_x_sub_mean_sum += (dy_i * (static_cast<AccT>(x[i]) - mean_val)); } dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) @@ -272,12 +282,13 @@ __global__ void DoubleGradComputeDScale(const T *x, } __syncthreads(); if (ddx != nullptr) { - T dscale_tmp = 0; + AccT dscale_tmp = 0; for (int i = beg_idx; i < end_idx; i += BlockDim) { - dscale_tmp += ddx[i] * var_val * - (dy[i] - dy_sum_val / sample_size - - dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * var_val * - var_val / sample_size); + dscale_tmp += + static_cast<AccT>(ddx[i]) * var_val * + (static_cast<AccT>(dy[i]) - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (static_cast<AccT>(x[i]) - mean_val) * + var_val * var_val / sample_size); } dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); if (threadIdx.x == 0) { @@ -298,6 +309,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, DenseTensor *d_x, DenseTensor *d_scale, DenseTensor *d_bias) { + using AccT = typename phi::dtype::MPTypeTrait<T>::Type; double epsilon = static_cast<double>(epsilon_f); const auto *scale_ptr = scale.get_ptr(); @@ -313,8 +325,8 @@ void InstanceNormGradKernel(const Context &dev_ctx, dev_ctx.template Alloc<T>(d_x); if (d_scale && d_bias) { - dev_ctx.template Alloc<T>(d_scale); - dev_ctx.template Alloc<T>(d_bias); + dev_ctx.template Alloc<AccT>(d_scale); + dev_ctx.template Alloc<AccT>(d_bias); } if (scale_ptr) { PADDLE_ENFORCE_EQ( @@ -339,7 +351,7 @@ void InstanceNormGradKernel(const Context &dev_ctx, scale_ptr->dims())); } - phi::funcs::SetConstant<GPUContext, T> set_constant; + phi::funcs::SetConstant<GPUContext, AccT> set_constant; const int n = x.numel(); const int block = 512; @@ -350,23 +362,21 @@ void InstanceNormGradKernel(const Context &dev_ctx, DenseTensor scale_tmp; scale_tmp.Resize({NxC}); - dev_ctx.template Alloc<T>(&scale_tmp); + dev_ctx.template Alloc<AccT>(&scale_tmp); DenseTensor d_scale_tmp; d_scale_tmp.Resize({NxC}); - dev_ctx.template Alloc<T>(&d_scale_tmp); + dev_ctx.template Alloc<AccT>(&d_scale_tmp); DenseTensor d_bias_tmp; d_bias_tmp.Resize({NxC}); - dev_ctx.template Alloc<T>(&d_bias_tmp); - + dev_ctx.template Alloc<AccT>(&d_bias_tmp); if (scale_ptr) { - repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>( - scale_ptr->data<T>(), scale_tmp.data<T>(), N, C); + repeat_param<AccT><<<grid, block, 0, dev_ctx.stream()>>>( + 
scale_ptr->data(), scale_tmp.data(), N, C); } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(dev_ctx, &scale_tmp, static_cast(1)); } - std::vector dims; std::vector strides; dims = {1, NxC, H, W, D}; @@ -424,11 +434,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); #endif - const auto *saved_mean_data = saved_mean.template data>(); const auto *saved_var_data = saved_variance.template data>(); + if (d_scale && d_bias) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward( @@ -486,12 +496,11 @@ void InstanceNormGradKernel(const Context &dev_ctx, d_x->data()); } } - if (d_scale && d_bias) { - add_param<<>>( - d_scale_tmp.data(), d_scale->data(), N, C); - add_param<<>>( - d_bias_tmp.data(), d_bias->data(), N, C); + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); } #ifdef PADDLE_WITH_HIP @@ -521,6 +530,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, DenseTensor *dx, DenseTensor *dscale, DenseTensor *ddy) { + using AccT = typename phi::dtype::MPTypeTrait::Type; const auto *Scale = scale.get_ptr(); const auto *ddX = ddx.get_ptr(); const auto *ddScale = ddscale.get_ptr(); @@ -529,11 +539,15 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, const T *x_data = x.data(); const T *dy_data = dy.data(); const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); - const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); - const T *ddbias_data = (ddScale == nullptr ? nullptr : ddBias->data()); - const T *mean_data = saved_mean.data(); - const T *variance_data = saved_variance.data(); + const AccT *ddscale_data = + (ddScale == nullptr ? nullptr : ddScale->data()); + const AccT *ddbias_data = + (ddScale == nullptr ? nullptr : ddBias->data()); + const AccT *mean_data = saved_mean.data(); + const AccT *variance_data = saved_variance.data(); phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero_AccT; + auto &x_dims = x.dims(); int N, C, H, W, D; funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); @@ -544,10 +558,10 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, DenseTensor scale_tmp; if (!Scale) { scale_tmp.Resize({C}); - dev_ctx.template Alloc(&scale_tmp); - set_zero(dev_ctx, &scale_tmp, static_cast(1)); + dev_ctx.template Alloc(&scale_tmp); + set_zero_AccT(dev_ctx, &scale_tmp, static_cast(1)); } - const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); + const AccT *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); const int block = 512; int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); @@ -557,7 +571,7 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, if (dx) { T *dx_data = dev_ctx.template Alloc(dx); set_zero(dev_ctx, dx, static_cast(0)); - DoubleGradComputeDX + DoubleGradComputeDX <<>>(x_data, mean_data, variance_data, @@ -573,13 +587,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, if (dscale) { DenseTensor dscale_tmp; dscale_tmp.Resize({NxC}); - dev_ctx.template Alloc(&dscale_tmp); - set_zero(dev_ctx, &dscale_tmp, static_cast(0)); - T *dscale_tmp_data = dscale_tmp.data(); + dev_ctx.template Alloc(&dscale_tmp); + set_zero_AccT(dev_ctx, &dscale_tmp, static_cast(0)); + AccT *dscale_tmp_data = dscale_tmp.data(); - T *dscale_data = dev_ctx.template Alloc(dscale); - set_zero(dev_ctx, dscale, static_cast(0)); - DoubleGradComputeDScale + AccT *dscale_data = dev_ctx.template Alloc(dscale); + set_zero_AccT(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale <<>>(x_data, mean_data, variance_data, @@ -589,13 +603,13 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, sample_size, epsilon, dscale_tmp_data); - add_param<<>>( - dscale_tmp.data(), dscale->data(), N, C); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); } if (ddy) { T *ddy_data = dev_ctx.template Alloc(ddy); set_zero(dev_ctx, ddy, static_cast(0)); - DoubleGradComputeDDY + DoubleGradComputeDDY <<>>(x_data, mean_data, variance_data, @@ -613,24 +627,48 @@ void InstanceNormDoubleGradKernel(const Context &dev_ctx, #ifdef PADDLE_WITH_HIP // MIOPEN do not support double -PD_REGISTER_KERNEL( - instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float) {} +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + phi::dtype::float16) {} PD_REGISTER_KERNEL(instance_norm_double_grad, GPU, ALL_LAYOUT, phi::InstanceNormDoubleGradKernel, - float) {} + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(instance_norm_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(instance_norm_double_grad, + GPU, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(instance_norm_grad, GPU, ALL_LAYOUT, phi::InstanceNormGradKernel, float, - double) {} + double, + phi::dtype::float16) {} PD_REGISTER_KERNEL(instance_norm_double_grad, GPU, ALL_LAYOUT, phi::InstanceNormDoubleGradKernel, float, - double) {} + double, + phi::dtype::float16) {} #endif diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index b842ce61dc3eb5..d4f421e62ddb92 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -33,6 +33,7 @@ void InstanceNormKernel(const Context &dev_ctx, DenseTensor *y, DenseTensor *saved_mean, DenseTensor *saved_variance) { + using AccT = typename phi::dtype::MPTypeTrait::Type; double epsilon = static_cast(epsilon_f); auto &x_dims = x.dims(); PADDLE_ENFORCE_GE(x_dims.size(), @@ -113,10 +114,10 @@ void InstanceNormKernel(const Context &dev_ctx, DenseTensor scale_tmp; scale_tmp.Resize({NxC}); - dev_ctx.template Alloc(&scale_tmp); + dev_ctx.template Alloc(&scale_tmp); DenseTensor bias_tmp; bias_tmp.Resize({NxC}); - dev_ctx.template 
Alloc(&bias_tmp); + dev_ctx.template Alloc(&bias_tmp); const int n = x.numel(); const int block = 512; @@ -124,24 +125,25 @@ void InstanceNormKernel(const Context &dev_ctx, const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min((NxC + block - 1) / block, max_blocks); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; if (scale_ptr) { - repeat_param<<>>( - scale_ptr->data(), scale_tmp.data(), N, C); + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); } else { - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(dev_ctx, &scale_tmp, static_cast(1)); } if (bias_ptr) { - repeat_param<<>>( - bias_ptr->data(), bias_tmp.data(), N, C); + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); } else { - set_constant(dev_ctx, &bias_tmp, static_cast(0)); + set_constant(dev_ctx, &bias_tmp, static_cast(0)); } auto handle = dev_ctx.cudnn_handle(); DenseTensor saved_mean_tmp, saved_variance_tmp; phi::funcs::SetConstant> functor; + if (saved_mean) { dev_ctx.template Alloc>(saved_mean); functor(dev_ctx, saved_mean, static_cast>(0)); @@ -156,7 +158,6 @@ void InstanceNormKernel(const Context &dev_ctx, saved_variance_tmp = phi::Full>( dev_ctx, {NxC}, static_cast>(0)); } - auto *saved_mean_data = saved_mean ? saved_mean->data>() : saved_mean_tmp.data>(); @@ -225,9 +226,27 @@ void InstanceNormKernel(const Context &dev_ctx, #ifdef PADDLE_WITH_HIP // MIOPEN do not support double -PD_REGISTER_KERNEL( - instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +PD_REGISTER_KERNEL(instance_norm, + GPU, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + phi::dtype::float16) {} +#elif CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(instance_norm, + GPU, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else -PD_REGISTER_KERNEL( - instance_norm, GPU, ALL_LAYOUT, phi::InstanceNormKernel, float, double) {} +PD_REGISTER_KERNEL(instance_norm, + GPU, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + double, + phi::dtype::float16) {} #endif diff --git a/paddle/phi/kernels/gpu/instance_norm_utils.h b/paddle/phi/kernels/gpu/instance_norm_utils.h index e52fe868c39ec5..865ab91da7b1b3 100644 --- a/paddle/phi/kernels/gpu/instance_norm_utils.h +++ b/paddle/phi/kernels/gpu/instance_norm_utils.h @@ -27,6 +27,7 @@ namespace cub = hipcub; #endif #include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/amp_type_traits.h" namespace phi { @@ -51,22 +52,23 @@ static __global__ void add_param(const T *input, T *output, const int repeat_num, const int C) { - typedef cub::BlockReduce BlockReduce; + using MPType = typename phi::dtype::MPTypeTrait::Type; + typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage ou_storage; for (int i = blockIdx.x; i < C; i += gridDim.x) { - T ou = static_cast(0); + MPType ou = static_cast(0); for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { const int index = j * C + i; - ou += static_cast(input[index]); + ou = ou + static_cast(input[index]); } ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); if (threadIdx.x == 0) { - output[i] = ou; + output[i] = static_cast(ou); } __syncthreads(); if (AVG) { - output[i] /= repeat_num; + output[i] = static_cast(static_cast(output[i]) / repeat_num); } } } diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index 6dd462fda43fbd..d214965b2dd6e6 100644 --- 
a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid @@ -121,5 +122,202 @@ def compute_v2(x_np): np.testing.assert_allclose(y1, y2, rtol=1e-05) +def instance_norm_warpper( + input, weight, bias, epsilon=1e-5, momentum=0.9, data_format='NCHW' +): + if data_format == "AnyLayout": + data_format = "NCDHW" + return paddle._C_ops.instance_norm( + input, weight, bias, epsilon, momentum, data_format + ) + + +def _reference_instance_norm(x, scale, bias, epsilon): + N, C, H, W = x.shape + mean = np.mean(x, axis=(2, 3), keepdims=True) + variance = np.var(x, axis=(2, 3), keepdims=True) + std = np.sqrt(variance) + epsilon + x_norm = (x - mean) / std + scale = scale.reshape([1, C, 1, 1]) + bias = bias.reshape([1, C, 1, 1]) + x_norm = scale * x_norm + bias + return x_norm, mean.reshape(N * C), std.reshape(N * C) + + +def _reference_instance_norm_grad(x, scale, mean, var): + n, c, h, w = x.shape + d_y = np.ones(x.shape) / (np.prod(x.shape)) + d_bias = np.ones((c,)) / c + + mean_tile = np.reshape(mean, (n, c, 1, 1)) + mean_tile = np.tile(mean_tile, (1, 1, h, w)) + var_tile = np.reshape(var, (n, c, 1, 1)) + var_tile = np.tile(var_tile, (1, 1, h, w)) + + d_scale = np.sum(d_y * (x - mean_tile) * var_tile, axis=(0, 2, 3)) + var_inv = var_tile + scale_tile = np.reshape(scale, (1, c, 1, 1)) + scale_tile = np.tile(scale_tile, (n, 1, h, w)) + + d_x = ( + scale_tile + * var_inv + * ( + d_y + - np.mean(d_y, axis=(2, 3), keepdims=True) + - (x - mean_tile) + * var_inv + * np.mean( + d_y * (x - mean_tile) * var_inv, axis=(2, 3), keepdims=True + ) + ) + ) + + return d_x, d_scale, d_bias + + +class TestInstanceNormFP32OP(OpTest): + def setUp(self): + '''Test instance_norm op with default value''' + self.op_type = "instance_norm" + self.__class__.op_type = self.op_type + self.python_api = instance_norm_warpper + self.data_format = "NCHW" + self.eps = 1e-5 + self.init_dtype() + self.init_shape() + self.init_value() + self.set_err_thre() + self.inputs = {'X': self.value, 'Scale': self.scale, 'Bias': self.bias} + self.attrs = { + 'epsilon': self.eps, + 'momentum': 0.9, + 'data_format': self.data_format, + } + y, mean, variance = _reference_instance_norm( + self.value, self.scale, self.bias, self.eps + ) + self.python_out_sig = ['Y'] + self.outputs = { + 'Y': y, + 'SavedMean': mean, + 'SavedVariance': 1.0 / variance, + } + + def test_check_output(self): + self.check_output(atol=self.atol) + + def test_check_grad(self): + self.check_grad( + ['X', 'Scale', 'Bias'], + 'Y', + ) + + def init_dtype(self): + self.dtype = np.float32 + + def init_shape(self): + self.shape = [4, 100, 4, 4] + + def init_value(self): + np.random.seed(0) + self.value = np.random.random(self.shape).astype(self.dtype) + self.scale = np.random.random([self.shape[1]]).astype(np.float32) + self.bias = np.random.random([self.shape[1]]).astype(np.float32) + + def set_err_thre(self): + self.atol = 1e-3 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the float16", +) +class TestInstanceNormFP16OP(TestInstanceNormFP32OP): + def init_dtype(self): + self.dtype = np.float16 + + def set_err_thre(self): + self.atol = 0.03125 + self.max_relative_error = 8e-3 + + def test_check_output(self): + place = core.CUDAPlace(0) + 
self.check_output_with_place(place, atol=self.atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X', 'Scale', 'Bias'], + 'Y', + max_relative_error=self.max_relative_error, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestInstanceNormBF16OP(OpTest): + def setUp(self): + self.op_type = "instance_norm" + self.__class__.op_type = self.op_type + self.python_api = instance_norm_warpper + self.eps = 1e-5 + self.data_format = "NCHW" + self.dtype = np.uint16 + self.init_shape() + self.init_value() + + y, mean, variance = _reference_instance_norm( + self.value, self.scale, self.bias, self.eps + ) + var_inv = 1.0 / variance + self.user_defined_grads = _reference_instance_norm_grad( + self.value, self.scale, mean, var_inv + ) + self.python_out_sig = ['Y'] + self.outputs = { + 'Y': convert_float_to_uint16(y), + 'SavedMean': mean, + 'SavedVariance': var_inv, + } + self.inputs = { + 'X': convert_float_to_uint16(self.value), + 'Scale': self.scale, + 'Bias': self.bias, + } + self.attrs = { + 'epsilon': self.eps, + 'momentum': 0.9, + 'data_format': self.data_format, + } + + def init_value(self): + np.random.seed(0) + self.value = np.random.random(self.shape).astype(np.float32) + self.scale = np.random.random([self.shape[1]]).astype(np.float32) + self.bias = np.random.random([self.shape[1]]).astype(np.float32) + + def init_shape(self): + self.shape = [4, 100, 4, 4] + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X', 'Scale', 'Bias'], + 'Y', + user_defined_grads=self.user_defined_grads, + ) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py index ced30722cf2792..d7613f7b284e81 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py @@ -14,6 +14,7 @@ # For op in NO_FP64_CHECK_GRAD_OP_LIST, the op test requires check_grad with fp64 precision NO_FP64_CHECK_GRAD_OP_LIST = [ + 'instance_norm', 'affine_grid', 'clip', 'conv2d', diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 2f71f137f43b27..95e1ca2504cd97 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -426,7 +426,10 @@ def instance_norm( return out else: check_variable_and_dtype( - x, 'input', ['float32', 'float64'], "InstanceNorm" + x, + 'input', + ['float32', 'float64', 'float16', 'uint16'], + "InstanceNorm", ) attrs = { diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index 2e0d3b2289c290..a3ae723c6273c7 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -306,7 +306,10 @@ def instance_norm( hidden2 = paddle.static.nn.instance_norm(hidden1) """ check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'instance_norm' + input, + 'input', + ['uint16', 'float16', 'float32', 'float64'], + 'instance_norm', ) if param_attr is False: assert ( From 70eaf9de2c7876f4ad46e08461613b17ffdf16c6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 10 Apr 2023 12:55:35 +0800 Subject: [PATCH 013/156] 
update (#51297) --- .../fluid/tests/unittests/test_reduce_op.py | 262 +++++++++--------- 1 file changed, 128 insertions(+), 134 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 72d1f4f2d6963e..01b25b543117c8 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -25,15 +25,31 @@ class TestSumOp(OpTest): def setUp(self): + self.init_dtype() + self.init_input() + self.init_attrs() + self.calc_output() + self.python_api = paddle.sum self.public_python_api = paddle.sum self.op_type = "reduce_sum" self.prim_op_type = "prim" - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")} - self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - self.attrs = {'dim': [0]} + self.inputs = {'X': self.x} + self.outputs = {'Out': self.out} self.enable_cinn = True + def init_dtype(self): + self.dtype = np.float64 + + def init_input(self): + self.x = np.random.random((5, 6, 10)).astype(self.dtype) + + def init_attrs(self): + self.attrs = {'dim': [0]} + + def calc_output(self): + self.out = self.x.sum(axis=tuple(self.attrs['dim'])) + def test_check_output(self): self.check_output() @@ -41,51 +57,42 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', check_prim=True) -class TestSumOpFp32(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.inputs = { - 'X': np.random.uniform(0, 0.1, (5, 6, 10)).astype("float16") - } - self.attrs = {'dim': [0, 1, 2]} - self.outputs = { - 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) - } - self.gradient = self.calc_gradient() - self.enable_cinn = True +class TestSumOp_ZeroDim(TestSumOp): + def init_attrs(self): + self.attrs = {'dim': [], 'reduce_all': True} - def test_check_output(self): - self.check_output() + def init_input(self): + self.x = np.random.random([]).astype(self.dtype) - def calc_gradient(self): - x = self.inputs["X"] - grad = np.ones(x.shape, dtype=x.dtype) - return (grad,) + def calc_output(self): + self.out = self.x.sum(axis=None) def test_check_grad(self): - self.check_grad( - ['X'], - 'Out', - user_defined_grads=self.gradient, - check_prim=True, - ) + self.check_grad(['X'], 'Out') -class TestSumOp_ZeroDim(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.inputs = {'X': np.random.random([]).astype("float64")} - self.outputs = {'Out': self.inputs['X'].sum(axis=None)} - self.attrs = {'dim': [], 'reduce_all': True} - # reduce doesn't support float64 in cinn. 
- # 0-D tensor doesn't support in cinn - self.enable_cinn = False +class TestSumOp5D(TestSumOp): + def init_input(self): + self.x = np.random.random((1, 2, 5, 6, 10)).astype(self.dtype) + + def init_attrs(self): + self.attrs = {'dim': [0]} + + +class TestSumOp6D(TestSumOp): + def init_input(self): + self.x = np.random.random((1, 1, 2, 5, 6, 10)).astype(self.dtype) + + def init_attrs(self): + self.attrs = {'dim': [0]} + + +class TestSumOp8D(TestSumOp): + def init_input(self): + self.x = np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype(self.dtype) + + def init_attrs(self): + self.attrs = {'dim': (0, 3)} def test_check_output(self): self.check_output() @@ -94,64 +101,38 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSumOp_bf16(OpTest): - def setUp(self): - np.random.seed(100) - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.dtype = np.uint16 - self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32) - self.attrs = {'dim': [0, 1, 2]} - self.out = self.x.sum(axis=tuple(self.attrs['dim'])) - self.gradient = self.calc_gradient() +class TestSumOp_withInt(TestSumOp): + def init_input(self): + # ref to https://en.wikipedia.org/wiki/Half-precision_floating-point_format + # Precision limitations on integer values between 0 and 2048 can be exactly represented + self.x = np.random.randint(0, 30, (10, 10)).astype(self.dtype) - self.inputs = {'X': convert_float_to_uint16(self.x)} - self.outputs = {'Out': convert_float_to_uint16(self.out)} - self.gradient = self.calc_gradient() - self.enable_cinn = False + def init_attrs(self): + self.attrs = {'dim': (0, 1)} def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=0.1) + self.check_output() + + def calc_gradient(self): + x = self.inputs["X"] + grad = np.ones(x.shape, dtype=x.dtype) + return (grad,) def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, + self.check_grad( ['X'], 'Out', - user_defined_grads=self.gradient, + user_defined_grads=self.calc_gradient(), check_prim=True, ) - def calc_gradient(self): - x = self.x - grad = np.ones(x.shape, dtype=x.dtype) - return [grad] +class TestSumOp3Dim(TestSumOp): + def init_input(self): + self.x = np.random.uniform(0, 0.1, (5, 6, 10)).astype(self.dtype) -class TestSumOp_fp16_withInt(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.inputs = { - # ref to https://en.wikipedia.org/wiki/Half-precision_floating-point_format - # Precision limitations on integer values between 0 and 2048 can be exactly represented - 'X': np.random.randint(0, 30, (10, 10)).astype("float16") - } - self.attrs = {'dim': [0, 1]} - self.outputs = { - 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) - } - self.gradient = self.calc_gradient() - self.enable_cinn = True + def init_attrs(self): + self.attrs = {'dim': (0, 1, 2)} def test_check_output(self): self.check_output() @@ -165,66 +146,79 @@ def test_check_grad(self): self.check_grad( ['X'], 'Out', - user_defined_grads=self.gradient, + user_defined_grads=self.calc_gradient(), check_prim=True, ) -class TestSumOp5D(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.inputs 
= { - 'X': np.random.random((1, 2, 5, 6, 10)).astype("float64") - } - self.attrs = {'dim': [0]} - self.outputs = {'Out': self.inputs['X'].sum(axis=0)} - # error occurred in cinn - self.enable_cinn = True +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestSumOpFp16(parent): + def init_dtype(self): + self.dtype = np.float16 - def test_check_output(self): - self.check_output() + def test_check_output(self): + self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True) + def test_check_grad(self): + self.check_grad( + ['X'], + 'Out', + check_prim=True, + ) -class TestSumOp6D(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.public_python_api = paddle.sum - self.op_type = "reduce_sum" - self.prim_op_type = "prim" - self.inputs = { - 'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float64") - } - self.attrs = {'dim': [0]} - self.outputs = {'Out': self.inputs['X'].sum(axis=0)} +create_test_fp16_class(TestSumOp) +create_test_fp16_class(TestSumOp_ZeroDim) +create_test_fp16_class(TestSumOp5D) +create_test_fp16_class(TestSumOp6D) +create_test_fp16_class(TestSumOp8D) +create_test_fp16_class(TestSumOp_withInt) +create_test_fp16_class(TestSumOp3Dim) - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out', check_prim=True) +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestSumOpBf16(parent): + def setUp(self): + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.enable_cinn = False + def init_dtype(self): + self.dtype = np.uint16 -class TestSumOp8D(OpTest): - def setUp(self): - self.python_api = paddle.sum - self.op_type = "reduce_sum" - self.inputs = { - 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") - } - self.attrs = {'dim': (0, 3)} - self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))} + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) - def test_check_output(self): - self.check_output() + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + user_defined_grads=self.gradient, + check_prim=True, + ) - def test_check_grad(self): - self.check_grad(['X'], 'Out') + def calc_gradient(self): + x = self.x + grad = np.ones(x.shape, dtype=x.dtype) + return [grad] + + +create_test_bf16_class(TestSumOp) +create_test_bf16_class(TestSumOp_ZeroDim) +create_test_bf16_class(TestSumOp5D) +create_test_bf16_class(TestSumOp6D) +create_test_bf16_class(TestSumOp8D) +create_test_bf16_class(TestSumOp_withInt) +create_test_bf16_class(TestSumOp3Dim) @skip_check_grad_ci( From 6bd5fd752662d276e4e53e6d30eae1941377fde7 Mon Sep 17 00:00:00 2001 From: Vvsmile <17864154871@163.com> Date: Mon, 10 Apr 2023 12:55:49 +0800 Subject: [PATCH 014/156] [AMP OP&Test] Add fp16 and bf16 test to activation (#52521) * adjust defalut tolerance of output and grad * fix a bug in the grad of OpTest * fix the type of setting defalut value in optest, both forward and backward * add defalut * fix test_sum_op * adjust tolerance * fix the tolerance of eager * add bf16 and fp16 to the activation tests * remove some fixs * fix activation * fix fp16 * fix gelu * fix the activation tests * add bfloat16 specialization to singrad and cosgrad * fix bugs * fix bugs * add 
unittest * add skip * add fp/bf to rrelu/rrelu_grad * git add rrelu * fix bugs --- paddle/phi/kernels/funcs/activation_functor.h | 22 +- .../phi/kernels/gpu/activation_grad_kernel.cu | 15 +- paddle/phi/kernels/gpu/rrelu_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/rrelu_kernel.cu | 1 + .../fluid/tests/unittests/eager_op_test.py | 28 +- .../tests/unittests/test_activation_op.py | 290 +++++++++++++----- .../fluid/tests/unittests/test_rrelu_op.py | 73 ++++- 7 files changed, 339 insertions(+), 91 deletions(-) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 7c48c0a02b413f..78a1f8cb24f852 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -70,6 +70,13 @@ struct Sine { } }; +template <> +struct Sine { + HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const { + return dtype::bfloat16(sin(static_cast(val))); + } +}; + template struct Cosine { HOSTDEVICE T operator()(const T& val) const { return cos(val); } @@ -82,6 +89,13 @@ struct Cosine { } }; +template <> +struct Cosine { + HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16& val) const { + return dtype::bfloat16(cos(static_cast(val))); + } +}; + // sine'(x) = cos(x) template struct SinGradFunctor : public BaseActivationFunctor { @@ -2664,10 +2678,12 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { template struct CudaReciprocalFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); - // reciprocal(x) = 1 / x - __device__ __forceinline__ T operator()(const T x) const { return one / x; } + __device__ __forceinline__ T operator()(const T x) const { + return static_cast(one / static_cast(x)); + } }; template diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 5573d666776b7f..e56c3cf4f42000 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -425,7 +425,8 @@ PD_REGISTER_KERNEL(sin_double_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(sin_triple_grad, GPU, @@ -435,7 +436,8 @@ PD_REGISTER_KERNEL(sin_triple_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(cos_double_grad, GPU, @@ -445,7 +447,8 @@ PD_REGISTER_KERNEL(cos_double_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(cos_triple_grad, GPU, @@ -455,7 +458,8 @@ PD_REGISTER_KERNEL(cos_triple_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(softsign_grad, SoftsignGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) @@ -473,7 +477,8 @@ PD_REGISTER_KERNEL(log_double_grad, phi::LogDoubleGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(hardswish_grad, HardSwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(round_grad, RoundGradKernel) diff --git a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu index fa9ef450307855..361e4c28e16b80 100644 --- 
a/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/rrelu_grad_kernel.cu @@ -83,4 +83,5 @@ PD_REGISTER_KERNEL(rrelu_grad, phi::RReluGradKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/paddle/phi/kernels/gpu/rrelu_kernel.cu b/paddle/phi/kernels/gpu/rrelu_kernel.cu index e872cbf3cb6553..b15e525a3bcce3 100644 --- a/paddle/phi/kernels/gpu/rrelu_kernel.cu +++ b/paddle/phi/kernels/gpu/rrelu_kernel.cu @@ -110,4 +110,5 @@ PD_REGISTER_KERNEL(rrelu, phi::RReluKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index b764a1acd0a966..9b0868edfa7ecb 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -885,7 +885,9 @@ def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place): np_dyg, rtol=1e-05, equal_nan=False, - err_msg='Output (' + err_msg='Operator (' + + self.op_type + + ') Output (' + name + ') has diff at ' + str(place) @@ -1137,7 +1139,9 @@ def _compare_expect_and_actual_outputs( actual_out, rtol=1e-05, atol=inplace_atol, - err_msg='Output (' + err_msg='Operator (' + + self.op_type + + ') Output (' + name + ') has diff at ' + str(place) @@ -1626,7 +1630,9 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, err_msg=( - "Output (" + "Operator (" + + self.op_type + + ") Output (" + name + ") has diff at " + str(place) @@ -1643,7 +1649,9 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, ), - "Output (" + "Operator (" + + self.op_type + + ") Output (" + name + ") has diff at " + str(place) @@ -1815,7 +1823,9 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, err_msg=( - "Output (" + "Operator (" + + self.op_type + + ") Output (" + name + ") has diff at " + str(place) @@ -1832,7 +1842,9 @@ def _compare_numpy(self, name, actual_np, expect_np): rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, ), - "Output (" + "Operator (" + + self.op_type + + ") Output (" + name + ") has diff at " + str(place) @@ -1882,7 +1894,9 @@ def _compare_list(self, name, actual, expect): .get_tensor() .recursive_sequence_lengths(), expect[1], - "Output (" + "Operator (" + + self.op_type + + ") Output (" + name + ") has different lod at " + str(place) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 8656c564136050..dfa95f760ce6aa 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -63,6 +63,8 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def test_check_output(self): self.check_output() @@ -83,6 +85,9 @@ def init_shape(self): def init_kernel_type(self): pass + def convert_input_output(self): + pass + class TestActivation_ZeroDim(TestActivation): def init_shape(self): @@ -148,6 +153,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -247,6 +253,8 @@ def setUp(self): self.inputs = {'X': 
OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def init_dtype(self): self.dtype = np.float32 @@ -320,10 +328,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = x / (np.exp(-x) + 1) - - self.inputs = {'X': x} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def init_dtype(self): self.dtype = np.float32 @@ -401,10 +410,11 @@ def setUp(self): np.random.seed(2048) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = np.log(1 / (1 + np.exp(-x))) - - self.inputs = {'X': x} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def test_check_grad(self): if self.dtype == np.float16: return @@ -479,9 +489,9 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) out = np.tanh(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -581,6 +591,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -625,10 +636,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) out = np.sinh(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def test_check_grad(self): if self.dtype == np.float16: return @@ -716,10 +728,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) out = np.cosh(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def test_check_grad(self): if self.dtype == np.float16: return @@ -812,10 +825,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(10, 20, self.shape).astype(self.dtype) out = ref_tanhshrink(x) - - self.inputs = {'X': x} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + def test_check_grad(self): if self.dtype == np.float16: return @@ -895,10 +909,12 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) * 10 out = ref_hardshrink(x, self.threshold) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} self.attrs = {'threshold': self.threshold} - self.inputs = {'X': x} - self.outputs = {'Out': out} + + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1067,10 +1083,12 @@ def setUp(self): np.random.seed(1023) x = np.random.uniform(0.25, 10, self.shape).astype(self.dtype) out = ref_softshrink(x, threshold) - self.inputs = {'X': x} - self.attrs = {"lambda": threshold} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.attrs = {"lambda": threshold} + def test_check_grad(self): if self.dtype == np.float16: return @@ -1154,6 +1172,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() self.enable_cinn = False # TODO(wanghao107) add prim test @@ -1266,6 +1285,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() 
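The convert_input_output() hook threaded through these setUp methods is what lets the low-precision subclasses reuse the fp32 test bodies: inputs and outputs are built as float32 numpy arrays, and the BF16 subclasses override the hook to reinterpret them as bfloat16 bit patterns stored in uint16. A minimal standalone sketch of that conversion, assuming round-to-nearest-even (the convert_float_to_uint16 helper imported from eager_op_test may round differently):

import numpy as np

def float32_to_bfloat16_bits(x):
    # bfloat16 keeps the top 16 bits of an IEEE-754 float32, so the
    # conversion is a rounding step followed by a 16-bit right shift.
    u = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    bias = ((u >> 16) & np.uint32(1)) + np.uint32(0x7FFF)  # half to even
    return ((u + bias) >> 16).astype(np.uint16)

x = np.random.random([2, 3]).astype(np.float32)
print(float32_to_bfloat16_bits(x))  # the uint16 payload fed to the op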
self.enable_cinn = True def test_check_grad(self): @@ -1320,6 +1340,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() self.enable_cinn = True def init_shape(self): @@ -1368,6 +1389,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [4, 25] @@ -1396,6 +1418,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1425,6 +1448,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1496,9 +1520,9 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = np.cos(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1534,6 +1558,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x_np)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1603,6 +1628,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1632,9 +1658,9 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = np.sin(x) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1663,6 +1689,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1691,6 +1718,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1719,6 +1747,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1747,6 +1776,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1775,6 +1805,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -1799,20 +1830,14 @@ def setUp(self): self.skip_cinn() np.random.seed(1024) - if self.dtype == np.uint16: - x = np.random.uniform(-1, 1, self.shape).astype(np.float32) - # The same reason with TestAbs - x[np.abs(x) < 0.005] = 0.02 - out = convert_float_to_uint16(np.maximum(x, 0)) - self.inputs = {'X': convert_float_to_uint16(x)} - else: - x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) - # The same reason with TestAbs - x[np.abs(x) < 0.005] = 0.02 - out = np.maximum(x, 0) - self.inputs = {'X': x} + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + self.inputs = {'X': x} self.outputs = {'Out': out} + 
self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -1921,6 +1946,7 @@ def setUp(self): self.inputs = {'X': x} self.outputs = {'Out': out} self.attrs = {'alpha': alpha} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2066,8 +2092,9 @@ def setUp(self): out = gelu(x, approximate) self.if_enable_cinn() - self.inputs = {'X': x} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() self.attrs = {"approximate": approximate} # The backward decomposite of gelu is inconsistent with raw kernel on # cpu, lower threshold to support 1e-8 for pass the unittest @@ -2175,8 +2202,9 @@ def setUp(self): t[t > t_max] = t_max self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {'t_min': t_min, 't_max': t_max} self.outputs = {'Out': t} + self.convert_input_output() + self.attrs = {'t_min': t_min, 't_max': t_max} def test_check_grad(self): if self.dtype == np.float16: @@ -2203,9 +2231,11 @@ def setUp(self): x[np.abs(x) < 0.005] = 0.02 out = ref_relu6(x) - self.inputs = {'X': x} self.attrs = {'threshold': 6.0} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -2338,9 +2368,10 @@ def setUp(self): x[np.abs(x - threshold + offset) < 0.005] = threshold - offset + 0.02 out = ref_hardswish(x, threshold, scale, offset) - self.inputs = {'X': x} - self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + self.attrs = {'threshold': threshold, 'scale': scale, 'offset': offset} self.enable_cinn = False def init_shape(self): @@ -2450,8 +2481,9 @@ def setUp(self): out = np.log(np.exp(t) + 1) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {'threshold': threshold} self.outputs = {'Out': out} + self.convert_input_output() + self.attrs = {'threshold': threshold} def test_check_output(self): self.check_output(check_dygraph=False) @@ -2482,9 +2514,11 @@ def setUp(self): out = elu(x, alpha) # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. 
alpha = 1) # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here - self.inputs = {'X': x} - self.attrs = {'alpha': alpha} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + self.attrs = {'alpha': alpha} def init_shape(self): self.shape = [10, 12] @@ -2597,9 +2631,11 @@ def setUp(self): x = np.random.uniform(-3, 3, self.shape).astype(self.dtype) alpha = 1.5 out = celu(x, alpha) - self.inputs = {'X': x} - self.attrs = {'alpha': alpha} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() + self.attrs = {'alpha': alpha} def init_shape(self): self.shape = [10, 12] @@ -2696,6 +2732,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2730,6 +2767,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2782,6 +2820,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2844,6 +2883,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2909,6 +2949,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -2981,6 +3022,7 @@ def setUp(self): self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -3040,8 +3082,9 @@ def setUp(self): out = np.power(x, 3) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.attrs = {'factor': 3.0} self.outputs = {'Out': out} + self.attrs = {'factor': 3.0} + self.convert_input_output() def test_check_output(self): self.check_output(check_prim=True) @@ -3142,9 +3185,10 @@ def setUp(self): # The same reason with TestAbs out = ref_stanh(x, scale_a, scale_b) - self.inputs = {'X': x} - self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.attrs = {'scale_a': scale_a, 'scale_b': scale_b} + self.convert_input_output() def test_check_grad(self): if self.dtype == np.float16: @@ -3380,8 +3424,10 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = ref_softsign(x) - self.inputs = {'X': x} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -3465,9 +3511,11 @@ def setUp(self): x = np.random.uniform(-20, 20, self.shape).astype(self.dtype) x[np.abs(x) < 0.005] = 0.02 out = ref_thresholded_relu(x, threshold) - self.inputs = {'X': x} - self.attrs = {"threshold": threshold} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.attrs = {"threshold": threshold} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -3561,8 +3609,10 @@ def setUp(self): out = ref_hardsigmoid(x, self.slope, self.offset) 
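For reference, the ref_hardsigmoid used here is the piecewise-linear sigmoid approximation; a sketch assuming the slope=1/6 and offset=0.5 defaults this file passes in (the function name below is illustrative, not the test module's):

import numpy as np

def hardsigmoid_sketch(x, slope=0.166666666666667, offset=0.5):
    # clip(slope * x + offset, 0, 1): linear around zero, saturating at
    # 0 and 1, so it is cheap and easy to match at fp16/bf16 precision.
    return np.clip(x * slope + offset, 0.0, 1.0).astype(x.dtype)

print(hardsigmoid_sketch(np.linspace(-4.0, 4.0, 9, dtype=np.float32)))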
self.attrs = {'slope': self.slope, 'offset': self.offset} - self.inputs = {'X': x} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -3666,9 +3716,11 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = ref_swish(x) - self.inputs = {'X': x} - self.attrs = {'beta': 1.0} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.attrs = {'beta': 1.0} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -3764,8 +3816,10 @@ def setUp(self): np.random.seed(1024) x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) out = ref_mish(x) - self.inputs = {'X': x} + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} + self.convert_input_output() def init_shape(self): self.shape = [10, 12] @@ -3872,7 +3926,7 @@ def create_test_act_fp16_class( check_dygraph=True, check_prim=False, enable_cinn=True, - grad_atol=0.80, + grad_atol=1e-2, **kwargs ): @unittest.skipIf( @@ -3914,7 +3968,7 @@ def test_check_grad(self): max_relative_error=grad_atol, ) - cls_name = "{}_{}".format(parent.__name__, "fp16") + cls_name = "{}_{}".format(parent.__name__, "FP16OP") TestActFp16.__name__ = cls_name globals()[cls_name] = TestActFp16 @@ -3933,17 +3987,17 @@ def test_check_grad(self): create_test_act_fp16_class(TestAbs, check_prim=True) create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestFloor, check_prim=True, grad_check=False) -create_test_act_fp16_class(TestCos, grad_atol=0.85) -create_test_act_fp16_class(TestTan, grad_atol=0.85) -create_test_act_fp16_class(TestCosh, grad_atol=0.85) -create_test_act_fp16_class(TestAcos, grad_atol=0.85) +create_test_act_fp16_class(TestCos) +create_test_act_fp16_class(TestTan) +create_test_act_fp16_class(TestCosh) +create_test_act_fp16_class(TestAcos) create_test_act_fp16_class(TestSin) create_test_act_fp16_class(TestSinh) create_test_act_fp16_class(TestAsin) create_test_act_fp16_class(TestAtan) -create_test_act_fp16_class(TestAcosh, grad_atol=0.85) -create_test_act_fp16_class(TestAsinh, grad_atol=0.85) -create_test_act_fp16_class(TestAtanh, grad_atol=0.85) +create_test_act_fp16_class(TestAcosh) +create_test_act_fp16_class(TestAsinh) +create_test_act_fp16_class(TestAtanh) create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRelu, check_prim=True) create_test_act_fp16_class( @@ -3955,38 +4009,63 @@ def test_check_grad(self): ) create_test_act_fp16_class(TestBRelu) create_test_act_fp16_class(TestRelu6) -create_test_act_fp16_class(TestSoftRelu, check_dygraph=False, grad_atol=0.85) +create_test_act_fp16_class(TestSoftRelu, check_dygraph=False) create_test_act_fp16_class(TestELU) create_test_act_fp16_class(TestCELU) create_test_act_fp16_class(TestReciprocal) create_test_act_fp16_class(TestLog, check_prim=True) if core.is_compiled_with_rocm(): - create_test_act_fp16_class(TestLog2, atol=5e-2, grad_atol=0.85) + create_test_act_fp16_class(TestLog2) else: - create_test_act_fp16_class(TestLog2, atol=5e-2) -create_test_act_fp16_class(TestLog10, atol=5e-2) -create_test_act_fp16_class(TestLog1p, grad_atol=0.9) + create_test_act_fp16_class(TestLog2) +create_test_act_fp16_class(TestLog10) +create_test_act_fp16_class(TestLog1p) create_test_act_fp16_class(TestSquare) -create_test_act_fp16_class(TestPow, check_prim=True, atol=5e-2) 
-create_test_act_fp16_class(TestPow_factor_tensor, atol=5e-2) -create_test_act_fp16_class(TestSTanh, grad_atol=0.9) +create_test_act_fp16_class(TestPow, check_prim=True) +create_test_act_fp16_class(TestPow_factor_tensor) +create_test_act_fp16_class(TestSTanh) create_test_act_fp16_class(TestSoftplus) create_test_act_fp16_class(TestSoftsign) create_test_act_fp16_class(TestThresholdedRelu) create_test_act_fp16_class(TestHardSigmoid) -create_test_act_fp16_class(TestSwish, grad_atol=0.85) +create_test_act_fp16_class(TestSwish) create_test_act_fp16_class(TestHardSwish, check_prim=True) -create_test_act_fp16_class(TestMish, grad_atol=0.9) +create_test_act_fp16_class(TestMish) +create_test_act_fp16_class(TestLeakyRelu) +create_test_act_fp16_class(TestLeakyReluAlpha1) +create_test_act_fp16_class(TestLeakyReluAlpha2) +create_test_act_fp16_class(TestLeakyReluAlpha3) +create_test_act_fp16_class(TestLeakyRelu_ZeroDim) +create_test_act_fp16_class(TestRsqrt) def create_test_act_bf16_class( - parent, atol=1e-2, grad_check=True, grad_atol=0.80 + parent, + atol=1e-2, + grad_check=True, + check_dygraph=True, + check_prim=False, + enable_cinn=True, + grad_atol=1e-2, + **kwargs ): @unittest.skipIf( - not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", ) class TestActBF16(parent): + def setUp(self): + super().setUp() + for k, v in kwargs.items(): + setattr(self, k, v) + def init_dtype(self): + self.dtype = np.float32 + + def convert_input_output(self): + self.inputs = {'X': convert_float_to_uint16(self.inputs['X'])} + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} self.dtype = np.uint16 def test_check_output(self): @@ -3995,17 +4074,80 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=grad_atol - ) + if grad_check: + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=grad_atol + ) - cls_name = "{}_{}".format(parent.__name__, "bf16") + cls_name = "{}_{}".format(parent.__name__, "BF16OP") TestActBF16.__name__ = cls_name globals()[cls_name] = TestActBF16 -create_test_act_bf16_class(TestRelu) -create_test_act_bf16_class(TestAbs) +create_test_act_bf16_class(TestActivation, check_prim=True) +create_test_act_bf16_class(TestExpm1) +create_test_act_bf16_class(TestSigmoid, check_prim=True) +create_test_act_bf16_class(TestSilu, check_prim=True) +create_test_act_bf16_class(TestLogSigmoid) +create_test_act_bf16_class(TestTanh) +create_test_act_bf16_class(TestTanhshrink) +create_test_act_bf16_class(TestHardShrink) +create_test_act_bf16_class(TestSoftshrink) +create_test_act_bf16_class(TestSqrt, check_prim=True) +create_test_act_bf16_class(TestSqrtComp, check_prim=True) +create_test_act_bf16_class(TestAbs, check_prim=True) +create_test_act_bf16_class(TestCeil, grad_check=False) +create_test_act_bf16_class(TestFloor, grad_check=False, check_prim=True) +create_test_act_bf16_class(TestCos) +create_test_act_bf16_class(TestTan) +create_test_act_bf16_class(TestCosh) +create_test_act_bf16_class(TestAcos) +create_test_act_bf16_class(TestSin) +create_test_act_bf16_class(TestSinh) +create_test_act_bf16_class(TestAsin) +create_test_act_bf16_class(TestAtan) +create_test_act_bf16_class(TestAcosh) +create_test_act_bf16_class(TestAsinh) +create_test_act_bf16_class(TestAtanh) +create_test_act_bf16_class(TestRound, 
grad_check=False) +create_test_act_bf16_class(TestRelu, check_prim=True) +create_test_act_bf16_class( + TestGelu, + check_prim=True, + enable_cinn=False, + rev_comp_rtol=1e-2, + rev_comp_atol=1e-2, +) +create_test_act_bf16_class(TestBRelu) +create_test_act_bf16_class(TestRelu6) +create_test_act_bf16_class(TestSoftRelu, check_dygraph=False) +create_test_act_bf16_class(TestELU) +create_test_act_bf16_class(TestCELU) +create_test_act_bf16_class(TestReciprocal) +create_test_act_bf16_class(TestLog, check_prim=True) +if core.is_compiled_with_rocm(): + create_test_act_bf16_class(TestLog2) +else: + create_test_act_bf16_class(TestLog2) +create_test_act_bf16_class(TestLog10) +create_test_act_bf16_class(TestLog1p) +create_test_act_bf16_class(TestSquare) +create_test_act_bf16_class(TestPow, check_prim=True) +create_test_act_bf16_class(TestPow_factor_tensor) +create_test_act_bf16_class(TestSTanh) +create_test_act_bf16_class(TestSoftplus) +create_test_act_bf16_class(TestSoftsign) +create_test_act_bf16_class(TestThresholdedRelu) +create_test_act_bf16_class(TestHardSigmoid) +create_test_act_bf16_class(TestSwish) +create_test_act_bf16_class(TestHardSwish, check_prim=True) +create_test_act_bf16_class(TestMish) +create_test_act_bf16_class(TestLeakyRelu) +create_test_act_bf16_class(TestLeakyReluAlpha1) +create_test_act_bf16_class(TestLeakyReluAlpha2) +create_test_act_bf16_class(TestLeakyReluAlpha3) +create_test_act_bf16_class(TestLeakyRelu_ZeroDim) +create_test_act_bf16_class(TestRsqrt) if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rrelu_op.py b/python/paddle/fluid/tests/unittests/test_rrelu_op.py index eb7fb9df5e17d5..b86b7808aba938 100644 --- a/python/paddle/fluid/tests/unittests/test_rrelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_rrelu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn.functional as F @@ -327,7 +327,7 @@ def setUp(self): ] # python out sig is customized output signature. def init_params(self): - self.dtype = "float64" + self.init_dtype() self.x_shape = [2, 3, 4, 5] x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) @@ -337,12 +337,19 @@ def init_params(self): self.inputs = {'X': x_np} self.outputs = {'Out': out_np, 'Noise': noise_np} + self.convert_input_output() self.attrs = { 'lower': self.lower, "upper": self.upper, "is_test": self.is_test, } + def init_dtype(self): + self.dtype = "float64" + + def convert_input_output(self): + pass + def test_check_output(self): self.check_output(no_check_set=['Noise']) @@ -363,5 +370,67 @@ def setUp(self): ] # python out sig is customized output signature. 
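# A sketch for orientation, illustration only: the FP16/BF16 op-test
# variants added below follow the same pattern as the activation tests
# above. Each case is built in float32, then inputs/outputs are
# reinterpreted as uint16 arrays holding bfloat16 bit patterns via
# convert_float_to_uint16. A minimal sketch of what such a conversion
# amounts to (the real helper in eager_op_test may also round to nearest;
# this truncating version is an assumption for illustration):
#
#     import numpy as np
#
#     def to_bf16_bits(x):
#         # float32 -> raw uint32 bits -> keep the high 16 bits (bfloat16)
#         bits = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
#         return np.right_shift(bits, 16).astype(np.uint16)
#
#     to_bf16_bits(np.array([1.0, 0.5]))  # array([16256, 16128], dtype=uint16)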
+class RReluTestFP16OP(RReluTest): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class RReluTestBF16OP(RReluTest): + def init_dtype(self): + self.dtype = np.float32 + + def convert_input_output(self): + self.inputs = {'X': convert_float_to_uint16(self.inputs['X'])} + self.outputs = { + 'Out': convert_float_to_uint16(self.outputs['Out']), + 'Noise': convert_float_to_uint16(self.outputs['Noise']), + } + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, no_check_set=['Noise']) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +class RReluTrainingTestFP16OP(RReluTrainingTest): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class RReluTrainingTestBF16OP(RReluTrainingTest): + def init_dtype(self): + self.dtype = np.float32 + + def convert_input_output(self): + self.inputs = {'X': convert_float_to_uint16(self.inputs['X'])} + self.outputs = { + 'Out': convert_float_to_uint16(self.outputs['Out']), + 'Noise': convert_float_to_uint16(self.outputs['Noise']), + } + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, no_check_set=['Noise']) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + if __name__ == "__main__": unittest.main() From 3c0b1795c413b55e582bcd5b0c2eb00082d793eb Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 10 Apr 2023 13:08:31 +0800 Subject: [PATCH 015/156] [enforce.h Decouple gflags.h] Move gflags.h from enforce.h to enforce.cc (#52573) * [enforce.h Decouple gflags.h] Move gflags.h from enforce.h to enforce.cc * Add gflags.h for other files * Add gflags.h for other files * Add gflags.h for blas_impl.hip.h * Add gflags.h for miopen_helper.h --- paddle/phi/api/profiler/device_tracer.cc | 1 + paddle/phi/api/yaml/generator/api_gen.py | 1 + paddle/phi/api/yaml/generator/backward_api_gen.py | 1 + paddle/phi/api/yaml/generator/intermediate_api_gen.py | 1 + paddle/phi/api/yaml/generator/sparse_api_gen.py | 1 + paddle/phi/api/yaml/generator/sparse_bw_api_gen.py | 1 + paddle/phi/api/yaml/generator/strings_api_gen.py | 1 + paddle/phi/backends/gpu/cuda/cudnn_helper.h | 1 + paddle/phi/backends/gpu/rocm/miopen_helper.h | 2 ++ paddle/phi/core/enforce.cc | 1 + paddle/phi/core/enforce.h | 1 - paddle/phi/core/kernel_factory.cc | 1 + paddle/phi/infermeta/unary.cc | 1 + paddle/phi/kernels/autotune/cache_base.h | 1 + paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 1 + paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 2 ++ paddle/phi/kernels/gpu/embedding_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/index_add_kernel.cu | 1 + paddle/phi/kernels/gpu/index_select_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/randperm_kernel.cu | 2 ++ paddle/phi/kernels/impl/einsum_impl.h | 1 + paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc | 2 ++ test/cpp/phi/api/scale_api.h | 1 + 23 files changed, 26 insertions(+), 1 deletion(-) diff --git a/paddle/phi/api/profiler/device_tracer.cc b/paddle/phi/api/profiler/device_tracer.cc index 6499b42b17f955..7bd09ff8413b0c 100644 --- 
a/paddle/phi/api/profiler/device_tracer.cc +++ b/paddle/phi/api/profiler/device_tracer.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include // NOLINT +#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index 468d07efc5a526..71285de7b24723 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -360,6 +360,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py index 48dd09c5f461f7..36ac38a88dd8d0 100644 --- a/paddle/phi/api/yaml/generator/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -275,6 +275,7 @@ def source_include(header_file_path, fw_header_file_path): #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" diff --git a/paddle/phi/api/yaml/generator/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py index 092a8e6ad1d438..df950cae8ea26f 100644 --- a/paddle/phi/api/yaml/generator/intermediate_api_gen.py +++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py @@ -36,6 +36,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_gen_utils.h" diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py index 276e2555876392..ec20e1b0434631 100644 --- a/paddle/phi/api/yaml/generator/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -313,6 +313,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index eec3f734545247..697d2ac508a3fe 100644 --- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -121,6 +121,7 @@ def source_include(header_file_path): #include #include "glog/logging.h" +#include "gflags/gflags.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py index 7a25e21ae3bec7..53f4cd6e0dff3b 100644 --- a/paddle/phi/api/yaml/generator/strings_api_gen.py +++ b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -329,6 +329,7 @@ def source_include(header_file_path): return f""" #include "{header_file_path}" +#include "gflags/gflags.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/string_tensor.h" diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h index 330c056458c944..468567a9ee9d89 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include +#include "gflags/gflags.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/backends/gpu/rocm/miopen_helper.h b/paddle/phi/backends/gpu/rocm/miopen_helper.h index fc602d90fc6471..095f32ba460d01 100644 --- a/paddle/phi/backends/gpu/rocm/miopen_helper.h +++ b/paddle/phi/backends/gpu/rocm/miopen_helper.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "gflags/gflags.h" + #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 539fc57ec6756d..897ca5fe5c5ece 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include +#include "gflags/gflags.h" #include "paddle/phi/common/scalar.h" #include "paddle/utils/blank.h" diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index c709aa42b09003..70cd7af19cdbe9 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -59,7 +59,6 @@ limitations under the License. */ #endif #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/errors.h" diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 5744c3b85dcaf8..2851e8d2c5269f 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -14,6 +14,7 @@ #include "paddle/phi/core/kernel_factory.h" +#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #if defined(PADDLE_WITH_XPU) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 48fa20f0fb2299..0aac6f969beb72 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include +#include "gflags/gflags.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/kernels/autotune/cache_base.h b/paddle/phi/kernels/autotune/cache_base.h index d111ab1b57556d..267c8ef3f6859e 100644 --- a/paddle/phi/kernels/autotune/cache_base.h +++ b/paddle/phi/kernels/autotune/cache_base.h @@ -18,6 +18,7 @@ #include #include +#include "gflags/gflags.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index a27c7f013feef1..2568f88274f0ea 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -14,6 +14,7 @@ #pragma once +#include "gflags/gflags.h" #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 37343111d53dc4..5edfe3a602c7ba 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -14,6 +14,8 @@ #pragma once +#include "gflags/gflags.h" + #include "paddle/phi/backends/dynload/rocblas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 521e4b9282b82f..130dc570e33986 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/data_type.h" diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index 215b28085a92d5..e6d12e5c78414b 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/index_add_kernel.h" +#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 9578241829fd02..9c99a12fc285ef 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" +#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index dbfe2dd992ee5a..6ae0e6df07cfb7 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -25,6 +25,8 @@ #include namespace cub = hipcub; #endif + +#include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/impl/einsum_impl.h 
b/paddle/phi/kernels/impl/einsum_impl.h index 238c89b17012ac..400334ad4e04eb 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -15,6 +15,7 @@ #include +#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index cec2c01702ffe4..a9cc821aaaf5f1 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" +#include "gflags/gflags.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/test/cpp/phi/api/scale_api.h b/test/cpp/phi/api/scale_api.h index 2284ce0a42cead..c8ab3c7e985e0e 100644 --- a/test/cpp/phi/api/scale_api.h +++ b/test/cpp/phi/api/scale_api.h @@ -14,6 +14,7 @@ #pragma once +#include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/kernel_dispatch.h" From d6ee0a136943d29f2862de8f0b6ad04bcd150f35 Mon Sep 17 00:00:00 2001 From: kangguangli Date: Mon, 10 Apr 2023 13:12:34 +0800 Subject: [PATCH 016/156] [StandaloneExe] Remove flag about Executor (#52671) * add strategy force_sequential_run * remove flag * fix * fix * fix * fix * fix * fix * fix * fix * fix --- .../fluid/framework/details/build_strategy.h | 5 +- .../framework/new_executor/interpretercore.cc | 3 - .../framework/new_executor/interpretercore.h | 1 - .../controlflow/conditional_block_op.cc | 121 ++++++------------ .../fluid/operators/controlflow/while_op.cc | 121 ++++++------------ paddle/fluid/pybind/parallel_executor.cc | 10 +- python/paddle/fluid/executor.py | 25 +--- python/paddle/fluid/framework.py | 6 - .../test_standalone_sequentail_run.py | 12 +- 9 files changed, 101 insertions(+), 203 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b06ef5d1d22a7b..f336ab88c0cb40 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -135,7 +135,7 @@ struct BuildStrategy { bool fuse_adamw_{false}; // Fused feed forward bool fused_feedforward_{false}; - bool force_sequential_run_{false}; + bool sequential_run_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. 
It is null in default, means @@ -270,8 +270,7 @@ inline std::ostream &operator<<(std::ostream &os, os << "fuse_gemm_epilogue_: " << strategy.fuse_gemm_epilogue_ << std::endl; os << "fused_attention_: " << strategy.fused_attention_ << std::endl; os << "fused_feedforward_: " << strategy.fused_feedforward_ << std::endl; - os << "force_sequential_run_: " << strategy.force_sequential_run_ - << std::endl; + os << "sequential_run_: " << strategy.sequential_run_ << std::endl; os << "mkldnn_enabled_op_types_: "; for (auto str : strategy.mkldnn_enabled_op_types_) { os << str << ", "; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c99de872c8b111..d61a0a472d8738 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -50,9 +50,6 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); -PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor, - true, - "Use new executor in control flow op"); DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index daa2a281f8b1d6..46cbf9cfc3fcb9 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -34,7 +34,6 @@ #include "paddle/fluid/platform/device_event.h" DECLARE_bool(new_executor_use_local_scope); -DECLARE_bool(control_flow_use_new_executor); namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 4544dade327e0f..ee8ec2e276b616 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -95,48 +95,28 @@ class ConditionalBlockOp : public ConditionalOp { auto &skip_vars = Attr>(ConditionalOp::kSkipEagerDeletionVars); - if (FLAGS_control_flow_use_new_executor) { - LOG_FIRST_N(INFO, 1) - << "[ControlFlow][ConditionalBlock] New Executor is Running."; - if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { - VLOG(10) << "[interpreterCore cache]" << core_.get(); - VLOG_IF(10, core_) - << platform::is_same_place(core_->GetPlace(), dev_place); - - framework::interpreter::ExecutionConfig execution_config; - execution_config.create_local_scope = false; - execution_config.used_for_control_flow_op = true; - execution_config.skip_gc_vars = - std::set(skip_vars.begin(), skip_vars.end()); - - core_.reset(new InterpreterCore( - dev_place, *block, &cur_scope, execution_config)); - VLOG(10) << "[interpreterCore cache]" - << "new created:" << core_; - } else { - BuildScopeForControlFlowOp(*core_, *block, &cur_scope); - core_->reset_scope(&cur_scope); - } - - core_->Run({}, false); - + LOG_FIRST_N(INFO, 1) + << "[ControlFlow][ConditionalBlock] New Executor is Running."; + if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { + VLOG(10) << "[interpreterCore cache]" << core_.get(); + VLOG_IF(10, core_) << platform::is_same_place(core_->GetPlace(), + dev_place); + + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(skip_vars.begin(), 
skip_vars.end()); + + core_.reset(new InterpreterCore( + dev_place, *block, &cur_scope, execution_config)); + VLOG(10) << "[interpreterCore] created:" << core_; } else { - if (!exec_ || !platform::is_same_place(exec_->GetPlace(), dev_place)) { - auto &pdesc = *block->Program(); - exec_.reset(new Executor(dev_place)); - if (FLAGS_use_mkldnn) exec_->EnableMKLDNN(pdesc); - ctx_ = exec_->Prepare(pdesc, block->ID(), skip_vars, false); -#ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(exec_.get(), dev_place); - platform::RegisterModelLayout(ctx_->ops_, dev_place); -#endif - } - exec_->RunPreparedContext(ctx_.get(), - &cur_scope, - /* create_local_scope */ false, - /* create_vars */ true, - /* keep_kids */ true); + BuildScopeForControlFlowOp(*core_, *block, &cur_scope); + core_->reset_scope(&cur_scope); } + + core_->Run({}, false); } } @@ -208,47 +188,27 @@ class ConditionalBlockGradOp : public ConditionalOp { VLOG(3) << "Conditional Grad block.idx = " << block->ID() << ", scope = " << &cur_scope; - if (FLAGS_control_flow_use_new_executor) { - LOG_FIRST_N(INFO, 1) - << "[ControlFlow][ConditionalGradBlock] New Executor is Running."; - if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { - VLOG(10) << "[interpreterCore cache]" << core_.get(); - VLOG_IF(10, core_) - << platform::is_same_place(core_->GetPlace(), dev_place); - - framework::interpreter::ExecutionConfig execution_config; - execution_config.create_local_scope = false; - execution_config.used_for_control_flow_op = true; - execution_config.skip_gc_vars = - std::set(inside_grads.begin(), inside_grads.end()); - - core_.reset(new InterpreterCore( - dev_place, *block, &cur_scope, execution_config)); - VLOG(10) << "[interpreterCore cache]" - << "new created:" << core_; - } else { - BuildScopeForControlFlowOp(*core_, *block, &cur_scope); - core_->reset_scope(&cur_scope); - } - core_->Run({}, false); - + LOG_FIRST_N(INFO, 1) + << "[ControlFlow][ConditionalGradBlock] New Executor is Running."; + if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { + VLOG(10) << "[interpreterCore cache]" << core_.get(); + VLOG_IF(10, core_) << platform::is_same_place(core_->GetPlace(), + dev_place); + + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(inside_grads.begin(), inside_grads.end()); + + core_.reset(new InterpreterCore( + dev_place, *block, &cur_scope, execution_config)); + VLOG(10) << "[interpreterCore] created:" << core_; } else { - if (!exec_ || !platform::is_same_place(exec_->GetPlace(), dev_place)) { - auto &pdesc = *block->Program(); - exec_.reset(new Executor(dev_place)); - if (FLAGS_use_mkldnn) exec_->EnableMKLDNN(pdesc); - ctx_ = exec_->Prepare(pdesc, block->ID(), inside_grads, false); -#ifdef PADDLE_WITH_MKLDNN - platform::AttachPointerHashToMKLDNNKey(exec_.get(), dev_place); - platform::RegisterModelLayout(ctx_->ops_, dev_place); -#endif - } - exec_->RunPreparedContext(ctx_.get(), - &cur_scope, - /* create_local_scope */ false, - /* create_vars */ true, - /* keep_kids */ true); + BuildScopeForControlFlowOp(*core_, *block, &cur_scope); + core_->reset_scope(&cur_scope); } + core_->Run({}, false); AssignLocalGradientToParentScope( dev_place, cur_scope, scope, inside_grads, outside_grads, inputs); @@ -398,7 +358,8 @@ struct FilterNoGradInput { std::vector *vec) { auto f = [desc](const std::string &name) -> std::string { if (name == 
framework::kEmptyVarName) { - // don't drop empty var name, you can use Input(name, true) to drop it. + // don't drop empty var name, you can use Input(name, true) to drop + // it. return framework::kEmptyVarName; } auto var_desc = diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 3017a1e0fc4b79..30fdb90ce10696 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -120,7 +120,6 @@ class WhileOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); - auto *program = block->Program(); bool is_test = Attr("is_test"); std::set no_copy_var_names; @@ -199,26 +198,18 @@ class WhileOp : public framework::OperatorBase { } } - if (FLAGS_control_flow_use_new_executor) { - LOG_FIRST_N(INFO, 1) << "[ControlFlow][WhileOp] New Executor is Running."; - if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { - framework::Scope placeholder; // Don't care if it's valid, just for - // initialize InterpreterCore - framework::interpreter::ExecutionConfig execution_config; - execution_config.create_local_scope = false; - execution_config.used_for_control_flow_op = true; - execution_config.skip_gc_vars = - std::set(skip_vars.begin(), skip_vars.end()); - - core_.reset(new framework::InterpreterCore( - dev_place, *block, &placeholder, execution_config)); - } - } else { - if (!executor_ || - !platform::is_same_place(executor_->GetPlace(), dev_place)) { - executor_.reset(new framework::Executor(dev_place)); - ctx_ = executor_->Prepare(*program, block->ID(), skip_vars); - } + LOG_FIRST_N(INFO, 1) << "[ControlFlow][WhileOp] New Executor is Running."; + if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { + framework::Scope placeholder; // Don't care if it's valid, just for + // initialize InterpreterCore + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(skip_vars.begin(), skip_vars.end()); + + core_.reset(new framework::InterpreterCore( + dev_place, *block, &placeholder, execution_config)); } if (!is_test) { @@ -244,22 +235,17 @@ class WhileOp : public framework::OperatorBase { } } } - if (FLAGS_control_flow_use_new_executor) { - BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); - core_->reset_scope(¤t_scope); - core_->Run({}, false); - - // restore inputs place - for (const auto &n : input_var_original_places) { - const std::string &in_name = n.first; - const phi::Place &original_place = n.second; - // input vars exist in `scope` not `current_scope` - TransferVariablePlace(&scope, in_name, original_place, dev_ctx); - } - } else { - executor_->RunPreparedContext( - ctx_.get(), ¤t_scope, false, true, true); + BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); + core_->reset_scope(¤t_scope); + core_->Run({}, false); + + // restore inputs place + for (const auto &n : input_var_original_places) { + const std::string &in_name = n.first; + const phi::Place &original_place = n.second; + // input vars exist in `scope` not `current_scope` + TransferVariablePlace(&scope, in_name, original_place, dev_ctx); } for (auto &var_rename : rename_vars) { @@ -273,12 +259,8 @@ class WhileOp : public framework::OperatorBase { } else { auto ¤t_scope = scope.NewScope(); - if (FLAGS_control_flow_use_new_executor) { - 
BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); - core_->reset_scope(¤t_scope); - } else { - executor_->CreateVariables(*program, ¤t_scope, block->ID()); - } + BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); + core_->reset_scope(¤t_scope); while (cond_data) { for (auto &name : current_scope.LocalVarNames()) { @@ -295,12 +277,7 @@ class WhileOp : public framework::OperatorBase { } } - if (FLAGS_control_flow_use_new_executor) { - core_->Run({}, false); - } else { - executor_->RunPreparedContext( - ctx_.get(), ¤t_scope, false, false, false); - } + core_->Run({}, false); cond_data = GetCondData( scope.FindVar(Input(kCondition))->Get()); @@ -367,7 +344,6 @@ class WhileGradOp : public framework::OperatorBase { auto &dev_ctx = *pool.Get(dev_place); auto *block = Attr(kStepBlock); - auto *program = block->Program(); auto *parent_block = block->ParentBlock(); auto &skip_vars = Attr>(kSkipEagerDeletionVars); @@ -391,28 +367,20 @@ class WhileGradOp : public framework::OperatorBase { outside_og_names.size(), inside_og_names.size())); - if (FLAGS_control_flow_use_new_executor) { - LOG_FIRST_N(INFO, 1) - << "[ControlFlow][WhileGradOp] New Executor is Running."; - if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { - std::set skip_gc_vars(skip_vars.begin(), skip_vars.end()); - framework::Scope placeholder; // Don't care if it's valid, just for - // initialize InterpreterCore - framework::interpreter::ExecutionConfig execution_config; - execution_config.create_local_scope = false; - execution_config.used_for_control_flow_op = true; - execution_config.skip_gc_vars = - std::set(skip_vars.begin(), skip_vars.end()); - - core_.reset(new framework::InterpreterCore( - dev_place, *block, &placeholder, execution_config)); - } - } else { - if (!executor_ || - !platform::is_same_place(executor_->GetPlace(), dev_place)) { - executor_.reset(new framework::Executor(dev_place)); - ctx_ = executor_->Prepare(*program, block->ID(), skip_vars); - } + LOG_FIRST_N(INFO, 1) + << "[ControlFlow][WhileGradOp] New Executor is Running."; + if (!core_ || !platform::is_same_place(core_->GetPlace(), dev_place)) { + std::set skip_gc_vars(skip_vars.begin(), skip_vars.end()); + framework::Scope placeholder; // Don't care if it's valid, just for + // initialize InterpreterCore + framework::interpreter::ExecutionConfig execution_config; + execution_config.create_local_scope = false; + execution_config.used_for_control_flow_op = true; + execution_config.skip_gc_vars = + std::set(skip_vars.begin(), skip_vars.end()); + + core_.reset(new framework::InterpreterCore( + dev_place, *block, &placeholder, execution_config)); } for (auto cur_scope_iter = step_scopes->rbegin(); @@ -504,14 +472,9 @@ class WhileGradOp : public framework::OperatorBase { } } - if (FLAGS_control_flow_use_new_executor) { - BuildScopeForControlFlowOp(*core_, *block, *cur_scope_iter); - core_->reset_scope(*cur_scope_iter); - core_->Run({}, false); - } else { - executor_->RunPreparedContext( - ctx_.get(), *cur_scope_iter, false, true, true); - } + BuildScopeForControlFlowOp(*core_, *block, *cur_scope_iter); + core_->reset_scope(*cur_scope_iter); + core_->Run({}, false); // The Outputs(kXGRAD) contains the names of the gradient of parameters // and inputs. 
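A minimal usage sketch for the behavior changed above, assumptions noted:
with the FLAGS_control_flow_use_new_executor branch gone, conditional_block
and while always execute their sub-blocks on InterpreterCore, and sequential
scheduling is requested either through the BuildStrategy property bound in
the next diff or through the global flag that executor.py maps it to. The
set_flags call is inferred from the flag name used in the test at the end of
this commit, not from documented API.

    import paddle
    from paddle import static

    paddle.enable_static()

    # per-program: maps to BuildStrategy::sequential_run_ in C++
    build_strategy = static.BuildStrategy()
    build_strategy.sequential_run = True

    # process-wide: the flag the standalone executor checks
    paddle.set_flags({'FLAGS_new_executor_sequential_run': True})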
diff --git a/paddle/fluid/pybind/parallel_executor.cc b/paddle/fluid/pybind/parallel_executor.cc index 0dc171aabc740e..0c8898b524fae5 100644 --- a/paddle/fluid/pybind/parallel_executor.cc +++ b/paddle/fluid/pybind/parallel_executor.cc @@ -760,17 +760,17 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT build_strategy.fused_feedforward = True )DOC") .def_property( - "force_sequential_run", - [](const BuildStrategy &self) { return self.force_sequential_run_; }, + "sequential_run", + [](const BuildStrategy &self) { return self.sequential_run_; }, [](BuildStrategy &self, bool b) { PADDLE_ENFORCE_NE(self.IsFinalized(), true, platform::errors::PreconditionNotMet( "BuildStrategy has been finlaized, cannot be " "configured again.")); - self.force_sequential_run_ = b; + self.sequential_run_ = b; }, - R"DOC((bool, optional): force_sequential_run is used to let the `StandaloneExecutor` run ops by the + R"DOC((bool, optional): sequential_run is used to let the `StandaloneExecutor` run ops by the order of `ProgramDesc`. Default is False. Examples: @@ -782,7 +782,7 @@ void BindParallelExecutor(pybind11::module &m) { // NOLINT paddle.enable_static() build_strategy = static.BuildStrategy() - build_strategy.fused_feedforward = True + build_strategy.sequential_run = True )DOC") .def_property( "fuse_bn_act_ops", diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 55dac9695cdffb..37718c25c6c1f1 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -493,26 +493,6 @@ def _to_str(var): return _to_str(var) -def _is_dy2st_enable_standalone_executor(): - return framework._dy2st_enable_standalone_executor_ in [ - 1, - '1', - True, - 'True', - 'true', - ] - - -def _is_cuda_graph_enable_standalone_executor(): - return framework._cuda_graph_enable_standalone_executor_ in [ - 1, - '1', - True, - 'True', - 'true', - ] - - def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 @@ -1619,10 +1599,7 @@ def _can_use_interpreter_core(program, place): else program._graph ) build_strategy = compiled_program._build_strategy - if ( - build_strategy is not None - and build_strategy.force_sequential_run - ): + if build_strategy is not None and build_strategy.sequential_run: schedule_flag = [ 'FLAGS_new_executor_serial_run', 'FLAGS_new_executor_sequential_run', diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 98d0fbf0620b7d..708cc462e78ea9 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -117,12 +117,6 @@ def __setattr__(self, name, val): _current_cuda_graph_mode = None _global_flags_ = core.globals() -_dy2st_enable_standalone_executor_ = os.environ.get( - 'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 1 -) -_cuda_graph_enable_standalone_executor_ = os.environ.get( - 'FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR', 0 -) # special_op_attrs, extra_op_attrs are prepared for printing warnings # when turning on FLAGS_print_extra_attrs diff --git a/test/standalone_executor/test_standalone_sequentail_run.py b/test/standalone_executor/test_standalone_sequentail_run.py index bc7368e58b4e72..64b4e4293d100f 100644 --- a/test/standalone_executor/test_standalone_sequentail_run.py +++ b/test/standalone_executor/test_standalone_sequentail_run.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os import unittest import numpy as np @@ -31,13 +32,14 @@ def build_program(self): return main_program, startup_program, [c] - def run_program(self, force_sequential_run=False): + def run_program(self, sequential_run=False): seed = 100 paddle.seed(seed) np.random.seed(seed) main, startup, outs = self.build_program() build_strategy = paddle.static.BuildStrategy() - build_strategy.force_sequential_run = force_sequential_run + build_strategy.sequential_run = sequential_run + print(build_strategy) compiled_program = paddle.static.CompiledProgram( main, build_strategy=build_strategy ) @@ -60,6 +62,12 @@ def test_result(self): ret2 = self.run_program(False) np.testing.assert_array_equal(ret1, ret2) + def test_str_flag(self): + paddle.enable_static() + os.environ['FLAGS_new_executor_sequential_run'] = 'true' + ret1 = self.run_program(True) + assert os.environ['FLAGS_new_executor_sequential_run'] == "true" + if __name__ == "__main__": unittest.main() From f9aaa1e485623de633845c023aa2ac2b1ed09e22 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 10 Apr 2023 13:13:22 +0800 Subject: [PATCH 017/156] [Approval For Phi] Add approval check for including third-party in phi headerfiles (#52653) --- tools/check_file_diff_approvals.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 39ece6a3b13466..50f9344c66fe41 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -352,6 +352,19 @@ if [ "${PHI_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 chenwhql YuanRisheng zyfncg fi +HAS_MODIFIED_PHI_HEADER_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/.*\.h" || true` +PHI_INCLUDE_THIRD_PARTY_FILES="" +for CHANGE_FILE in ${HAS_MODIFIED_PHI_HEADER_FILES}; do + PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -E "#include \"gflags/gflags.h\"|#include \"glog/logging.h\"" || true` + if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + PHI_INCLUDE_THIRD_PARTY_FILES="${PHI_INCLUDE_THIRD_PARTY_FILES} ${CHANGE_FILE}" + fi +done +if [ "${PHI_INCLUDE_THIRD_PARTY_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (jiahy0825, zyfncg, chenwhql, YuanRisheng or heavyrain-lzy) approval for including \"gflags/gflags.h\" or \"glog/logging.h\" headerfile in paddle/phi headerfiles(${PHI_INCLUDE_THIRD_PARTY_FILES}). Recommend including third party headers in phi source files(*.cc) instead of phi headerfiles(*.h). 
Because if phi headerfiles include third party headers like \"gflags.h\" or \"logging.h\", error might occur when outside developers use phi headerfiles directly.\n" + check_approval 1 jiahy0825 zyfncg chenwhql YuanRisheng heavyrain-lzy +fi + HAS_MODIFIED_PHI_OR_FLUID_FILES=`git diff --name-only upstream/$BRANCH | grep -E "paddle/phi|paddle/fluid" || true` USE_MUTABLE_DATA_FILES="" for CHANGE_FILE in ${HAS_MODIFIED_PHI_OR_FLUID_FILES}; do From 351ccb63e5d122869f97b2dc82e1c0e36faa8d36 Mon Sep 17 00:00:00 2001 From: lzydev Date: Mon, 10 Apr 2023 14:10:47 +0800 Subject: [PATCH 018/156] Autogen softmax_with_cross_entropy (#52515) * autogen softmax_with_cross_entropy * fix error in softmax_with_cross_entropy version --- .../softmax_with_cross_entropy_op.cc | 336 ------------------ paddle/phi/api/yaml/backward.yaml | 11 + paddle/phi/api/yaml/legacy_backward.yaml | 11 - paddle/phi/api/yaml/legacy_ops.yaml | 11 - paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/op_version.yaml | 8 + paddle/phi/api/yaml/ops.yaml | 12 + .../compat/softmax_with_cross_entropy_sig.cc | 53 --- python/paddle/nn/functional/loss.py | 14 - 9 files changed, 38 insertions(+), 425 deletions(-) delete mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op.cc delete mode 100644 paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc deleted file mode 100644 index e745dfa96b6c98..00000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ /dev/null @@ -1,336 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -class SoftmaxWithCrossEntropyOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Logits", - "(Tensor, default: Tensor), The input tensor of unscaled " - "log probabilities, whose dimension :attr:`axis` should be scaled " - "by softmax."); - AddInput( - "Label", - "(Tensor) The input tensor of groud truth label. If :attr:`soft_label` " - "is set to false, Label is a Tensor in same shape with " - "Input(Logits) except the shape in dimension :attr:`axis` as 1. If " - "soft_label is set to true, Label is a Tensor in same " - "shape with Input(Logits)."); - AddOutput( - "Softmax", - "(Tensor, default: Tensor), A tensor in same shape with " - "Input(Logits). " - "The outputs value of softmax activation by given the input batch, " - "which will be used in backward calculation.") - .AsIntermediate(); - AddOutput("Loss", - "(Tensor, default: Tensor), A tensor in same shape with " - "Input(Logits) " - "except the shape in dimension :attr:`axis` as 1. 
The cross " - "entropy loss."); - AddAttr( - "soft_label", - "(bool, default: false), A flag to indicate whether to interpretant " - "the given labels as soft labels.") - .SetDefault(false); - AddAttr( - "use_softmax", - "(bool, default: true), A flag to indicate whether to do softmax ") - .SetDefault(true); - AddAttr( - "numeric_stable_mode", - "(bool, default: true), A flag to indicate whether to use more " - "numerically stable algorithm. This flag is only valid when " - "soft_label is false and GPU is used.") - .SetDefault(true); - AddAttr( - "ignore_index", - "(int, default -100), Specifies a target value that is ignored and" - "does not contribute to the input gradient. Only valid if soft_label" - "is set to False") - .SetDefault(-100); - AddAttr("axis", - "The dimension index of Input(Logits) to perform softmax," - "default -1 for last dimension") - .SetDefault(-1); - AddComment(R"DOC( -Softmax With Cross Entropy Operator. - -Cross entropy loss with softmax is used as the output layer extensively. This -operator computes the softmax normalized values for each row of the input -tensor, after which cross-entropy loss is computed. This provides a more -numerically stable gradient. - -Because this operator performs a softmax on logits internally, it expects -unscaled logits. This operator should not be used with the output of -softmax operator since that would produce incorrect results. - -When the attribute soft_label is set false, this operators expects mutually -exclusive hard labels, each sample in a batch is in exactly one class with a -probability of 1.0. Each sample in the batch will have a single label. - -The equation is as follows: - -1) Hard label (one-hot label, so every sample has exactly one class) - -$$Loss_j = -\text{Logit}_{Label_j} + -\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right), -j = 1,..., K$$ - -2) Soft label (each sample can have a distribution over all classes) - -$$Loss_j = -\sum_{i=0}^{K}\text{Label}_i \left(\text{Logit}_i - -\log\left(\sum_{i=0}^{K}\exp(\text{Logit}_i)\right)\right), -j = 1,...,K$$ - -)DOC"); - } -}; - -class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Logits"), - true, - platform::errors::InvalidArgument("Input(Logits) should be not null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Label"), - true, - platform::errors::InvalidArgument("Input(Label) should be not null.")); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), - true, - platform::errors::InvalidArgument( - "Output(Softmax) should be not null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Loss"), - true, - platform::errors::InvalidArgument("Output(Loss) should be not null.")); - - auto axis = ctx->Attrs().Get("axis"); - auto logits_dims = ctx->GetInputDim("Logits"); - auto labels_dims = ctx->GetInputDim("Label"); - auto logits_rank = logits_dims.size(); - PADDLE_ENFORCE_GE(axis, - -logits_rank, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(Logits).")); - PADDLE_ENFORCE_LT(axis, - logits_rank, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(Logits).")); - - axis = phi::funcs::CanonicalAxis(axis, logits_rank); - for (int i = 0; i < logits_rank; i++) { - if (i != axis) { - if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { - 
PADDLE_ENFORCE_EQ(logits_dims[i], - labels_dims[i], - platform::errors::InvalidArgument( - "Input(Logits) and Input(Label) should in " - "same shape in dimensions except axis.")); - } - } - } - - auto numeric_stable_mode = ctx->Attrs().Get("numeric_stable_mode"); - if (axis != logits_rank - 1) { - PADDLE_ENFORCE_EQ(numeric_stable_mode, - true, - platform::errors::InvalidArgument( - "Attr(axis) can only be -1 " - "when not in numeric_stable_mode.")); - } - - bool soft_label = ctx->Attrs().Get("soft_label"); - if (soft_label) { - if (ctx->IsRuntime() || - (logits_dims[axis] > 0 && labels_dims[axis] > 0)) { - PADDLE_ENFORCE_EQ(logits_dims[axis], - labels_dims[axis], - platform::errors::InvalidArgument( - "If Attr(soft_label) == true, " - "the axis dimension of " - "Input(X) and Input(Label) should be equal.")); - } - } else { - if (ctx->IsRuntime() || labels_dims[axis] > 0) { - PADDLE_ENFORCE_EQ( - labels_dims[axis], - 1UL, - platform::errors::InvalidArgument("If Attr(soft_label) == false, " - "the axis dimension of " - "Input(Label) should be 1.")); - } - } - - ctx->SetOutputDim("Softmax", logits_dims); - - logits_dims[axis] = 1; - ctx->SetOutputDim("Loss", logits_dims); - - ctx->ShareLoD("Logits", /*->*/ "Softmax"); - ctx->ShareLoD("Logits", /*->*/ "Loss"); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), ctx.GetPlace()); - } -}; - -class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), - true, - platform::errors::InvalidArgument( - "Input(Loss@Grad) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), - true, - platform::errors::InvalidArgument( - "Input(Softmax) should be not null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Label"), - true, - platform::errors::InvalidArgument("Input(Label) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), - true, - platform::errors::InvalidArgument( - "Output(Logits@Grad) should be not null.")); - - auto axis = ctx->Attrs().Get("axis"); - auto softmax_dims = ctx->GetInputDim("Softmax"); - auto labels_dims = ctx->GetInputDim("Label"); - auto softmax_rank = softmax_dims.size(); - PADDLE_ENFORCE_GE(axis, - -softmax_rank, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(Logits).")); - PADDLE_ENFORCE_LT(axis, - softmax_rank, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(Logits).")); - - axis = phi::funcs::CanonicalAxis(axis, softmax_rank); - for (int i = 0; i < softmax_rank; i++) { - if (i != axis) { - if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) { - PADDLE_ENFORCE_EQ( - softmax_dims[i], - labels_dims[i], - platform::errors::InvalidArgument( - "Input(Logits) and Input(Label) should in same shape in " - "dimensions except axis.")); - } - } - } - - bool soft_label = ctx->Attrs().Get("soft_label"); - if (soft_label) { - if (ctx->IsRuntime() || - (softmax_dims[axis] > 0 && labels_dims[axis] > 0)) { - PADDLE_ENFORCE_EQ(softmax_dims[axis], - labels_dims[axis], - platform::errors::InvalidArgument( - "If Attr(soft_label) == true, " - "the axis dimension of " - "Input(X) and 
Input(Label) should be equal.")); - } - } else { - if (ctx->IsRuntime() || labels_dims[axis] > 0) { - PADDLE_ENFORCE_EQ( - labels_dims[axis], - 1UL, - platform::errors::InvalidArgument("If Attr(soft_label) == false, " - "the axis dimension of " - "Input(Label) should be 1.")); - } - } - - ctx->SetOutputDim(framework::GradVarName("Logits"), - ctx->GetInputDim("Softmax")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Loss")), - ctx.GetPlace()); - } -}; - -template -class SoftmaxGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("softmax_with_cross_entropy_grad"); - grad_op->SetInput("Label", this->Input("Label")); - grad_op->SetInput("Softmax", this->Output("Softmax")); - grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); - grad_op->SetOutput(framework::GradVarName("Logits"), - this->InputGrad("Logits")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInferer, - {"Logits", "Softmax"}); - -DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInferer, - {"Softmax", framework::GradVarName("Logits")}); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyOp, - ops::SoftmaxWithCrossEntropyOpMaker, - ops::SoftmaxGradMaker, - ops::SoftmaxGradMaker, - ops::SoftmaxWithCrossEntropyInplaceInferer); -REGISTER_OPERATOR(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyOpGrad, - ops::SoftmaxWithCrossEntropyGradInplaceInferer); - -REGISTER_OP_VERSION(softmax_with_cross_entropy) - .AddCheckpoint( - R"ROC( - Add a new attribute [use_softmax] )ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "use_softmax", "A flag to indicate whether to do softmax", true)); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 0a6062dd8294c5..2b49b6950f68ac 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -348,6 +348,17 @@ func : crop_grad data_type : x +- backward_op : cross_entropy_with_softmax_grad + forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label=false, bool use_softmax=true, bool numeric_stable_mode=true, int ignore_index=-100, int axis=-1) -> Tensor(softmax), Tensor(loss) + args : (Tensor label, Tensor softmax, Tensor loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) + output : Tensor(input_grad) + infer_meta : + func : CrossEntropyWithSoftmaxGradInferMeta + kernel : + func : cross_entropy_with_softmax_grad + data_type : loss_grad + inplace : (softmax -> input_grad) + - backward_op : cross_grad forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index b655f379cf71eb..25582edca0c937 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -239,17 +239,6 @@ kernel : func : conv3d_transpose_grad -- backward_op : cross_entropy_with_softmax_grad - forward : cross_entropy_with_softmax (Tensor input, Tensor label, 
bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss) - args : (Tensor label, Tensor softmax, Tensor loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) - output : Tensor(input_grad) - infer_meta : - func : CrossEntropyWithSoftmaxGradInferMeta - kernel : - func : cross_entropy_with_softmax_grad - data_type : softmax - inplace : (softmax -> input_grad) - - backward_op : cumsum_grad forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index d827e7eabbfa7b..972f85070ba8f8 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -322,17 +322,6 @@ output : Tensor(out) invoke : copy_to_impl(x, place, blocking) -# Part of python API paddle.nn.functional.cross_entropy -- op : cross_entropy_with_softmax - args : (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) - output : Tensor(softmax), Tensor(loss) - infer_meta : - func : CrossEntropyWithSoftmaxInferMeta - kernel : - func : cross_entropy_with_softmax - data_type : input - backward : cross_entropy_with_softmax_grad - - op : cumsum args : (Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 552895cf25fde4..e9a790e912871d 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -456,6 +456,13 @@ outputs : out : Out +- op : cross_entropy_with_softmax (softmax_with_cross_entropy) + backward : cross_entropy_with_softmax_grad (softmax_with_cross_entropy_grad) + inputs : + {input : Logits, label : Label} + outputs : + {softmax : Softmax, loss : Loss} + - op : cumprod backward : cumprod_grad inputs : diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index 8014103fad92af..b36cd86f78ae8a 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -191,6 +191,14 @@ - add_input : ShiftsTensor comment : The number of places by which the elements of the tensor are shifted. 
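# Illustration only: the entry added below migrates the [use_softmax]
# checkpoint from the deleted softmax_with_cross_entropy_op.cc into YAML.
# The hard-label loss that file documented,
#     Loss_j = -Logit_{Label_j} + log(sum_i exp(Logit_i)),
# can be sanity-checked with plain numpy (a hand check, not repository code):
#
#     import numpy as np
#     logits = np.array([2.0, 0.5, -1.0])
#     label = 0
#     loss = -logits[label] + np.log(np.exp(logits).sum())
#     # loss ~ 0.2413, the value cross_entropy_with_softmax returns for
#     # these inputs with soft_label=False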
+- op : softmax_with_cross_entropy + version : + - checkpoint : Add a new attribute [use_softmax] + action : + - add_attr : use_softmax + comment : A flag to indicate whether to do softmax + default : "true" + - op : trace version : - checkpoint : Upgrade trace add a new attribute [axis2] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 40e47845fe9002..5d5eb8d9fc3fe3 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -380,6 +380,18 @@ data_type : x backward : cross_grad +# Part of python API paddle.nn.functional.cross_entropy +- op : cross_entropy_with_softmax + args : (Tensor input, Tensor label, bool soft_label=false, bool use_softmax=true, bool numeric_stable_mode=true, int ignore_index=-100, int axis=-1) + output : Tensor(softmax), Tensor(loss) + inplace : (input -> softmax) + infer_meta : + func : CrossEntropyWithSoftmaxInferMeta + kernel : + func : cross_entropy_with_softmax + data_type : input + backward : cross_entropy_with_softmax_grad + - op : cumprod args : (Tensor x, int dim) output : Tensor(out) diff --git a/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc b/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc deleted file mode 100644 index c75d4f711dc0fd..00000000000000 --- a/paddle/phi/ops/compat/softmax_with_cross_entropy_sig.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature SoftmaxWithCrossEntropyOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("cross_entropy_with_softmax",
-                         {"Logits", "Label"},
-                         {"soft_label",
-                          "use_softmax",
-                          "numeric_stable_mode",
-                          "ignore_index",
-                          "axis"},
-                         {"Softmax", "Loss"});
-}
-
-KernelSignature SoftmaxWithCrossEntropyGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("cross_entropy_with_softmax_grad",
-                         {"Label", "Softmax", "Loss@GRAD"},
-                         {"soft_label",
-                          "use_softmax",
-                          "numeric_stable_mode",
-                          "ignore_index",
-                          "axis"},
-                         {"Logits@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy,
-                             cross_entropy_with_softmax);
-PD_REGISTER_BASE_KERNEL_NAME(softmax_with_cross_entropy_grad,
-                             cross_entropy_with_softmax_grad);
-
-PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy,
-                           phi::SoftmaxWithCrossEntropyOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(softmax_with_cross_entropy_grad,
-                           phi::SoftmaxWithCrossEntropyGradOpArgumentMapping);
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 874faafc92338c..85c96de4bfbc79 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -313,13 +313,6 @@ def fluid_softmax_with_cross_entropy(
         loss = helper.create_variable_for_type_inference(dtype=logits.dtype)

         outputs = {'Softmax': softmax, 'Loss': loss}
-        if core.is_compiled_with_custom_device(
-            "npu"
-        ) or core.is_compiled_with_custom_device("mlu"):
-            backprop = helper.create_variable_for_type_inference(
-                dtype=logits.dtype
-            )
-            outputs['Backprop'] = backprop
         helper.append_op(
             type='softmax_with_cross_entropy',
             inputs={'Logits': logits, 'Label': label},
@@ -2768,13 +2761,6 @@ def cross_entropy(
         out = helper.create_variable_for_type_inference(dtype=input.dtype)

         outputs = {'Softmax': softmax, 'Loss': out}
-        if core.is_compiled_with_custom_device(
-            "npu"
-        ) or core.is_compiled_with_custom_device("mlu"):
-            backprop = helper.create_variable_for_type_inference(
-                dtype=input.dtype
-            )
-            outputs['Backprop'] = backprop
         helper.append_op(
             type='softmax_with_cross_entropy',
             inputs={'Logits': input, 'Label': label},

From 3ee2b23713a139fcc5bce6d40b6b57e08621b0df Mon Sep 17 00:00:00 2001
From: cyberslack_lee
Date: Mon, 10 Apr 2023 14:36:14 +0800
Subject: [PATCH 019/156] 【Hackathon4 No58】fix exponential and pad (#51300)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 paddle/phi/kernels/cpu/exponential_kernel.cc  |   2 +-
 paddle/phi/kernels/exponential_kernel.h       |   2 +-
 paddle/phi/kernels/gpu/exponential_kernel.cu  |  17 ++-
 paddle/phi/kernels/gpu/pad_grad_kernel.cu     |   1 +
 paddle/phi/kernels/gpu/pad_kernel.cu          |   1 +
 paddle/phi/kernels/pad_grad_kernel.h          |   1 +
 paddle/phi/kernels/pad_kernel.h               |   1 +
 .../tests/unittests/test_exponential_op.py    | 105 +++++++++++++++++-
 .../fluid/tests/unittests/test_pad_op.py      |  39 ++++++-
 python/paddle/tensor/random.py                |   4 +-
 10 files changed, 162 insertions(+), 11 deletions(-)

diff --git a/paddle/phi/kernels/cpu/exponential_kernel.cc b/paddle/phi/kernels/cpu/exponential_kernel.cc
index a4a07fc7a65e8f..d2624edda3265e 100644
--- a/paddle/phi/kernels/cpu/exponential_kernel.cc
+++ b/paddle/phi/kernels/cpu/exponential_kernel.cc
@@ -17,10 +17,10 @@
 #include

 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include
"paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" - namespace phi { template diff --git a/paddle/phi/kernels/exponential_kernel.h b/paddle/phi/kernels/exponential_kernel.h index 736baacca4cc9b..3862468ead5b7a 100644 --- a/paddle/phi/kernels/exponential_kernel.h +++ b/paddle/phi/kernels/exponential_kernel.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" - +#include "paddle/phi/core/device_context.h" namespace phi { template diff --git a/paddle/phi/kernels/gpu/exponential_kernel.cu b/paddle/phi/kernels/gpu/exponential_kernel.cu index fc1730dde64a7f..7d6e1d54d1e37a 100644 --- a/paddle/phi/kernels/gpu/exponential_kernel.cu +++ b/paddle/phi/kernels/gpu/exponential_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/exponential_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" @@ -25,12 +25,19 @@ void ExponentialKernel(const Context &dev_ctx, const DenseTensor &x, float lambda, DenseTensor *out) { - phi::funcs::uniform_distribution dist; - phi::funcs::exponential_transform trans(lambda); + using MT = typename kps::details::MPTypeTrait::Type; + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); phi::funcs::distribution_and_transform(dev_ctx, out, dist, trans); } } // namespace phi -PD_REGISTER_KERNEL( - exponential, GPU, ALL_LAYOUT, phi::ExponentialKernel, float, double) {} +PD_REGISTER_KERNEL(exponential, + GPU, + ALL_LAYOUT, + phi::ExponentialKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pad_grad_kernel.cu b/paddle/phi/kernels/gpu/pad_grad_kernel.cu index a25472d122b837..04b94588baa590 100644 --- a/paddle/phi/kernels/gpu/pad_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad_grad_kernel.cu @@ -25,5 +25,6 @@ PD_REGISTER_KERNEL(pad_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/pad_kernel.cu b/paddle/phi/kernels/gpu/pad_kernel.cu index 90d81046b999df..e983e36be9b9d9 100644 --- a/paddle/phi/kernels/gpu/pad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad_kernel.cu @@ -28,5 +28,6 @@ PD_REGISTER_KERNEL(pad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/pad_grad_kernel.h b/paddle/phi/kernels/pad_grad_kernel.h index aef30f8f5e21dc..db982af8bbda25 100644 --- a/paddle/phi/kernels/pad_grad_kernel.h +++ b/paddle/phi/kernels/pad_grad_kernel.h @@ -17,6 +17,7 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/pad_kernel.h b/paddle/phi/kernels/pad_kernel.h index f3496edba0ff5f..a7479c8b9a24c0 100644 --- a/paddle/phi/kernels/pad_kernel.h +++ b/paddle/phi/kernels/pad_kernel.h @@ -17,6 +17,7 @@ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 99207d7ac1dd21..2974e2d4e513b0 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -15,9 +15,14 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle +from paddle.fluid import core class TestExponentialOp1(OpTest): @@ -344,6 +349,104 @@ def test_fixed_random_number(self): paddle.enable_static() +class TestExponentialFP16Op(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "exponential" + self.python_api = paddle.tensor.exponential_ + self.config() + self.attrs = {"lambda": self.lam} + self.inputs = {'X': np.empty([1024, 1024], dtype=self.dtype)} + self.outputs = {'Out': np.ones([1024, 1024], dtype=self.dtype)} + + def config(self): + self.lam = 0.5 + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + hist1, _ = np.histogram(outs[0], range=(0, 5)) + hist1 = hist1.astype(np.float16) + hist1 = hist1 / float(outs[0].size) + + data_np = np.random.exponential(1.0 / self.lam, [1024, 1024]) + hist2, _ = np.histogram(data_np, range=(0, 5)) + hist2 = hist2.astype(np.float16) + hist2 = hist2 / float(data_np.size) + + np.testing.assert_allclose(hist1, hist2, rtol=0.05) + + def test_check_grad_normal(self): + self.check_grad( + ['X'], + 'Out', + in_place=True, + user_defined_grads=[np.zeros([1024, 1024], dtype=self.dtype)], + user_defined_grad_outputs=[ + np.random.rand(1024, 1024).astype(self.dtype) + ], + check_dygraph=False, # inplace can not call paddle.grad + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestExponentialBP16Op(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "exponential" + self.python_api = paddle.tensor.exponential_ + self.config() + x = np.empty([1024, 1024]).astype('float32') + out = np.ones([1024, 1024]).astype('float32') + self.inputs = {'X': convert_float_to_uint16(x)} + self.attrs = {"lambda": self.lam} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def config(self): + self.lam = 0.5 + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place_customized( + checker=self.verify_output, place=place + ) + + def verify_output(self, outs): + outs = convert_uint16_to_float(outs) + self.assertEqual(outs[0].shape, (1024, 1024)) + hist1, _ = np.histogram(outs[0], range=(-3, 5)) + hist1 = hist1.astype("float32") + hist1 = hist1 / float(outs[0].size) + + data_np = np.random.exponential(1.0 / self.lam, [1024, 1024]) + hist2, _ = np.histogram(data_np, range=(-3, 5)) + hist2 = hist2.astype("float32") + hist2 = hist2 / float(data_np.size) + + np.testing.assert_allclose(hist1, hist2, rtol=0.05) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + in_place=True, + user_defined_grads=[np.zeros([1024, 1024], dtype=self.dtype)], + user_defined_grad_outputs=[ + np.random.rand(1024, 1024).astype(self.dtype) + ], + check_dygraph=False, # inplace can not call paddle.grad + ) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad_op.py b/python/paddle/fluid/tests/unittests/test_pad_op.py index bb569686783767..a40fd7a7409bc9 100644 --- a/python/paddle/fluid/tests/unittests/test_pad_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_pad_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from test_attribute_var import UnittestBase import paddle @@ -96,7 +96,7 @@ def get_dtype(self): return np.float16 def test_check_grad_normal(self): - self.check_grad(['X'], 'Out', max_relative_error=0.3) + self.check_grad(['X'], 'Out') cls_name = "{}_{}".format(parent.__name__, "Fp16") TestPadFp16.__name__ = cls_name @@ -202,6 +202,41 @@ def test_static(self): np.testing.assert_allclose(pd_out, np_out) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestPadBP16Op(OpTest): + def setUp(self): + self.initTestCase() + self.dtype = np.uint16 + self.op_type = "pad" + self.python_api = pad_wrapper + x = np.random.random(self.shape).astype(np.float32) + self.attrs = {} + self.attrs['paddings'] = np.array(self.paddings).flatten() + self.attrs['pad_value'] = self.pad_value + out = np.pad( + x, self.paddings, mode='constant', constant_values=self.pad_value + ) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def initTestCase(self): + self.shape = (16, 16) + self.paddings = [(0, 1), (2, 3)] + self.pad_value = 0.0 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 96297964940557..109dbcdfd97e89 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -1154,7 +1154,9 @@ def exponential_(x, lam=1.0, name=None): if in_dygraph_mode(): return _C_ops.exponential_(x, lam) else: - check_variable_and_dtype(x, "x", ["float32", "float64"], "exponential") + check_variable_and_dtype( + x, "x", ["float16", "float32", "float64", "uint16"], "exponential" + ) helper = LayerHelper("exponential", **locals()) helper.append_op( From 90c3bddfda207c9b5197a7db99cf98707fdb7a7c Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Mon, 10 Apr 2023 14:39:02 +0800 Subject: [PATCH 020/156] Autogen code bilinear_tensor_product (#52690) * add autogen code bilinear_tensor_product * [phi] rm cc file --- .../operators/bilinear_tensor_product_op.cc | 110 ------------------ paddle/phi/api/yaml/backward.yaml | 9 ++ paddle/phi/api/yaml/legacy_backward.yaml | 9 -- paddle/phi/api/yaml/legacy_ops.yaml | 10 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/ops.yaml | 10 ++ .../ops/compat/bilinear_tensor_product_sig.cc | 37 ------ 7 files changed, 25 insertions(+), 166 deletions(-) delete mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.cc delete mode 100644 paddle/phi/ops/compat/bilinear_tensor_product_sig.cc diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc deleted file mode 100644 index 00586c4e1e4ab6..00000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/backward.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class BilinearTensorProductOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The first input of bilinear_tensor_product operator."); - AddInput("Y", "The second input of bilinear_tensor_product operator."); - AddInput("Weight", - "The learnable parameters of bilinear_tensor_product operator."); - AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.") - .AsDispensable(); - AddOutput("Out", "The output of bilinear_tensor_product operator."); - AddComment(R"DOC( -Bilinear Tensor Product operator. -Given input X and Y, a 3D tensor Weight and a Bias. Each column of the -Output is computed by one slice $i = 1, . . . , k$ of the tensor: - -$$ -M = (X W_i) * Y \\ -Out_i = \sum_j {M_j} + Bias_i -$$ - -Where $W_i$ is the $i$-th slice of Input(Weight); - $M_j$ is the $j$-th column of $M$; - $Out_i$ is the $i$-th column of Output(Out); - $Bias_i$ is a column vector, each element of it is equal to - the $i$-th element of $Bias$; - -)DOC"); - } -}; - -class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class BilinearTensorProductGradOpMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("bilinear_tensor_product_grad"); - op->SetAttrMap(this->Attrs()); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - op->SetInput("Weight", this->Input("Weight")); - if (this->HasInput("Bias")) { - op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); - } - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - op->SetOutput(framework::GradVarName("Weight"), this->InputGrad("Weight")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, - BilinearTensorProductInferShapeFunctor, - PD_INFER_META(phi::BilinearInferMeta)); -DECLARE_INFER_SHAPE_FUNCTOR( - bilinear_tensor_product_grad, - BilinearTensorProductGradInferShapeFunctor, - PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); - -REGISTER_OPERATOR( - bilinear_tensor_product, - ops::BilinearTensorProductOp, - ops::BilinearTensorProductOpMaker, - ops::BilinearTensorProductGradOpMaker, - 
ops::BilinearTensorProductGradOpMaker, - BilinearTensorProductInferShapeFunctor); -REGISTER_OPERATOR(bilinear_tensor_product_grad, - ops::BilinearTensorProductOpGrad, - BilinearTensorProductGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 2b49b6950f68ac..7116b2be70dbf0 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -173,6 +173,15 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- backward_op : bilinear_tensor_product_grad + forward : bilinear_tensor_product (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) + infer_meta : + func : BilinearTensorProductGradInferMeta + kernel : + func : bilinear_grad + - backward_op : bmm_grad forward : bmm (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 25582edca0c937..6ba507312b303e 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -122,15 +122,6 @@ composite: batch_norm_grad(x, scale, bias, mean_out, variance_out, saved_mean, saved_variance, reserve_space, out_grad, momentum, epsilon, data_layout, is_test, use_global_stats, trainable_statistics) backward : batch_norm_double_grad -- backward_op : bilinear_tensor_product_grad - forward : bilinear_tensor_product (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) - infer_meta : - func : BilinearTensorProductGradInferMeta - kernel : - func : bilinear_grad - - backward_op : cast_grad forward : cast (Tensor x, DataType dtype) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 972f85070ba8f8..32966d54e09594 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -186,16 +186,6 @@ view : (mean -> mean_out), (variance -> variance_out) backward : batch_norm_grad -- op : bilinear_tensor_product - args : (Tensor x, Tensor y, Tensor weight, Tensor bias) - output : Tensor - infer_meta : - func : BilinearInferMeta - kernel : - func : bilinear - optional : bias - backward : bilinear_tensor_product_grad - - op : bincount args: (Tensor x, Tensor weights, Scalar(int) minlength = 0) output: Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e9a790e912871d..f905b04c92df7b 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -262,6 +262,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : bilinear_tensor_product + inputs : + {x : X, y : Y,weight: Weight, bias: Bias} + outputs : + {out : Out} + - op : bitwise_and inputs : {x : X, y : Y} diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 5d5eb8d9fc3fe3..e0598f15b58105 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -218,6 +218,16 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- op : bilinear_tensor_product + args : (Tensor x, Tensor y, Tensor weight, Tensor bias) + output : Tensor + infer_meta : + func : BilinearInferMeta + kernel : + func : bilinear 
+ optional : bias + backward : bilinear_tensor_product_grad + - op : bitwise_and args : (Tensor x, Tensor y) output : Tensor(out) diff --git a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc deleted file mode 100644 index 54509e4b2de459..00000000000000 --- a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature BilinearTensorProductOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("bilinear", {"X", "Y", "Weight", "Bias"}, {}, {"Out"}); -} - -KernelSignature BilinearTensorProductGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("bilinear_grad", - {"X", "Y", "Weight", "Out@GRAD"}, - {}, - {"X@GRAD", "Y@GRAD", "Weight@GRAD", "Bias@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product, - phi::BilinearTensorProductOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product_grad, - phi::BilinearTensorProductGradOpArgumentMapping); From 01247e3318f91a779da827178db3834d22ec5f96 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 10 Apr 2023 14:46:00 +0800 Subject: [PATCH 021/156] [Opt Performance] Optimize custom operator performance (#52597) * [Opt Performance] Optimize custom operator performance, reconstruct python API auto-gen, add cache and use const inference * opt AutoGradMeta implementation * remove profiler codes * fix unit test * change year, 2021->2023 * fix int64_t parse bug --- .../custom_operator/custom_operator_node.cc | 6 +- paddle/fluid/framework/custom_operator.cc | 84 +---- .../fluid/framework/custom_operator_utils.h | 105 ++++++ paddle/fluid/pybind/eager_functions.cc | 319 +++++++++++------- paddle/fluid/pybind/eager_utils.cc | 62 +++- paddle/fluid/pybind/eager_utils.h | 3 +- paddle/fluid/pybind/op_function_common.cc | 8 +- paddle/fluid/pybind/pybind.cc | 64 ---- paddle/phi/api/ext/op_meta_info.h | 15 +- paddle/phi/api/lib/op_meta_info.cc | 56 ++- .../utils/cpp_extension/extension_utils.py | 36 +- 11 files changed, 425 insertions(+), 333 deletions(-) create mode 100644 paddle/fluid/framework/custom_operator_utils.h diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 63dbf152b81156..6695cf5027f6de 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -236,7 +236,8 @@ RunCustomOpNode::operator()(paddle::small_vector, VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad"; // handle inplace map - ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); + ctx.UpdatePlainOutputs( + grad_inputs_name, grad_outputs_names, grad_inplace_map); 
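A minimal sketch of the three-step inplace protocol this call now follows (the variables `ctx`, `inputs`, `outputs`, `inplace_map`, and `kernel_fn` are stand-ins for illustration, not part of the patch):

// Hedged sketch of the CustomOpKernelContext inplace protocol:
// 1) build the {input slot -> output slot} inplace index and collect the
//    non-inplace ("plain") outputs that the user kernel is allowed to write,
ctx.UpdatePlainOutputs(inputs, outputs, inplace_map);
// 2) run the user kernel, which fills only the plain outputs,
kernel_fn(&ctx);
// 3) share the inplace inputs' tensors into their mapped outputs.
ctx.AssignInplaceOutputs();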
(*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx); ctx.AssignInplaceOutputs(); @@ -443,7 +444,8 @@ RunCustomOpDoubleGradNode::operator()( VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); // handle inplace map - ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); + ctx.UpdatePlainOutputs( + grad_inputs_name, grad_outputs_names, grad_inplace_map); (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx); ctx.AssignInplaceOutputs(); diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 641674695ca8c5..8435e825531eef 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/custom_operator_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/phi_utils.h" @@ -52,87 +53,6 @@ DECLARE_string(tensor_operants_mode); namespace paddle { namespace framework { -namespace detail { - -// dynamic lib load func -template -static T* DynLoad(void* handle, std::string name) { - T* func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error message(%s).", - errorno)); - return func; -} - -inline static bool IsDuplicableVar(const std::string& var_name) { - std::string suffix = kTensorVectorSuffix; - return var_name.rfind(suffix) != std::string::npos; -} - -inline static bool IsOptionalVar(const std::string& var_name) { - std::string suffix = kOptionalSuffix; - return var_name.rfind(suffix) != std::string::npos; -} - -inline static std::string NoGrad(const std::string& var_name, - bool is_double_grad = false) { - std::string suffix = kGradVarSuffix; - std::string new_out_suffix = kDoubleGradNewOutSuffix; - std::string tmp_var_name(var_name); - if (is_double_grad && - (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) { - tmp_var_name = tmp_var_name.substr( - 0, tmp_var_name.size() - /*kDoubleGradNewOutSuffix length*/ 4); - } - return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize); -} - -inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) { - std::string suffix = kGradVarSuffix; - if (!is_double_grad) { - return var_name.rfind(suffix) != std::string::npos; - } else { - // for double grad cases, the X@GRAD is not a grad var, X@GRAD@GRAD is a - // grad var, here we remove a @GRAD suffix - return NoGrad(var_name).rfind(suffix) != std::string::npos; - } -} - -inline static bool IsMemberOf(const std::vector& vec, - const std::string& name) { - return std::find(vec.cbegin(), vec.cend(), name) != vec.cend(); -} - -static std::vector ParseAttrStr(const std::string& attr) { - auto split_pos = attr.find_first_of(":"); - PADDLE_ENFORCE_NE(split_pos, - std::string::npos, - platform::errors::InvalidArgument( - "Invalid attribute string format. Attribute string " - "format is `:`.")); - - std::vector rlt; - // 1. name - rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos))); - // 2. 
type - rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1))); - - VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; - - return rlt; -} - -} // namespace detail - -////////////////// Kernel Define //////////////////// - // custom op kernel call function define static void RunKernelFunc( const framework::ExecutionContext& ctx, @@ -355,7 +275,7 @@ static void RunKernelFunc( } // handle inplace map - kernel_ctx.MapPlainOutputs(inputs, outputs, inplace_map); + kernel_ctx.UpdatePlainOutputs(inputs, outputs, inplace_map); func(&kernel_ctx); kernel_ctx.AssignInplaceOutputs(); diff --git a/paddle/fluid/framework/custom_operator_utils.h b/paddle/fluid/framework/custom_operator_utils.h new file mode 100644 index 00000000000000..678e0f5db3194d --- /dev/null +++ b/paddle/fluid/framework/custom_operator_utils.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <string> + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" + +namespace paddle { +namespace framework { + +namespace detail { + +// dynamic lib load func +template <typename T> +static T* DynLoad(void* handle, std::string name) { + T* func = reinterpret_cast<T*>(dlsym(handle, name.c_str())); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + PADDLE_ENFORCE_NOT_NULL( + func, + platform::errors::NotFound( + "Failed to load dynamic operator library, error message(%s).", + errorno)); + return func; +} + +inline static bool IsDuplicableVar(const std::string& var_name) { + std::string suffix = kTensorVectorSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + +inline static bool IsOptionalVar(const std::string& var_name) { + std::string suffix = kOptionalSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + +inline static std::string NoGrad(const std::string& var_name, + bool is_double_grad = false) { + std::string suffix = kGradVarSuffix; + std::string new_out_suffix = kDoubleGradNewOutSuffix; + std::string tmp_var_name(var_name); + if (is_double_grad && + (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) { + tmp_var_name = tmp_var_name.substr( + 0, tmp_var_name.size() - /*kDoubleGradNewOutSuffix length*/ 4); + } + return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize); +} + +inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) { + std::string suffix = kGradVarSuffix; + if (!is_double_grad) { + return var_name.rfind(suffix) != std::string::npos; + } else { + // for double grad cases, the X@GRAD is not a grad var, X@GRAD@GRAD is a + // grad var, here we remove a @GRAD suffix + return NoGrad(var_name).rfind(suffix) != std::string::npos; + } +} + +inline static bool IsMemberOf(const std::vector<std::string>& vec, + const std::string& name) { + return std::find(vec.cbegin(), vec.cend(), name) != vec.cend(); +} + +static std::vector<std::string> ParseAttrStr(const std::string& attr) { + auto split_pos = attr.find_first_of(":"); + PADDLE_ENFORCE_NE(split_pos, + std::string::npos, + platform::errors::InvalidArgument( + "Invalid attribute string format. Attribute string " + "format is `<name>:<type>`.")); + + std::vector<std::string> rlt; + // 1. name + rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos))); + // 2. type + rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1))); + + VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1]; + + return rlt; +} + +} // namespace detail +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 0d8b297bf40329..8df301520ec50c 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -33,6 +33,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/custom_operator_utils.h" #include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -43,6 +44,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -424,55 +426,6 @@ static void ConstructFwdAndBwdMap( } } -static std::vector<paddle::any> CastAttrsToTargetType( - const std::vector<paddle::any>& src, - const std::vector<std::string>& attrs_names) { - std::vector<paddle::any> res; - PADDLE_ENFORCE_EQ(src.size(), - attrs_names.size(), - paddle::platform::errors::InvalidArgument( - "We Expected same size of attrs and attrs_name list, " - "if u got this error indicate your custom op setting " - "%s attrs, but you just give %s", - attrs_names.size(), - src.size())); - for (size_t i = 0; i < src.size(); i++) { - size_t end = attrs_names[i].find(": "); - std::string type_name = attrs_names[i].substr(end + 2); - if (type_name == "int") { - if (src[i].type() == typeid(bool)) { - res.emplace_back(static_cast<int>(paddle::any_cast<bool>(src[i]))); - } else if (src[i].type() == typeid(int)) { - res.emplace_back(src[i]); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Your No. %s attrs should only can be bool or int32, other type is " - "forbidden for now but we got %s. Check your code first please", - i, - src[i].type().name())); - } - } else if (type_name == "int64_t") { - if (src[i].type() == typeid(bool)) { - res.emplace_back(static_cast<int64_t>(paddle::any_cast<bool>(src[i]))); - } else if (src[i].type() == typeid(int)) { - res.emplace_back(static_cast<int64_t>(paddle::any_cast<int>(src[i]))); - } else if (src[i].type() == typeid(int64_t)) { - res.emplace_back(src[i]); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Your No. %s attrs should only can be bool or int32 or int64_t, " - "other type is forbidden for now but we got %s. 
Check your code " - "first please", - i, - src[i].type().name())); - } - } else { - res.emplace_back(src[i]); - } - } - return res; -} - static PyObject* eager_api_jit_function_call(PyObject* self, PyObject* args, PyObject* kwargs) { @@ -534,6 +487,25 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx( EAGER_CATCH_AND_THROW_RETURN_NULL } +// This function is copied from `EmptyTensorInitializer` with default +// parameters +static Tensor InitializedEmptyTensor() { + auto ddims = phi::make_ddim({0}); + auto tensor = paddle::Tensor(); + tensor.set_name( + egr::Controller::Instance().GenerateUniqueName("generated_tensor")); + auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); + autograd_meta->SetPersistable(false); + std::shared_ptr<phi::DenseTensor> dense_tensor = nullptr; + std::shared_ptr<phi::Allocation> allocation_ptr = nullptr; + dense_tensor = std::make_shared<phi::DenseTensor>( + allocation_ptr, phi::DenseTensorMeta(phi::DataType::FLOAT32, ddims)); + tensor.set_impl(dense_tensor); + autograd_meta->SetGradNode( + std::make_shared<egr::GradNodeAccumulation>(autograd_meta)); + return tensor; +} + static PyObject* eager_api_run_custom_op(PyObject* self, PyObject* args, PyObject* kwargs) { @@ -545,14 +517,11 @@ static PyObject* eager_api_run_custom_op(PyObject* self, VLOG(4) << "Initialize phi tensor operants successfully"; } - paddle::CustomOpKernelContext ctx = - CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); - std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); - bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); + std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 0), 0); + VLOG(7) << "Get things from python for Custom Op: " << op_type; + paddle::CustomOpKernelContext ctx; { eager_gil_scoped_release guard; - VLOG(7) << "Get things for python for Custom Op: " << op_type - << ", trace_backward is: " << trace_backward; auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); PADDLE_ENFORCE_NE( meta_info_map.find(op_type), @@ -562,40 +531,138 @@ static PyObject* eager_api_run_custom_op(PyObject* self, "created by LoadOpMetaInfoAndRegisterOp, please make " "sure you registered your op first and try again. ", op_type)); - VLOG(7) << "Run Kernel of Custom Op: " << op_type; - // TODO(HongyuJia): Optimize Attrs Cast naming and implementation - std::vector<paddle::any> res_attrs = CastAttrsToTargetType( - ctx.Attrs(), - paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[0])); - ctx.EmplaceBackAttrs(res_attrs); const auto& vec_map = meta_info_map.at(op_type); - - const auto& inputs = - paddle::OpMetaInfoHelper::GetInputs(meta_info_map.at(op_type)[0]); - const auto& outputs = - paddle::OpMetaInfoHelper::GetOutputs(meta_info_map.at(op_type)[0]); + const auto& inputs = paddle::OpMetaInfoHelper::GetInputs(vec_map[0]); + const auto& attrs = paddle::OpMetaInfoHelper::GetAttrs(vec_map[0]); + const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(vec_map[0]); const auto& inplace_map = - paddle::OpMetaInfoHelper::GetInplaceMap(meta_info_map.at(op_type)[0]); + paddle::OpMetaInfoHelper::GetInplaceMap(vec_map[0]); + for (size_t i = 0; i < inputs.size(); ++i) { + const auto& input = inputs.at(i); + // Parse op_type first, so that use i + 1 + PyObject* obj = PyTuple_GET_ITEM(args, i + 1); + // Emplace Py_None from python, this means optional inputs passed to C++, + // use one un-initialized tensor to indicate both Tensor and + // vector<Tensor> inputs. + if (obj == Py_None) { + VLOG(7) << "Custom operator add input " << input + << " to CustomOpKernelContext. Add un-initialized tensor " "because the optional input is None"; + ctx.EmplaceBackInput(std::move(paddle::Tensor())); + continue; + } + if (paddle::framework::detail::IsDuplicableVar(input)) { + ctx.EmplaceBackInputs(std::move(CastPyArg2VectorOfTensor(obj, i + 1))); + VLOG(7) << "Custom operator add input " << input + << " to CustomOpKernelContext. Add vector size = " + << ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first; + } else { + ctx.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, i + 1))); + VLOG(7) << "Custom operator add input " << input + << " to CustomOpKernelContext. Add Tensor for general case."; + } + } + // Parse op_type and inputs first, so that use 1 + inputs.size() + i + int attr_start_idx = 1 + inputs.size(); + for (size_t i = 0; i < attrs.size(); ++i) { + const auto& attr = attrs.at(i); + std::vector<std::string> attr_name_and_type = + paddle::framework::detail::ParseAttrStr(attr); + auto attr_type_str = attr_name_and_type[1]; + VLOG(7) << "Custom operator add attrs " << attr_name_and_type[0] + << " to CustomOpKernelContext. Attribute type = " + << attr_type_str; + PyObject* obj = PyTuple_GET_ITEM(args, attr_start_idx + i); + if (attr_type_str == "bool") { + ctx.EmplaceBackAttr(CastPyArg2AttrBoolean(obj, attr_start_idx + i)); + } else if (attr_type_str == "int") { + ctx.EmplaceBackAttr(CastPyArg2AttrInt(obj, attr_start_idx + i)); + } else if (attr_type_str == "float") { + ctx.EmplaceBackAttr(CastPyArg2AttrFloat(obj, attr_start_idx + i)); + } else if (attr_type_str == "int64_t") { + ctx.EmplaceBackAttr(CastPyArg2Long(obj, op_type, attr_start_idx + i)); + } else if (attr_type_str == "std::string") { + ctx.EmplaceBackAttr(CastPyArg2AttrString(obj, attr_start_idx + i)); + } else if (attr_type_str == "std::vector<int>") { + ctx.EmplaceBackAttr(CastPyArg2VectorOfInt(obj, attr_start_idx + i)); + } else if (attr_type_str == "std::vector<float>") { + ctx.EmplaceBackAttr(CastPyArg2VectorOfFloat(obj, attr_start_idx + i)); + } else if (attr_type_str == "std::vector<int64_t>") { + ctx.EmplaceBackAttr(CastPyArg2Longs(obj, op_type, attr_start_idx + i)); + } else if (attr_type_str == "std::vector<std::string>") { + ctx.EmplaceBackAttr(CastPyArg2VectorOfString(obj, attr_start_idx + i)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. " + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector<int>`, " + "`std::vector<float>`, `std::vector<int64_t>`, " + "`std::vector<std::string>`, Please check whether " + "the attribute data type and data type string are matched.", + attr_type_str)); + } + } + ctx.ConstructInplaceIndex(inputs, outputs, inplace_map); + const auto& inplace_reverse_idx_map = ctx.GetInplaceReverseIndexMap(); + for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) { + const auto& output = outputs.at(out_idx); + // inplace special case + if (inplace_reverse_idx_map.find(out_idx) != + inplace_reverse_idx_map.end()) { + size_t in_idx = inplace_reverse_idx_map.at(out_idx); + const auto& input_range = ctx.InputRangeAt(in_idx); + const auto& input_tensor = ctx.InputAt(input_range.first); + // inplace optional [Tensor or std::vector<Tensor>], un-initialized tensor. + if (paddle::framework::detail::IsOptionalVar(output) && + !input_tensor.initialized()) { + VLOG(7) << "Custom operator add output " << output + << " to CustomOpKernelContext. Add un-initialized tensor " + "because the inplace optional input is None"; + ctx.EmplaceBackOutput(std::move(paddle::Tensor())); + continue; + } + // inplace vector<Tensor>, initialized tensor. 
+ if (paddle::framework::detail::IsDuplicableVar(output)) { + std::vector<paddle::Tensor> empty_tensors; + size_t vector_size = input_range.second - input_range.first; + empty_tensors.resize(vector_size); + for (size_t i = 0; i < vector_size; ++i) { + empty_tensors[i] = InitializedEmptyTensor(); + } + VLOG(7) << "Custom operator add output " << output + << " to CustomOpKernelContext. Add vector size = " + << empty_tensors.size(); + ctx.EmplaceBackOutputs(std::move(empty_tensors)); + continue; + } + } + VLOG(7) << "Custom operator add output " << output + << " to CustomOpKernelContext. Add initialized Tensor because " + "using general or inplace mechanism"; + // general Tensor or inplace Tensor, initialized tensor. + ctx.EmplaceBackOutput(std::move(InitializedEmptyTensor())); + } + // handle inplace map - ctx.MapPlainOutputs(inputs, outputs, inplace_map); + ctx.UpdatePlainOutputs(inputs, outputs, inplace_map); + VLOG(7) << "Run Kernel of Custom Op: " << op_type; (*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); ctx.AssignInplaceOutputs(); // handle optional None output when construct backward graph for (size_t i = 0; i < ctx.OutputRange().size(); i++) { if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) { - size_t idx = ctx.OutputRangeAt(i).first; - paddle::Tensor* out_tensor = ctx.MutableOutputAt(idx); + paddle::Tensor* out_tensor = + ctx.MutableOutputAt(ctx.OutputRangeAt(i).first); if (!out_tensor->initialized()) { PADDLE_ENFORCE( - outputs.at(idx).find(paddle::kOptionalSuffix) != - std::string::npos, + paddle::framework::detail::IsOptionalVar(outputs.at(i)), phi::errors::InvalidArgument( "Custom operator's %d-th output is not initialized. " "Please check your implementation again. If you are " "using inplace optional output, then you must use " "`paddle::Optional` to decorate this output", - idx)); + i)); // We can also consider using `autograd_meta` to tolerate nullptr. out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>()); } } VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; - std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas; - std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas; - VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); - ins_auto_grad_metas.resize(ctx.InputRange().size()); - VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); - outs_auto_grad_metas.resize(ctx.OutputRange().size()); - - for (size_t i = 0; i < ctx.InputRange().size(); i++) { - ins_auto_grad_metas[i] = - egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( - ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); - } - for (size_t i = 0; i < ctx.OutputRange().size(); i++) { - outs_auto_grad_metas[i] = - egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); - } + size_t slot_ins_num = ctx.InputRange().size(); + size_t slot_outs_num = ctx.OutputRange().size(); + VLOG(7) << "We got slot num of ins is: " << slot_ins_num; + VLOG(7) << "We got slot num of outs is: " << slot_outs_num; + std::vector<egr::AutogradMeta*> ins_auto_grad_metas = + egr::EagerUtils::nullable_autograd_meta(*ctx.AllMutableInput()); + std::vector<egr::AutogradMeta*> outs_auto_grad_metas = + egr::EagerUtils::unsafe_autograd_meta(*ctx.AllMutableOutput()); + bool require_any_grad = false; + bool trace_backward = true; + for (size_t i = 0; i < ins_auto_grad_metas.size(); ++i) { require_any_grad = require_any_grad || egr::EagerUtils::ComputeRequireGrad( - trace_backward, &(ins_auto_grad_metas[i])); + trace_backward, ins_auto_grad_metas[i]); } // handle inplace map - for (size_t i = 0; i < ctx.InputRange().size(); i++) { - if (inplace_map.find(inputs[i]) != inplace_map.end()) { - size_t input_size = - ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first; - size_t start_idx = ctx.InputRangeAt(i).first; - for (size_t j = 0; j < input_size; j++) { - egr::EagerUtils::CheckInplace(ctx.InputAt(start_idx + j), - ins_auto_grad_metas[i][j], - require_any_grad); - if (ctx.MutableInputAt(start_idx + j).defined()) { + if (!inplace_map.empty()) { + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + if (inplace_map.find(inputs[i]) == inplace_map.end()) { + continue; + } + const auto& input_pair = ctx.InputRangeAt(i); + for (size_t j = input_pair.first; j < input_pair.second; j++) { + egr::EagerUtils::CheckInplace( + ctx.InputAt(j), ins_auto_grad_metas[j], require_any_grad); + if (ctx.MutableInputAt(j).defined()) { // Bump Inplace Version - ctx.MutableInputAt(start_idx + j).bump_inplace_version(); - VLOG(3) << "Custom operator: Tensor(" - << ctx.InputAt(start_idx + j).name() + ctx.MutableInputAt(j).bump_inplace_version(); + VLOG(3) << "Custom operator: Tensor(" << ctx.InputAt(j).name() << ") uses Inplace Strategy."; } } @@ -651,45 +710,50 @@ static PyObject* eager_api_run_custom_op(PyObject* self, if (require_any_grad && (vec_map.size() > 1)) { VLOG(6) << " Construct Grad for Custom Op: " << op_type; ConstructFwdAndBwdMap(vec_map, op_type); - for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { - egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + for (size_t i = 0; i < outs_auto_grad_metas.size(); ++i) { + egr::EagerUtils::PassStopGradient(false, outs_auto_grad_metas[i]); } // Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf // nodes set stop_gradient=True. 
However, dygraph mode can also output // leaf nodes' gradients (For example, we can get x.grad after x.add_(y)). // To be consistent with dygraph mode, we have to PassStopGradient for all // inplaced ins_auto_grad_metas. - std::unordered_map<size_t, size_t> inplace_tensor_map = - ctx.GetInplaceTensorMap(); - for (auto pair : inplace_tensor_map) { - egr::EagerUtils::PassStopGradient(false, - &(ins_auto_grad_metas[pair.first])); + const auto& inplace_index_map = ctx.GetInplaceIndexMap(); + for (auto pair : inplace_index_map) { + const auto& size_pair = ctx.InputRangeAt(pair.first); + for (size_t i = size_pair.first; i < size_pair.second; ++i) { + egr::EagerUtils::PassStopGradient(false, ins_auto_grad_metas[i]); + } } auto grad_node = std::make_shared<egr::RunCustomOpNode>( - outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); + slot_outs_num, slot_ins_num, op_type); - auto slot_map = + const auto& slot_map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); + // Prepare Grad outputs size_t no_grad_cnt = 0; - for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + for (size_t i = 0; i < slot_ins_num; i++) { const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween( ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); if (slot_map[0][0].find(i) != slot_map[0][0].end()) { - grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); + grad_node->SetGradOutMeta(in_tensors, slot_map[0][0].at(i)); } else { - grad_node->SetGradOutMeta( - in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); + grad_node->SetGradOutMeta(in_tensors, slot_ins_num - 1 - no_grad_cnt); no_grad_cnt++; } } // Prepare Grad inputs with grad of fwd outputs - for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { - const std::vector<paddle::Tensor>& out_tensors = ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second); - - egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); - egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + for (size_t i = 0; i < slot_outs_num; i++) { + const auto& size_pair = ctx.OutputRangeAt(i); + const std::vector<paddle::Tensor>& out_tensors = + ctx.OutputsBetweeen(size_pair.first, size_pair.second); + for (size_t j = size_pair.first; j < size_pair.second; j++) { + // SetOutRankWithSlot: slot_id = i, rank = j - size_pair.first + outs_auto_grad_metas[j]->SetSingleOutRankWithSlot( + i, j - size_pair.first); + egr::EagerUtils::SetHistory(outs_auto_grad_metas[j], grad_node); + } grad_node->SetGradInMeta(out_tensors, i); } @@ -713,9 +777,8 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ctx.InputRangeAt(it->first).second)); } - auto attrs_names = - paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[1]); - std::vector<paddle::any> attrs(attrs_names.size()); + const std::vector<paddle::any>& res_attrs = ctx.Attrs(); + std::vector<paddle::any> attrs(res_attrs.size()); // Prepare attrs for Grad node for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { VLOG(7) << "Prepare fwd attrs: " << it->first @@ -725,7 +788,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self, grad_node->SetAttrs(attrs); } } - RETURN_PY_NONE + return ToPyObject(*ctx.AllMutableOutput()); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 1a177f59adba22..f2d1c396617b12 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -56,7 +56,6 @@ extern PyTypeObject* g_cudapinnedplace_pytype; extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* 
g_framework_lodtensorarray_pytype; -extern PyTypeObject* g_custom_op_kernel_ctx_pytype; extern PyTypeObject* g_jit_function_pytype; int TensorDtype2NumpyDtype(phi::DataType dtype) { @@ -432,6 +431,54 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) { return result; } +std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos) { + std::vector<float> result; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrConvertToFloat(&item)) { + result.emplace_back(static_cast<float>(PyFloat_AsDouble(item))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of float, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name, + i)); + } + } + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GET_ITEM(obj, i); + if (PyObject_CheckFloatOrConvertToFloat(&item)) { + result.emplace_back(static_cast<float>(PyFloat_AsDouble(item))); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of float, but got %s at pos %d", + arg_pos + 1, + reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name, + i)); + } + } + } else if (obj == Py_None) { + return {}; + } else if (PyObject_CheckFloatOrConvertToFloat(&obj)) { + return {static_cast<float>(PyFloat_AsDouble(obj))}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "argument (position %d) must be " + "list of float, but got %s", + arg_pos + 1, + reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name)); + } + return result; +} + std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos) { std::vector<std::vector<size_t>> result; @@ -602,19 +649,6 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj, } } -paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, - ssize_t arg_pos) { - if (PyObject_IsInstance( - obj, reinterpret_cast<PyObject*>(g_custom_op_kernel_ctx_pytype))) { - return ::pybind11::handle(obj).cast<paddle::CustomOpKernelContext>(); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "argument (position %d) must be CustomOpKernelContext, " - "but got %s", - arg_pos + 1, - reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name)); - } -} PyObject* ToPyObject(bool value) { if (value) { Py_INCREF(Py_True); diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index 555489c2ff242f..dcf71ec0819395 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -57,8 +57,6 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); size_t CastPyArg2AttrSize_t(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); -paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj, - ssize_t arg_pos); std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj, ssize_t arg_pos); std::vector<paddle::Tensor> CastPyArg2VectorOfTensor(PyObject* obj, @@ -70,6 +68,7 @@ std::vector<framework::Tensor> CastPyArg2VectorOfTensorBase(PyObject* obj, std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos); std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos); +std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos); std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos); 
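For reference, the new CastPyArg2VectorOfFloat normalizes every accepted Python value to a std::vector<float>; a hedged usage sketch (assumes an embedded CPython interpreter and the paddle::pybind namespace, purely for illustration):

// List and tuple elements are converted one by one; ints are coerced via
// PyObject_CheckFloatOrConvertToFloat:
PyObject* list_obj = Py_BuildValue("[dd]", 1.0, 2.0);          // Python [1.0, 2.0]
std::vector<float> v1 = CastPyArg2VectorOfFloat(list_obj, 0);  // {1.0f, 2.0f}
// None is treated as "attribute not passed" and yields an empty vector:
std::vector<float> v2 = CastPyArg2VectorOfFloat(Py_None, 0);   // {}
// A bare scalar is promoted to a one-element vector:
PyObject* scalar_obj = PyFloat_FromDouble(4.5);
std::vector<float> v3 = CastPyArg2VectorOfFloat(scalar_obj, 0); // {4.5f}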
framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 8a53863c798300..86dea0d145afea 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -464,7 +464,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PyList_GetItem(obj, i); if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); + value.emplace_back((int64_t)PyLong_AsLongLong(item)); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -481,7 +481,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PyTuple_GetItem(obj, i); if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); + value.emplace_back((int64_t)PyLong_AsLongLong(item)); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -498,7 +498,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PySequence_GetItem(obj, i); if (PyObject_CheckLongOrToLong(&item)) { - value.emplace_back(PyLong_AsLong(item)); + value.emplace_back((int64_t)PyLong_AsLongLong(item)); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -512,7 +512,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, } else if (obj == Py_None) { return {}; } else if (PyObject_CheckLongOrToLong(&obj)) { - return {static_cast<int64_t>(PyLong_AsLong(obj))}; + return {(int64_t)PyLong_AsLongLong(obj)}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1583102865fc69..65aa609e34fde1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1013,70 +1013,6 @@ PYBIND11_MODULE(libpaddle, m) { m.def("_promote_types_if_complex_exists", &paddle::framework::PromoteTypesIfComplexExists); - py::class_<paddle::CustomOpKernelContext> custom_op_kernel_ctx( - m, "CustomOpKernelContext", R"DOC()DOC"); - g_custom_op_kernel_ctx_pytype = - reinterpret_cast<PyTypeObject *>(custom_op_kernel_ctx.ptr()); - custom_op_kernel_ctx.def(py::init<>()) - .def("add_inputs", - [](paddle::CustomOpKernelContext &self, const py::handle &input) { - PyObject *obj = input.ptr(); - if (PyList_Check(obj) || PyTuple_Check(obj)) { - self.EmplaceBackInputs( - std::move(CastPyArg2VectorOfTensor(obj, 1))); - } else if (obj == Py_None) { - // Check optional Tensor, use one un-initialized tensor to - // indicate both Tensor and vector<Tensor> inputs - self.EmplaceBackInput(std::move(paddle::Tensor())); - } else { - self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1))); - } - }) - .def("add_outputs", - [](paddle::CustomOpKernelContext &self, py::handle &outputs) { - PyObject *obj = outputs.ptr(); - if (PyList_Check(obj) || PyTuple_Check(obj)) { - self.EmplaceBackOutputs( - std::move(CastPyArg2VectorOfTensor(obj, 1))); - } else { - self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1))); - } - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, bool attr) { - self.EmplaceBackAttr(attr); - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, int attr) { - self.EmplaceBackAttr(attr); - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, float attr) { - self.EmplaceBackAttr(attr); - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, int64_t attr) { - 
self.EmplaceBackAttr(attr); - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, const std::string &attr) { - self.EmplaceBackAttr(attr); - }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, - const std::vector<int> &attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, - const std::vector<float> &attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, - const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); }) - .def("add_attr", - [](paddle::CustomOpKernelContext &self, - const std::vector<std::string> &attr) { - self.EmplaceBackAttr(attr); - }); - py::class_<Variable>(m, "Variable", R"DOC(Variable Class. All parameter, weight, gradient are variables in Paddle. diff --git a/paddle/phi/api/ext/op_meta_info.h b/paddle/phi/api/ext/op_meta_info.h index 4a9a10a53aa906..07a47ed1df6327 100644 --- a/paddle/phi/api/ext/op_meta_info.h +++ b/paddle/phi/api/ext/op_meta_info.h @@ -119,6 +119,7 @@ class PADDLE_API CustomOpKernelContext { const Tensor& InputAt(size_t idx) const; std::vector<Tensor> InputsBetween(size_t start, size_t end) const; Tensor& MutableInputAt(size_t idx); + std::vector<Tensor>* AllMutableInput(); paddle::optional<Tensor> OptionalInputAt(size_t idx); paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start, size_t end); @@ -144,13 +145,18 @@ } // handle inplace map - void MapPlainOutputs( + void ConstructInplaceIndex( + const std::vector<std::string>& inputs, + const std::vector<std::string>& outputs, + const std::unordered_map<std::string, std::string>& inplace_map); + void UpdatePlainOutputs( const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const std::unordered_map<std::string, std::string>& inplace_map); void AssignInplaceOutputs(); std::vector<Tensor*>* AllMutablePlainOutput(); - std::unordered_map<size_t, size_t> GetInplaceTensorMap(); + std::unordered_map<size_t, size_t> GetInplaceIndexMap(); + std::unordered_map<size_t, size_t> GetInplaceReverseIndexMap(); private: // TODO(chenweihang): replaced be SmallVector std::vector<Tensor> inputs_; std::vector<paddle::any> attrs_; // handle inplace map std::vector<Tensor*> plain_outputs_; - std::unordered_map<size_t, size_t> inplace_tensor_map_; + // {input: output} + std::unordered_map<size_t, size_t> inplace_idx_map_; + // {output: input} + std::unordered_map<size_t, size_t> inplace_reverse_idx_map_; std::vector<std::pair<size_t, size_t>> input_range_; std::vector<std::pair<size_t, size_t>> output_range_; diff --git a/paddle/phi/api/lib/op_meta_info.cc b/paddle/phi/api/lib/op_meta_info.cc index bdc46a4e0e7cad..0af2c96521c9f1 100644 --- a/paddle/phi/api/lib/op_meta_info.cc +++ b/paddle/phi/api/lib/op_meta_info.cc @@ -103,6 +103,10 @@ Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) { return inputs_.at(idx); } +std::vector<Tensor>* CustomOpKernelContext::AllMutableInput() { + return &inputs_; +} + paddle::optional<Tensor> CustomOpKernelContext::OptionalInputAt(size_t idx) { if (!inputs_.at(idx).is_initialized()) { return paddle::none; @@ -156,13 +160,15 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt( return output_range_.at(idx); } -// handle inplace mechanism -// Find out non-inplace output tensors. -// TODO(HongyuJia): Add cache for inplace_tensor_map_ to optimize performance -void CustomOpKernelContext::MapPlainOutputs( +void CustomOpKernelContext::ConstructInplaceIndex( const std::vector<std::string>& inputs, const std::vector<std::string>& outputs, const std::unordered_map<std::string, std::string>& inplace_map) { + // Cache inplace indices. 
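A worked example of what this cache holds (a hedged illustration; the op registration below is assumed for demonstration and is not taken from the patch):

// For a custom op registered with Inputs({"X", "Y"}), Outputs({"Out"}) and
// SetInplaceMap({{"X", "Out"}}), the first call fills
//   inplace_idx_map_         = {0 -> 0}  // input slot 0 overwrites output slot 0
//   inplace_reverse_idx_map_ = {0 -> 0}  // output slot 0 aliases input slot 0
// and every later call takes the early return below, so the scan over the
// string names is paid once per op instead of once per forward run.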
+  if (inplace_map.empty() || !inplace_idx_map_.empty()) {
+    VLOG(4) << "Custom operator ConstructInplaceIndex no need to recompute.";
+    return;
+  }
   for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) {
     auto& input = inputs[in_idx];
     if (inplace_map.find(input) == inplace_map.end()) {
@@ -175,15 +181,26 @@ void CustomOpKernelContext::MapPlainOutputs(
             "the input of `Inplace` again and make "
             "sure you registered your op accurately. ",
             input));
-    inplace_tensor_map_[in_idx] = distance(outputs.begin(), out_iter);
+    size_t out_idx = distance(outputs.begin(), out_iter);
+    inplace_idx_map_[in_idx] = out_idx;
+    inplace_reverse_idx_map_[out_idx] = in_idx;
+  }
+  VLOG(4) << "Custom operator update inplace input-output map successfully.";
+}
+
+// Find out non-inplace output tensors.
+void CustomOpKernelContext::UpdatePlainOutputs(
+    const std::vector<std::string>& inputs,
+    const std::vector<std::string>& outputs,
+    const std::unordered_map<std::string, std::string>& inplace_map) {
+  // Cache plain outputs vector.
+  if (!plain_outputs_.empty()) {
+    VLOG(4) << "Custom operator UpdatePlainOutputs no need to recompute.";
+    return;
   }
+  ConstructInplaceIndex(inputs, outputs, inplace_map);
   for (size_t i = 0; i < outputs.size(); ++i) {
-    if (std::any_of(
-            inplace_tensor_map_.begin(),
-            inplace_tensor_map_.end(),
-            [i](std::unordered_map<size_t, size_t>::const_reference pair) {
-              return pair.second == i;
-            })) {
+    if (inplace_reverse_idx_map_.find(i) != inplace_reverse_idx_map_.end()) {
       continue;
     }
     size_t output_start_idx = output_range_[i].first;
@@ -192,11 +209,12 @@ void CustomOpKernelContext::MapPlainOutputs(
       plain_outputs_.push_back(&outputs_[idx]);
     }
   }
-  VLOG(4) << "Custom operator update inplace input-output map successfully.";
+  VLOG(4) << "Custom operator update plain outputs map successfully.";
 }
+
 // Assign input tensor to inplace output tensors.
 void CustomOpKernelContext::AssignInplaceOutputs() {
-  for (auto pair : inplace_tensor_map_) {
+  for (auto pair : inplace_idx_map_) {
     size_t in_start_idx = input_range_[pair.first].first;
     size_t in_end_idx = input_range_[pair.first].second;
     size_t out_start_idx = output_range_[pair.second].first;
@@ -213,15 +231,21 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
   }
   VLOG(4) << "Custom operator update inplace input-output tensor "
             "successfully.
Update map size = " - << inplace_tensor_map_.size(); + << inplace_idx_map_.size(); } } + std::vector* CustomOpKernelContext::AllMutablePlainOutput() { return &plain_outputs_; } + +std::unordered_map CustomOpKernelContext::GetInplaceIndexMap() { + return inplace_idx_map_; +} + std::unordered_map -CustomOpKernelContext::GetInplaceTensorMap() { - return inplace_tensor_map_; +CustomOpKernelContext::GetInplaceReverseIndexMap() { + return inplace_reverse_idx_map_; } ////////////////////// Op Meta Info ////////////////////// diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 8958c6bc7ac1f5..e78cc85f73ca0a 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -1042,7 +1042,9 @@ def _gen_output_content( # ' ' * tab space * tab number indent = ' ' * 4 * 2 inplace_idx = {v: k for k, v in inplace_reverse_idx.items()} - dynamic_content = "" + dynamic_content = f""" +{indent}res = [] +{indent}start_idx = 0""" static_content = f""" {indent}ins = {{}} {indent}ins_map = {ins_map} @@ -1065,10 +1067,11 @@ def _gen_output_content( lower_in_names = in_names[in_idx].split("@")[0].lower() dynamic_content += f""" {indent}if {lower_in_names} is not None: -{indent} outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] +{indent} res.append(outs[start_idx: start_idx + len({lower_in_names})]) +{indent} start_idx += len({lower_in_names}) {indent}else: -{indent} outs['{out_name}'] = core.eager.Tensor() -{indent}ctx.add_outputs(outs['{out_name}'])""" +{indent} res.append(None) +{indent} start_idx += 1""" static_content += f""" {indent}if {lower_in_names} is not None: {indent} outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" @@ -1077,8 +1080,8 @@ def _gen_output_content( ): # inplace vector output case lower_in_names = in_names[in_idx].split("@")[0].lower() dynamic_content += f""" -{indent}outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] -{indent}ctx.add_outputs(outs['{out_name}'])""" +{indent}res.append(outs[start_idx: start_idx + len({lower_in_names})]) +{indent}start_idx += len({lower_in_names})""" static_content += f""" {indent}outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" elif ( @@ -1086,21 +1089,22 @@ def _gen_output_content( ): # inplace optional Tensor output case, handle inplace None input lower_in_names = in_names[in_idx].split("@")[0].lower() dynamic_content += f""" -{indent}outs['{out_name}'] = core.eager.Tensor() -{indent}ctx.add_outputs(outs['{out_name}'])""" +{indent}if {lower_in_names} is not None: +{indent} res.append(outs[start_idx]) +{indent}else: +{indent} res.append(None) +{indent}start_idx += 1""" static_content += f""" {indent}if {lower_in_names} is not None: {indent} outs['{out_name}'] = helper.create_variable(dtype='float32')""" else: # general/inplace Tensor output case dynamic_content += f""" -{indent}outs['{out_name}'] = core.eager.Tensor() -{indent}ctx.add_outputs(outs['{out_name}'])""" +{indent}res.append(outs[start_idx]) +{indent}start_idx += 1""" static_content += f""" {indent}outs['{out_name}'] = helper.create_variable(dtype='float32')""" dynamic_content += f""" -{indent}core.eager._run_custom_op(ctx, "{op_name}", True) -{indent}res = [outs[out_name] if isinstance(outs[out_name], list) or outs[out_name]._is_initialized() else None for out_name in outs_list] {indent}return 
res[0] if len(res)==1 else res""" static_content += f""" @@ -1134,7 +1138,7 @@ def _custom_api_content(op_name): API_TEMPLATE = textwrap.dedent( """ import paddle.fluid.core as core - from paddle.fluid.core import Tensor, CustomOpKernelContext + from paddle.fluid.core import Tensor from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode from paddle.fluid.layer_helper import LayerHelper @@ -1146,11 +1150,7 @@ def {op_name}({params_list}): # The output variable's dtype use default value 'float32', # and the actual dtype of output variable will be inferred in runtime. if in_dygraph_mode(): - ctx = CustomOpKernelContext() - for i in {in_names}: - ctx.add_inputs(i) - for j in {attr_names}: - ctx.add_attr(j) + outs = core.eager._run_custom_op("{op_name}", {params_list}) {dynamic_content} else: {static_content} From 6934ac797f6ae6d3c83529af2f510ac194452d66 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Mon, 10 Apr 2023 15:23:52 +0800 Subject: [PATCH 022/156] [Paddle Inference] Support two inputs of multihead attention named qk_multihead. (#52455) * Support two inputs of multihead attention named qk_multihead --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/trt_qk_multihead_matmul_fuse_pass.cc | 591 ++++++++++++++++++ .../ir/trt_qk_multihead_matmul_fuse_pass.h | 104 +++ .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/api/paddle_pass_builder.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../convert/qk_multihead_matmul_op.cc | 301 +++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 2 + .../test_trt_convert_qk_multihead_matmul.py | 385 ++++++++++++ 9 files changed, 1387 insertions(+) create mode 100644 paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.h create mode 100644 paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_qk_multihead_matmul.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index fbec6488568f06..91c3ba6d608b4c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -134,6 +134,7 @@ if(WITH_TENSORRT) pass_library(trt_multihead_matmul_fuse_pass inference) pass_library(trt_flash_multihead_matmul_fuse_pass inference) pass_library(trt_cross_multihead_matmul_fuse_pass inference) + pass_library(trt_qk_multihead_matmul_fuse_pass inference) pass_library(trt_skip_layernorm_fuse_pass inference) pass_library(merge_layernorm_fuse_pass inference) pass_library(preln_skip_layernorm_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc new file mode 100644 index 00000000000000..df1476e9db3454 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.cc @@ -0,0 +1,591 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.h" + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" +#ifdef PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/helper.h" +#endif +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// input_qk input_v +// |q |k v +// |------| | +// matmul matmul matmul +// | | | +// reshape reshape reshape +// | | | +// trans trans trans +// |(x) |(x) | +// matmul | +// | | +// scale | +// | | +// softmax |(y) +// |------matmul +// | +// trans +// | +// reshape +// | +// output +// +// -> fused to +// +// input_qk intput_v +// | +// qk_multihead_matmul +// | +// output + +PDNode* TrtQKMultiHeadMatmulPattern::operator()() { + std::unordered_set mul_ops{"mul", "matmul_v2"}; + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + auto* input1 = pattern->NewNode(input1_repr()); + + input0->assert_is_ops_input(mul_ops); + input1->assert_is_ops_input(mul_ops); + VLOG(5) << "Start match TrtQKMultiHeadMatmulPattern"; + + // First path + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(mul_ops); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_ops_input(mul_ops, "Y"); + auto* mul0_out_var = pattern->NewNode(mul0_out_repr()) + ->assert_is_ops_output(mul_ops) + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* elementwise0 = + pattern->NewNode(elementwise0_repr())->assert_is_op("elementwise_add"); + auto* elementwise0_w = pattern->NewNode(elementwise0_w_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* elementwise0_out = pattern->NewNode(elementwise0_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->assert_is_op_input("transpose2") + ->AsIntermediate(); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->assert_is_ops_input(matmul_ops, "X") + ->AsIntermediate(); + + auto* matmul_qk = + pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); + auto* matmul_qk_out_var = pattern->NewNode(matmul_qk_out_repr()) + ->assert_is_ops_output(matmul_ops) + ->assert_is_op_input("scale") + ->AsIntermediate(); + + auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); + auto* scale_out_var = pattern->NewNode(scale_out_repr()) + ->assert_is_op_output("scale") + ->assert_is_op_input("softmax") + ->AsIntermediate(); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + 
->assert_is_op_output("softmax") + ->assert_is_ops_input(matmul_ops) + ->AsIntermediate(); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = pattern->NewNode(matmul_qkv_out_repr()) + ->assert_is_ops_output(matmul_ops) + ->assert_is_op_input("transpose2") + ->AsIntermediate(); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("reshape2") + ->AsIntermediate(); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsOutput(); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(mul_ops); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_ops_input(mul_ops, "Y"); + auto* mul1_out_var = pattern->NewNode(mul1_out_repr()) + ->assert_is_ops_output(mul_ops) + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* elementwise1 = + pattern->NewNode(elementwise1_repr())->assert_is_op("elementwise_add"); + auto* elementwise1_w = pattern->NewNode(elementwise1_w_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* elementwise1_out = pattern->NewNode(elementwise1_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = pattern->NewNode(reshape2_1_out_repr()) + ->assert_is_op_output("reshape2") + ->assert_is_op_input("transpose2") + ->AsIntermediate(); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2") + ->assert_is_ops_input(matmul_ops, "Y") + ->AsIntermediate(); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(mul_ops); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_ops_input(mul_ops, "Y"); + auto* mul2_out_var = pattern->NewNode(mul2_out_repr()) + ->assert_is_ops_output(mul_ops) + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* elementwise2 = + pattern->NewNode(elementwise2_repr())->assert_is_op("elementwise_add"); + auto* elementwise2_w = pattern->NewNode(elementwise2_w_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* elementwise2_out = pattern->NewNode(elementwise2_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = pattern->NewNode(reshape2_2_out_repr()) + ->assert_is_op_output("reshape2") + ->assert_is_op_input("transpose2") + ->AsIntermediate(); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2") + ->assert_is_ops_input(matmul_ops) + ->AsIntermediate(); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + 
elementwise0->LinksFrom({mul0_out_var, elementwise0_w})
+      .LinksTo({elementwise0_out});
+
+  reshape2_0->LinksFrom({elementwise0_out}).LinksTo({reshape2_0_out_var});
+  transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var});
+  // K path
+  mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var});
+  elementwise1->LinksFrom({mul1_out_var, elementwise1_w})
+      .LinksTo({elementwise1_out});
+
+  reshape2_1->LinksFrom({elementwise1_out}).LinksTo({reshape2_1_out_var});
+  transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var});
+  // compute q*k
+  matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var})
+      .LinksTo({matmul_qk_out_var});
+  scale->LinksFrom({matmul_qk_out_var}).LinksTo({scale_out_var});
+  softmax_qk->LinksFrom({scale_out_var}).LinksTo({softmax_qk_out_var});
+  // V path
+  mul2->LinksFrom({input1, mul2_w_var}).LinksTo({mul2_out_var});
+  elementwise2->LinksFrom({mul2_out_var, elementwise2_w})
+      .LinksTo({elementwise2_out});
+
+  reshape2_2->LinksFrom({elementwise2_out}).LinksTo({reshape2_2_out_var});
+  transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var});
+  // compute q*k*v
+  matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var})
+      .LinksTo({matmul_qkv_out_var});
+  transpose2_qkv->LinksFrom({matmul_qkv_out_var})
+      .LinksTo({transpose2_qkv_out_var});
+  reshape2_qkv->LinksFrom({transpose2_qkv_out_var})
+      .LinksTo({reshape2_qkv_out_var});
+
+  return reshape2_qkv_out_var;
+}
+
+}  // namespace patterns
+
+int TrtQkMultiHeadMatmulFusePass::BuildQkFusion(Graph* graph,
+                                                const std::string& name_scope,
+                                                Scope* scope) const {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Create pattern.
+  patterns::TrtQKMultiHeadMatmulPattern multihead_pattern(pattern, name_scope);
+
+  multihead_pattern();
+  auto fuse_creater = [&](Node* input0,
+                          Node* input1,
+                          Node* mul0,
+                          Node* mul1,
+                          Node* mul2,
+                          Node* mul0_out,
+                          Node* mul1_out,
+                          Node* mul2_out,
+                          Node* mul0_w,
+                          Node* mul1_w,
+                          Node* mul2_w,
+                          Node* elementwise0,
+                          Node* elementwise0_w,
+                          Node* elementwise1,
+                          Node* elementwise1_w,
+                          Node* elementwise2,
+                          Node* elementwise2_w,
+                          Node* reshape2,
+                          Node* reshape2_qkv_out,
+                          Node* scale,
+                          Node* scale_out) {
+    // get Device context
+    auto* dev_ctx = static_cast<phi::CPUContext*>(
+        platform::DeviceContextPool::Instance().Get(platform::CPUPlace()));
+
+    auto scale_attr = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale"));
+
+    // create multihead
+    OpDesc multihead_op_desc(mul0->Op()->Block());
+    auto reshape_desc = reshape2->Op();
+    int head_number =
+        PADDLE_GET_CONST(std::vector<int>, reshape_desc->GetAttr("shape"))
+            .at(2);
+    multihead_op_desc.SetType("qk_multihead_matmul");
+    multihead_op_desc.SetInput("Input_qk", {input0->Name()});
+    multihead_op_desc.SetInput("Input_v", {input1->Name()});
+
+    auto* wq_tensor =
+        scope->FindVar(mul0_w->Name())->GetMutable<phi::DenseTensor>();
+    auto* wk_tensor =
+        scope->FindVar(mul1_w->Name())->GetMutable<phi::DenseTensor>();
+    auto* bq_tensor =
+        scope->FindVar(elementwise0_w->Name())->GetMutable<phi::DenseTensor>();
+    auto* bk_tensor =
+        scope->FindVar(elementwise1_w->Name())->GetMutable<phi::DenseTensor>();
+
+    int hidden_out = wq_tensor->dims()[1];
+    int head_size = hidden_out / head_number;
+    if (abs(scale_attr - 1.0f / sqrt(static_cast<float>(head_size))) > 1e-5) {
+      VLOG(3) << "scale of multihead matmul does not fit the requirement of "
+                 "qk attention plugin. Stop fusing.";
+      return;
+    }
+    VLOG(3) << "trt qk attention get wq_tensor name = " << mul0_w->Name()
+            << "trt qk attention get wk_tensor name = " << mul1_w->Name();
+
+    auto*
wq_data = wq_tensor->data(); + auto* wk_data = wk_tensor->data(); + auto* bq_data = bq_tensor->data(); + auto* bk_data = bk_tensor->data(); + + // combined_w_dims = [in,2,out] + auto combined_w_qk_dims = + phi::make_ddim({wq_tensor->dims()[0], 2, wq_tensor->dims()[1]}); + auto combined_bias_dims = phi::make_ddim({2, bq_tensor->dims()[0]}); + + VLOG(3) << "trt qk attention trt wq_dim in:" << wq_tensor->dims()[0] + << "trt qk attention trt wk_dim out:" << wq_tensor->dims()[1]; + auto* combined_w_qk_desc = mul0_w->Var(); + combined_w_qk_desc->SetShape( + {wq_tensor->dims()[0], 2, wq_tensor->dims()[1]}); + combined_w_qk_desc->SetPersistable(true); + phi::DenseTensor tmp_combined_w_qk_tensor; + tmp_combined_w_qk_tensor.Resize(combined_w_qk_dims); + float* tmp_combined_w_qk_data = + dev_ctx->template HostAlloc(&tmp_combined_w_qk_tensor); + + std::vector w_vec = {wq_data, wk_data}; + int dims_h = combined_w_qk_dims[0], dims_w = combined_w_qk_dims[2]; + // dims_h=in_feature, dims_w=out_feature + // Combine the two fc weights together. + // weight [Hidden_in * 2 * N * H] + for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (2 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_qk_data[out_index] = w_vec[j][in_index]; + } + } + } + wq_tensor->clear(); + wq_tensor->Resize(combined_w_qk_dims); + auto* new_combined_w_qk_data = dev_ctx->template HostAlloc( + wq_tensor, sizeof(float) * wq_tensor->numel()); + memcpy(new_combined_w_qk_data, + tmp_combined_w_qk_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name()}); + auto* combined_bias_desc = elementwise0_w->Var(); + combined_bias_desc->SetShape({2, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + phi::DenseTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + float* tmp_combined_bias_data = + dev_ctx->template HostAlloc(&tmp_combined_bias_tensor); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy( + tmp_combined_bias_data + bias_size, bk_data, sizeof(float) * bias_size); + + bq_tensor->clear(); + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = dev_ctx->template HostAlloc( + bq_tensor, sizeof(float) * bq_tensor->numel()); + + memcpy(new_combined_bias_data, + tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({elementwise1_w->Name()}); + + multihead_op_desc.SetInput("W_qk", {mul0_w->Name()}); + multihead_op_desc.SetInput("W_v", {mul2_w->Name()}); + multihead_op_desc.SetInput("B_qk", {elementwise0_w->Name()}); + multihead_op_desc.SetInput("B_v", {elementwise2_w->Name()}); + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(input1, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(mul2_w, multihead); + IR_NODE_LINK_TO(elementwise0_w, multihead); + IR_NODE_LINK_TO(elementwise2_w, multihead); + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + 
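// GET_IR_NODE_FROM_SUBGRAPH binds each pattern node declared above to the
+    // concrete graph node matched for this subgraph instance.
+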
GET_IR_NODE_FROM_SUBGRAPH(input1, input1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise0, elementwise0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise0_w, elementwise0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise0_out, elementwise0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0_out, reshape2_0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0_out, transpose2_0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(scale, scale, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise1, elementwise1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise1_w, elementwise1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise1_out, elementwise1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_1_out, reshape2_1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_1_out, transpose2_1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise2, elementwise2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise2_w, elementwise2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + elementwise2_out, elementwise2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_2_out, reshape2_2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_2_out, transpose2_2_out, multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk_out, softmax_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv_out, matmul_qkv_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv_out, reshape2_qkv_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_qkv, transpose2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); + + fuse_creater(input0, + input1, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul0_w, + mul1_w, + mul2_w, + elementwise0, + elementwise0_w, + elementwise1, + elementwise1_w, + elementwise2, + elementwise2_w, + reshape2_0, + 
reshape2_qkv_out, + scale, + scale_out); + + std::unordered_set marked_nodes({reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + elementwise0, + elementwise0_out, + elementwise1, + elementwise1_w, + elementwise1_out, + elementwise2, + elementwise2_out, + reshape2_qkv, + scale}); + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void TrtQkMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); +#ifdef PADDLE_WITH_TENSORRT + auto trt_version = paddle::inference::tensorrt::GetTrtRuntimeVersion(); + if (std::get<0>(trt_version) * 1000 + std::get<1>(trt_version) * 100 + + std::get<2>(trt_version) * 10 < + 8520) { + VLOG(3) << "Qk attention oss plugin only available for trt version >= " + "8.5.2.2. Stop this pass"; + return; + } +#else + // if no tensorrt, early stop + return; +#endif + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!with_dynamic_shape) { + VLOG(3) << "Qk attention oss plugin need trt " + "with_dynamic_shape. Stop this pass"; + return; + } + auto* scope = param_scope(); + int fusion_count = BuildQkFusion(graph, name_scope_, scope); + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trt_qk_multihead_matmul_fuse_pass, + paddle::framework::ir::TrtQkMultiHeadMatmulFusePass); +REGISTER_PASS_CAPABILITY(trt_qk_multihead_matmul_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .EQ("softmax", 0) + .EQ("matmul_v2", 0)); diff --git a/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.h new file mode 100644 index 00000000000000..abc0d63e1403a3 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_qk_multihead_matmul_fuse_pass.h @@ -0,0 +1,104 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct TrtQKMultiHeadMatmulPattern : public PatternBase { + TrtQKMultiHeadMatmulPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "qk_multihead_matmul") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(input1); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + + PATTERN_DECL_NODE(elementwise0); + PATTERN_DECL_NODE(elementwise1); + PATTERN_DECL_NODE(elementwise2); + + PATTERN_DECL_NODE(elementwise0_w); + PATTERN_DECL_NODE(elementwise1_w); + PATTERN_DECL_NODE(elementwise2_w); + + PATTERN_DECL_NODE(elementwise0_out); + PATTERN_DECL_NODE(elementwise1_out); + PATTERN_DECL_NODE(elementwise2_out); + + PATTERN_DECL_NODE(scale); + PATTERN_DECL_NODE(scale_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + +} // namespace patterns + +class TrtQkMultiHeadMatmulFusePass : public FusePassBase { + public: + virtual ~TrtQkMultiHeadMatmulFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"trt_qk_multihead_matmul_fuse"}; + + private: + int BuildQkFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b4c35e82c6e457..b07c47b81eff45 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2569,6 +2569,7 @@ USE_TRT_CONVERTER(preln_groupnorm_act) #if IS_TRT_VERSION_GE(8522) USE_TRT_CONVERTER(flash_multihead_matmul) USE_TRT_CONVERTER(cross_multihead_matmul) +USE_TRT_CONVERTER(qk_multihead_matmul) #endif #if IS_TRT_VERSION_GE(8510) USE_TRT_CONVERTER(grid_sampler) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 8b1399515eda09..3cc8b077ad7e63 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -108,6 +108,7 @@ const std::vector kTRTSubgraphPasses({ "trt_flash_multihead_matmul_fuse_pass", // "trt_cross_multihead_matmul_fuse_pass", // "vit_attention_fuse_pass", // + "trt_qk_multihead_matmul_fuse_pass", // "layernorm_shift_partition_fuse_pass", // "merge_layernorm_fuse_pass", // #if !defined _WIN32 diff --git 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index a47267ac3a562c..cbe26a3d31e4d9 100755
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -28,6 +28,7 @@ list(
   multihead_matmul_roformer_op.cc
   flash_multihead_matmul_op.cc
   cross_multihead_matmul_op.cc
+  qk_multihead_matmul_op.cc
   grid_sampler_op.cc
   shuffle_channel_op.cc
   fill_any_like_op.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc
new file mode 100644
index 00000000000000..89b65e95bd8eb0
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/qk_multihead_matmul_op.cc
@@ -0,0 +1,301 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
+the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class QkMultiheadMatMulOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a qk_multihead_matmul op to a corresponding tensorrt "
+               "network structure";
+
+    framework::OpDesc op_desc(op, nullptr);
+    auto* input_qk = engine_->GetITensor(op_desc.Input("Input_qk").front());
+    auto* input_v = engine_->GetITensor(op_desc.Input("Input_v").front());
+
+    auto output_name = op_desc.Output("Out")[0];
+
+    /* ------------------ weight_qk -------------------------*/
+    auto weight_qk_name = op_desc.Input("W_qk").front();
+    auto* weight_qk_v = scope.FindVar(weight_qk_name);
+    auto* weight_qk_t = weight_qk_v->GetMutable<phi::DenseTensor>();
+    float* weight_qk_data = nullptr;
+    weight_qk_data = const_cast<float*>(static_cast<const float*>(
+        engine_->GetFp32TrtWeight(weight_qk_name, *weight_qk_t).get().values));
+
+    const auto& weight_qk_dims =
+        weight_qk_t->dims();  // hidden_in_qk 2 hidden_out_qk
+    int hidden_in_qk = weight_qk_dims[0];
+    int num_qk = weight_qk_dims[1];
+    int hidden_out_qk = weight_qk_dims[2];
+    int head_number_qk = PADDLE_GET_CONST(int, op_desc.GetAttr("head_number"));
+    int head_size_qk = hidden_out_qk / head_number_qk;
+    int n_qk = num_qk * hidden_out_qk;
+
+    // [hidden_in, 2, head_number, head_size]
+    // -> [head_number, 2, head_size, hidden_in]
+    auto transpose_weight_qk = [](const float* src,
+                                  float* dst,
+                                  int two,
+                                  int head_number,
+                                  int head_size,
+                                  int hidden_in) {
+      for (int hn = 0; hn < head_number; hn++) {
+        for (int t = 0; t < two; t++) {
+          for (int hs = 0; hs < head_size; hs++) {
+            for (int hi = 0; hi < hidden_in; hi++) {
+              int out_index = hn * two * head_size * hidden_in +
+                              t * head_size * hidden_in + hs * hidden_in + hi;
+              int in_index = hi * two * head_number * head_size +
+                             t * head_number * head_size + hn * head_size + hs;
+              dst[out_index] = src[in_index];
+            }
+          }
+        }
+      }
+    };
+
+    std::vector<float> weight_qk_data_tmp;
+    weight_qk_data_tmp.reserve(weight_qk_t->numel());
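+    // For reference (illustrative only): transpose_weight_qk above is the
+    // index form of the NumPy expression
+    //   w.reshape(hidden_in, 2, head_number, head_size).transpose(2, 1, 3, 0),
+    // i.e. [hidden_in, 2, head_number, head_size] ->
+    //      [head_number, 2, head_size, hidden_in].
+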
memcpy(weight_qk_data_tmp.data(), + weight_qk_data, + weight_qk_t->numel() * sizeof(float)); + transpose_weight_qk(weight_qk_data_tmp.data(), + weight_qk_data, + num_qk, + head_number_qk, + head_size_qk, + hidden_in_qk); + + /* ------------------ bias_qk -------------------------*/ + auto bias_qk_name = op_desc.Input("B_qk").front(); + auto* bias_qk_v = scope.FindVar(bias_qk_name); + auto* bias_qk_t = bias_qk_v->GetMutable(); + float* bias_qk_data = nullptr; + bias_qk_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_qk_name, *bias_qk_t).get().values)); + + // [2, head_number, head_size] -> [head_number, 2, head_size] + auto transpose_bias_qk = [](const float* src, float* dst, int N, int H) { + for (int i = 0; i < 2; ++i) { + for (int n = 0; n < N; ++n) { + for (int h = 0; h < H; ++h) { + dst[n * 2 * H + i * H + h] = src[i * N * H + n * H + h]; + } + } + } + }; + + std::vector bias_qk_data_tmp; + bias_qk_data_tmp.reserve(bias_qk_t->numel()); + memcpy(bias_qk_data_tmp.data(), + bias_qk_data, + bias_qk_t->numel() * sizeof(float)); + transpose_bias_qk( + bias_qk_data_tmp.data(), bias_qk_data, head_number_qk, head_size_qk); + + auto weight_qk_shape = nvinfer1::Dims3{1, n_qk, hidden_in_qk}; + auto* weight_qk_tensor = + AddConstantLayer(weight_qk_data, weight_qk_shape, " "); + auto bias_qk_shape = nvinfer1::Dims3{1, 1, n_qk}; + auto* bias_qk_tensor = AddConstantLayer(bias_qk_data, bias_qk_shape, " "); + nvinfer1::ITensor* input_qk_shape_tensor = Shape(input_qk); + + nvinfer1::ILayer* fc_qk_layer = nullptr; + nvinfer1::ILayer* merge_qk_element_layer = nullptr; + nvinfer1::MatrixOperation matrix_operation_X = + nvinfer1::MatrixOperation::kNONE; + nvinfer1::MatrixOperation matrix_operation_Y = + nvinfer1::MatrixOperation::kTRANSPOSE; + fc_qk_layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input_qk, + matrix_operation_X, + *weight_qk_tensor, + matrix_operation_Y); + fc_qk_layer->setName( + ("qk_attention_matrix_multiply(Output: " + output_name + ")").c_str()); + + // add qk ElementWiseLayer layer + nvinfer1::ElementWiseOperation elementwise_operation = + nvinfer1::ElementWiseOperation::kSUM; + merge_qk_element_layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *fc_qk_layer->getOutput(0), + *bias_qk_tensor, + elementwise_operation); + merge_qk_element_layer->setName( + ("multihead_mamul_fc_qk(Output: " + output_name + ")").c_str()); + + auto* reshape_after_fc_qk_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *merge_qk_element_layer->getOutput(0)); + std::vector mha_input_qk_tensor_shape; + for (int i = 0; i < 5; i++) { + mha_input_qk_tensor_shape.push_back(Add1DConstantLayer(1)); + } + mha_input_qk_tensor_shape[0] = + GetEleTensorOfShape(input_qk_shape_tensor, 0); + mha_input_qk_tensor_shape[1] = + GetEleTensorOfShape(input_qk_shape_tensor, 1); + mha_input_qk_tensor_shape[2] = Add1DConstantLayer(head_number_qk); + mha_input_qk_tensor_shape[3] = Add1DConstantLayer(2); + mha_input_qk_tensor_shape[4] = Add1DConstantLayer(head_size_qk); + reshape_after_fc_qk_layer->setInput(1, *Concat(mha_input_qk_tensor_shape)); + reshape_after_fc_qk_layer->setName( + ("shuffle_after_fc_qk_multihead_matmul(Output: " + output_name + ")") + .c_str()); + + /* ------------------ weight_v -------------------------*/ + auto weight_v_name = op_desc.Input("W_v").front(); + auto* weight_v_v = scope.FindVar(weight_v_name); + auto* weight_v_t = weight_v_v->GetMutable(); + float* weight_v_data = nullptr; + weight_v_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(weight_v_name, 
*weight_v_t).get().values)); + int n_v = hidden_out_qk; + + // [hidden_in, head_number, head_size] + // -> [head_number, head_size, hidden_in] + auto transpose_weight_v = [](const float* src, + float* dst, + int head_number, + int head_size, + int hidden_in) { + for (int hn = 0; hn < head_number; hn++) { + for (int hs = 0; hs < head_size; hs++) { + for (int hi = 0; hi < hidden_in; hi++) { + int out_index = hn * head_size * hidden_in + hs * hidden_in + hi; + int in_index = hi * head_number * head_size + hn * head_size + hs; + dst[out_index] = src[in_index]; + } + } + } + }; + std::vector weight_v_data_tmp; + weight_v_data_tmp.reserve(weight_v_t->numel()); + memcpy(weight_v_data_tmp.data(), + weight_v_data, + weight_v_t->numel() * sizeof(float)); + transpose_weight_v(weight_v_data_tmp.data(), + weight_v_data, + head_number_qk, + head_size_qk, + hidden_in_qk); + + /* ------------------ bias_v -------------------------*/ + auto bias_v_name = op_desc.Input("B_v").front(); + auto* bias_v_v = scope.FindVar(bias_v_name); + auto* bias_v_t = bias_v_v->GetMutable(); + float* bias_v_data = nullptr; + bias_v_data = const_cast(static_cast( + engine_->GetFp32TrtWeight(bias_v_name, *bias_v_t).get().values)); + + auto weight_v_shape = nvinfer1::Dims3{1, n_v, hidden_in_qk}; + auto* weight_v_tensor = + AddConstantLayer(weight_v_data, weight_v_shape, " "); + auto bias_v_shape = nvinfer1::Dims3{1, 1, n_v}; + auto* bias_v_tensor = AddConstantLayer(bias_v_data, bias_v_shape, " "); + nvinfer1::ITensor* input_v_shape_tensor = Shape(input_v); + + nvinfer1::ILayer* fc_v_layer = nullptr; + nvinfer1::ILayer* merge_v_element_layer = nullptr; + fc_v_layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input_v, + matrix_operation_X, + *weight_v_tensor, + matrix_operation_Y); + fc_v_layer->setName( + ("v_attention_matrix_multiply(Output: " + output_name + ")").c_str()); + + // add v ElementWiseLayer layer + merge_v_element_layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *fc_v_layer->getOutput(0), + *bias_v_tensor, + elementwise_operation); + merge_v_element_layer->setName( + ("multihead_mamul_fc_v(Output: " + output_name + ")").c_str()); + + // add shuffle for fc layer + auto* reshape_after_fc_v_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *merge_v_element_layer->getOutput(0)); + std::vector mha_input_v_tensor_shape; + for (int i = 0; i < 5; i++) { + mha_input_v_tensor_shape.push_back(Add1DConstantLayer(1)); + } + mha_input_v_tensor_shape[0] = GetEleTensorOfShape(input_v_shape_tensor, 0); + mha_input_v_tensor_shape[1] = GetEleTensorOfShape(input_v_shape_tensor, 1); + mha_input_v_tensor_shape[2] = Add1DConstantLayer(head_number_qk); + mha_input_v_tensor_shape[3] = Add1DConstantLayer(1); + mha_input_v_tensor_shape[4] = Add1DConstantLayer(head_size_qk); + reshape_after_fc_v_layer->setInput(1, *Concat(mha_input_v_tensor_shape)); + reshape_after_fc_v_layer->setName( + ("shuffle_after_fc_v_multihead_matmul(Output: " + output_name + ")") + .c_str()); + + std::vector mha_input_tensor_vector{ + reshape_after_fc_qk_layer->getOutput(0), + reshape_after_fc_v_layer->getOutput(0)}; + nvinfer1::ITensor* mha_input_tensor = Concat(mha_input_tensor_vector, 3); + auto creator = GetPluginRegistry()->getPluginCreator("fMHA_V2", "1"); + assert(creator != nullptr); + std::vector fields{}; + nvinfer1::PluginFieldCollection* plugin_collection = + static_cast( + malloc(sizeof(*plugin_collection) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + + plugin_collection->nbFields = static_cast(fields.size()); 
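+    // Note (illustrative): fields stays empty here; the fMHA_V2 plugin is
+    // driven by the packed [batch, seq, head_number, 2 + 1, head_size] input
+    // assembled by the concat above rather than by creator fields.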
+ plugin_collection->fields = fields.data(); + auto plugin = creator->createPlugin("fMHA_V2", plugin_collection); + free(plugin_collection); + std::vector plugin_inputs; + plugin_inputs.emplace_back(mha_input_tensor); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + + // add shuffle + nvinfer1::ITensor* batch_tensor = + GetEleTensorOfShape(input_qk_shape_tensor, 0); + nvinfer1::ITensor* length_tensor = + GetEleTensorOfShape(input_qk_shape_tensor, 1); + auto* reshape_after_mha_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0)); + std::vector reshape_tensor; + reshape_tensor.push_back(batch_tensor); + reshape_tensor.push_back(length_tensor); + reshape_tensor.push_back(Add1DConstantLayer(-1)); + reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor)); + reshape_after_mha_layer->setName( + ("shuffle_last_multihead_matmul(Output: " + output_name + ")").c_str()); + nvinfer1::ILayer* layer = nullptr; + layer = reshape_after_mha_layer; + RreplenishLayerAndOutput( + layer, "qk_multihead_matmul", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(qk_multihead_matmul, QkMultiheadMatMulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 685fc44d7b36a9..24dca82d3fba17 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -70,6 +70,8 @@ struct SimpleOpTypeSetTeller : public Teller { int8_teller_set.insert("flash_multihead_matmul"); teller_set.insert("cross_multihead_matmul"); int8_teller_set.insert("cross_multihead_matmul"); + teller_set.insert("qk_multihead_matmul"); + int8_teller_set.insert("qk_multihead_matmul"); #endif #if IS_TRT_VERSION_GE(8200) teller_set.insert("round"); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_qk_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_qk_multihead_matmul.py new file mode 100644 index 00000000000000..548f0486e12f68 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_qk_multihead_matmul.py @@ -0,0 +1,385 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import SkipReasons, TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertQkAttentionTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8520: + return False + return True + + def sample_program_configs(self): + def generate_input1(batch, length): + return np.random.rand(batch, length, 256).astype(np.float32) / 10 + + def generate_input2(batch, length): + return np.random.rand(batch, length, 256).astype(np.float32) / 10 + + def generate_weight_q(): + return np.random.rand(256, 256).astype(np.float32) / 10 + + def generate_weight_k(): + return np.random.rand(256, 256).astype(np.float32) / 10 + + def generate_weight_v(): + return np.random.rand(256, 256).astype(np.float32) / 10 + + def generate_bias_q(): + return np.random.rand(256).astype(np.float32) / 10 + + def generate_bias_k(): + return np.random.rand(256).astype(np.float32) / 10 + + def generate_bias_v(): + return np.random.rand(256).astype(np.float32) / 10 + + for batch in [1, 2]: + self.batch = batch + for length in [300, 400]: + ops_config = [ + # q + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input_data1"], + "Y": ["matmul_q_weight"], + }, + "op_outputs": {"Out": ["matmul_q_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul_q_output"], + "Y": ["bias_q"], + }, + "op_outputs": {"Out": ["elementwise_q_output"]}, + "op_attrs": { + "Scale_out": 1.0, + "Scale_x": 1.0, + "Scale_y": 1.0, + "axis": 2, + }, + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_q_output"], + }, + "op_outputs": { + "Out": ["reshape_q_output"], + "XShape": ["reshape_q_output_xshape"], + }, + "op_attrs": {"shape": [0, 0, 8, 32]}, + }, + { + "op_type": "transpose2", + "op_inputs": {"X": ["reshape_q_output"]}, + "op_outputs": { + "Out": ["transpose_q_output"], + "XShape": ["transpose_q_output_xshape"], + }, + "op_attrs": { + "axis": [0, 2, 1, 3], + "data_format": "AnyLayout", + }, + }, + # k + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input_data1"], + "Y": ["matmul_k_weight"], + }, + "op_outputs": {"Out": ["matmul_k_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul_k_output"], + "Y": ["bias_k"], + }, + "op_outputs": {"Out": ["elementwise_k_output"]}, + "op_attrs": { + "Scale_out": 1.0, + "Scale_x": 1.0, + "Scale_y": 1.0, + "axis": 2, + }, + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_k_output"], + }, + "op_outputs": { + "Out": ["reshape_k_output"], + "XShape": ["reshape_k_output_xshape"], + }, + "op_attrs": {"shape": [0, 0, 8, 32]}, + }, + { + "op_type": "transpose2", + "op_inputs": {"X": ["reshape_k_output"]}, + "op_outputs": { + "Out": ["transpose_k_output"], + "XShape": ["transpose_k_output_xshape"], + }, + "op_attrs": { + "axis": [0, 2, 1, 3], + "data_format": "AnyLayout", + }, + }, + # V + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input_data2"], + "Y": ["matmul_v_weight"], + }, + "op_outputs": {"Out": ["matmul_v_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul_v_output"], + 
"Y": ["bias_v"], + }, + "op_outputs": {"Out": ["elementwise_v_output"]}, + "op_attrs": { + "Scale_out": 1.0, + "Scale_x": 1.0, + "Scale_y": 1.0, + "axis": 2, + }, + }, + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_v_output"], + }, + "op_outputs": { + "Out": ["reshape_v_output"], + "XShape": ["reshape_v_output_xshape"], + }, + "op_attrs": {"shape": [0, 0, 8, 32]}, + }, + { + "op_type": "transpose2", + "op_inputs": {"X": ["reshape_v_output"]}, + "op_outputs": { + "Out": ["transpose_v_output"], + "XShape": ["transpose_v_output_xshape"], + }, + "op_attrs": { + "axis": [0, 2, 1, 3], + "data_format": "AnyLayout", + }, + }, + # matmul1+matmul2 + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["transpose_q_output"], + "Y": ["transpose_k_output"], + }, + "op_outputs": {"Out": ["matmul1_output"]}, + "op_attrs": {"trans_x": False, "trans_y": True}, + }, + { + "op_type": "scale", + "op_inputs": { + "X": ["matmul1_output"], + }, + "op_outputs": {"Out": ["scale_output"]}, + "op_attrs": { + "scale": 0.17677, + "bias": 0.0, + "bias_after_scale": True, + }, + }, + { + "op_type": "softmax", + "op_inputs": {"X": ["scale_output"]}, + "op_outputs": {"Out": ["softmax_output"]}, + "op_attrs": { + "axis": -1, + "data_format": "AnyLayout", + }, + }, + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["softmax_output"], + "Y": ["transpose_v_output"], + }, + "op_outputs": {"Out": ["matmul2_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, + }, + { + "op_type": "transpose2", + "op_inputs": {"X": ["matmul2_output"]}, + "op_outputs": { + "Out": ["transpose_output"], + "XShape": ["transpose_output_xshape"], + }, + "op_attrs": { + "axis": [0, 2, 1, 3], + "data_format": "AnyLayout", + }, + }, + { + "op_type": "reshape2", + "op_inputs": {"X": ["transpose_output"]}, + "op_outputs": { + "Out": ["reshape_output"], + "XShape": ["reshape_output_xshape"], + }, + "op_attrs": {"shape": [0, 0, 256]}, + }, + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={ + "matmul_q_weight": TensorConfig( + data_gen=partial(generate_weight_q) + ), + "matmul_k_weight": TensorConfig( + data_gen=partial(generate_weight_k) + ), + "matmul_v_weight": TensorConfig( + data_gen=partial(generate_weight_v) + ), + "bias_q": TensorConfig( + data_gen=partial(generate_bias_q) + ), + "bias_k": TensorConfig( + data_gen=partial(generate_bias_k) + ), + "bias_v": TensorConfig( + data_gen=partial(generate_bias_v) + ), + }, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input1, batch, length) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input2, batch, length) + ), + }, + outputs=["reshape_output"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + # The last dim of input1 and input2 should be static. 
+ self.dynamic_shape.min_input_shape = { + "input_data1": [1, 300, 256], + "input_data2": [1, 300, 256], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [4, 1200, 256], + "input_data2": [4, 1200, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [1, 300, 256], + "input_data2": [1, 300, 256], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + self.trt_param.workspace_size = 2013265920 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + self.trt_param.workspace_size = 2013265920 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-4) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), (1e-2, 1e-3) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dynamic_shape.min_input_shape == {}: + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The qk attention trt oss plugin do not support static shape yet", + ) + + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Float32: + return True + return False + + self.add_skip_case( + teller2, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The qk attention trt oss plugin do not support fp32 yet", + ) + + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The qk attention trt oss plugin do not support int8 yet.", + ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 4970dd6558fba3860462434588b6c1a2800793e8 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Mon, 10 Apr 2023 16:45:43 +0800 Subject: [PATCH 023/156] [AMP] support master_grad for amp training (#52235) * support set master_grad * move register_hook to auto_cast * update unittest * fix fp16 test * update for review comments --- .../fluid/imperative/gradient_accumulator.cc | 10 +- paddle/fluid/pybind/eager_functions.cc | 37 +++++++ python/paddle/amp/auto_cast.py | 46 ++++++++- test/amp/test_amp_master_grad.py | 96 +++++++++++++++++++ 4 files changed, 181 insertions(+), 8 deletions(-) create mode 100644 test/amp/test_amp_master_grad.py diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index c2ccd48d4ca1e5..14b9bc5aae0bce 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -187,18 +187,14 @@ void TensorAdd(const VarType& src, VarType* dst) { auto data_type = framework::TransToProtoVarType(src_tensor.dtype()); auto place = src_tensor.place(); - PADDLE_ENFORCE_EQ(framework::TransToProtoVarType(dst_tensor->dtype()), - data_type, - platform::errors::PreconditionNotMet( - "The data type of source tensor and destination tensor " - "should be equal, Otherwise, the calculation 
results " - "will be incorrect.")); - // if src and dst are in different place, copy dst to src's place if (dst_tensor->place() != place) { paddle::framework::TensorCopySync(*dst_tensor, place, dst_tensor); } + // AddKernel already support inputs of different dtype. For AMP master_grad, + // the dtype of source tensor and destination tensor will be diferent. So the + // check requiring input dtypes to be the same have been removed. #define PADDLE_TENSOR_ADD(T, CONTEXT) \ if (data_type == framework::DataTypeTrait::DataType()) { \ auto cpu_ctx = static_cast( \ diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 8df301520ec50c..848fa1fe742e8b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -47,6 +47,7 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -1246,6 +1247,37 @@ static PyObject* eager_api__add_backward_final_hook(PyObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_api_set_master_grads(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // tensor_list is a list of model parameters. + auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); + for (auto& tensor : tensor_list) { + VLOG(6) << "set master_grad for tensor: " << tensor.name(); + PADDLE_ENFORCE_EQ( + egr::egr_utils_api::IsLeafTensor(tensor), + true, + paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); + paddle::Tensor* grad = egr::EagerUtils::mutable_grad(tensor); + PADDLE_ENFORCE_NE(grad, + nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + auto dtype = (*grad).dtype(); + if ((*grad).initialized() && + (dtype == phi::DataType::FLOAT16 || dtype == phi::DataType::BFLOAT16)) { + auto master_grad = + paddle::experimental::cast(*grad, phi::DataType::FLOAT32); + grad->set_impl(master_grad.impl()); + } + } + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_functions[] = { // TODO(jiabin): Remove scale when we have final state tests {"scale", @@ -1314,6 +1346,11 @@ PyMethodDef variable_functions[] = { (PyCFunction)(void (*)(void))eager_api_reset_saved_tensors_hooks, METH_VARARGS | METH_KEYWORDS, NULL}, + /**amp functions**/ + {"set_master_grads", + (PyCFunction)(void (*)(void))eager_api_set_master_grads, + METH_VARARGS | METH_KEYWORDS, + NULL}, /**sparse functions**/ #if defined(PADDLE_WITH_CUDA) {"async_read", diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 7e20980be95676..33c7855d897243 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -101,6 +101,23 @@ def amp_state(): return _g_amp_state_ +class AMPGlobalState: + def __init__(self): + self.model_parameters = [] + self.use_master_grad = False + self.already_register_final_backward_hook = False + + def __setattr__(self, name, val): + self.__dict__[name] = val + + +_amp_global_state = AMPGlobalState() + + +def amp_global_state(): + return _amp_global_state + + # NOTE(zhiqiu): similar as paddle.static.amp.fp16_lists.AutoMixedPrecisionLists._update_list # The reason why not use AutoMixedPrecisionLists is that 
custom_black_varnames is not suitable for imperative mode. def _update_list( @@ -418,6 +435,21 @@ def amp_guard( amp_level = AMP_LEVEL.O0 amp_dtype = "float32" + # master_grad_hook will run at the end of backward. + # Since backward_final_hook will be cleared once they have been + # done, we should register the hook every step. + if ( + amp_global_state().use_master_grad + and not amp_global_state().already_register_final_backward_hook + ): + + def master_grad_hook(): + core.eager.set_master_grads(amp_global_state().model_parameters) + amp_global_state().already_register_final_backward_hook = False + + core.eager._add_backward_final_hook(master_grad_hook) + amp_global_state().already_register_final_backward_hook = True + if tracer: # enable auto_cast original_amp_level = tracer._amp_level @@ -486,6 +518,7 @@ def amp_decorate( dtype='float16', master_weight=None, save_dtype=None, + master_grad=False, ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. @@ -599,6 +632,14 @@ def amp_decorate( for opt in optimizers: _set_multi_precision(opt, use_multi_precision) + # support master_grad + if master_grad: + amp_global_state().use_master_grad = True + for idx in range(len(models)): + amp_global_state().model_parameters.extend( + models[idx].parameters() + ) + if save_dtype is not None: if not (save_dtype in ['float16', 'bfloat16', 'float32', 'float64']): raise ValueError( @@ -696,6 +737,7 @@ def decorate( dtype='float16', master_weight=None, save_dtype=None, + master_grad=False, ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. @@ -712,6 +754,8 @@ def decorate( master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. The save_dtype will not change model parameters dtype, it just change the state_dict dtype. When save_dtype is None, the save dtype is same as model dtype. Default is None. + master_grad(bool, optional): For level='O2', whether to use FP32 weight gradients for calculations such as gradient clipping, weight decay, and weight updates. If it is enabled, the weight + gradients will be FP32 dtype after the backpropagation. Default is False. Examples: @@ -761,5 +805,5 @@ def decorate( print(output.dtype) # FP16 """ return amp_decorate( - models, optimizers, level, dtype, master_weight, save_dtype + models, optimizers, level, dtype, master_weight, save_dtype, master_grad ) diff --git a/test/amp/test_amp_master_grad.py b/test/amp/test_amp_master_grad.py new file mode 100644 index 00000000000000..6b5aebf35771e6 --- /dev/null +++ b/test/amp/test_amp_master_grad.py @@ -0,0 +1,96 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.fluid import core + + +class SimpleNet(paddle.nn.Layer): + def __init__(self, input_size, output_size): + super().__init__() + self.linear = paddle.nn.Linear(input_size, output_size) + + def forward(self, x): + x = self.linear(x) + return x + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or does not support float16", +) +class TestMasterGrad(unittest.TestCase): + def check_results( + self, fp32_grads, op_list, total_steps, accumulate_batchs_num + ): + for grad in fp32_grads: + self.assertEqual(grad.dtype, paddle.float32) + # fp16 calls + self.assertEqual(int(op_list['matmul_v2'].split(',')[0]), total_steps) + self.assertEqual( + int(op_list['adamw_'].split(',')[0]), + 2 * (total_steps / accumulate_batchs_num), + ) + self.assertEqual( + int(op_list['transfer_dtype'].split(',')[0]), + total_steps + total_steps * 2, + ) + + def run_dygraph(self, total_steps, accumulate_batchs_num): + model = SimpleNet(2, 4) + opt = paddle.optimizer.AdamW(parameters=model.parameters()) + model, opt = paddle.amp.decorate( + model, optimizers=opt, level='O2', master_grad=True + ) + scaler = paddle.amp.GradScaler() + + paddle.amp.debugging.enable_operator_stats_collection() + for i in range(total_steps): + x = np.random.random((2, 2)).astype('float32') + label = np.random.random((2, 4)).astype('float32') + + with paddle.amp.auto_cast(level='O2'): + out = model(paddle.to_tensor(x)) + loss = paddle.nn.functional.l1_loss( + out, paddle.to_tensor(label) + ) + scaled = scaler.scale(loss) + scaled.backward() + fp32_grads = [model.linear.weight.grad, model.linear.bias.grad] + if (i + 1) % accumulate_batchs_num == 0: + scaler.step(opt) + scaler.update() + opt.clear_grad() + paddle.amp.debugging.disable_operator_stats_collection() + op_list = paddle.fluid.core.get_low_precision_op_list() + return fp32_grads, op_list + + def test_master_grad(self): + total_steps = 4 + accumulate_batchs_num = 2 + fp32_grads, op_list = self.run_dygraph( + total_steps, accumulate_batchs_num + ) + self.check_results( + fp32_grads, op_list, total_steps, accumulate_batchs_num + ) + + +if __name__ == '__main__': + unittest.main() From d7a1a178e89dcadd5f3ef2caf59717c0e29de2ea Mon Sep 17 00:00:00 2001 From: LyndonKong Date: Mon, 10 Apr 2023 16:47:42 +0800 Subject: [PATCH 024/156] =?UTF-8?q?=E3=80=90Hackathon=20No.16=E3=80=91add?= =?UTF-8?q?=20PoissonNLLLoss=20API=20(#51117)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add PoissonNLLLoss API * update unittests * Fix poisson_nll_loss init and update data type support * remove type comment * Update doc string * Fix doc string error * Fix doc string math equation format * Add float16 and bfloat16 support --- .../tests/unittests/test_poisson_nll_loss.py | 248 ++++++++++++++++++ python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/loss.py | 122 ++++++++- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/loss.py | 93 +++++++ python/paddle/tensor/math.py | 2 +- python/paddle/tensor/ops.py | 1 + 8 files changed, 468 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_poisson_nll_loss.py diff --git a/python/paddle/fluid/tests/unittests/test_poisson_nll_loss.py 
b/python/paddle/fluid/tests/unittests/test_poisson_nll_loss.py new file mode 100644 index 00000000000000..096018a6e2bf0b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_poisson_nll_loss.py @@ -0,0 +1,248 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.fluid import core + +np.random.seed(100) + + +def ref_poisson_nll_loss( + input, + label, + log_input=True, + full=False, + epsilon=1e-8, + reduction="mean", +): + if epsilon <= 0: + raise ValueError( + "The value of `epsilon` in PoissonNLLLoss should be positive, but received %f, which is not allowed" + % epsilon + ) + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in poisson_nll_loss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." % reduction + ) + loss_out = 0 + if log_input: + loss_out = np.exp(input) - label * input + else: + loss_out = input - label * np.log(input + epsilon) + if full: + stirling_approx = ( + label * np.log(label) - label + 0.5 * np.log(2 * np.pi * label) + ) + loss_out += np.where(stirling_approx <= 1, 0, stirling_approx) + + if reduction == 'none': + return loss_out + elif reduction == 'sum': + return [np.sum(loss_out)] + elif reduction == 'mean': + return [np.mean(loss_out)] + + +class TestPoissonNLLLossBasicCase(unittest.TestCase): + def setUp(self, dtype="float32"): + self.shape = [10, 2] + self.dtype = dtype + self.input_np = np.random.random(self.shape).astype(self.dtype) + self.label_np = np.random.random(self.shape).astype(self.dtype) + self.place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + + def test_static_case( + self, + dtype="float32", + log_input=True, + full=False, + epsilon=1e-8, + reduction="mean", + ): + self.setUp(dtype) + paddle.enable_static() + prog = paddle.static.Program() + startup_prog = paddle.static.Program() + with paddle.static.program_guard(prog, startup_prog): + input = paddle.static.data('input', self.shape, dtype) + label = paddle.static.data('label', self.shape, dtype) + input.desc.set_need_check_feed(False) + label.desc.set_need_check_feed(False) + out1 = F.poisson_nll_loss( + input, + label, + log_input=log_input, + full=full, + epsilon=epsilon, + reduction=reduction, + ) + poisson_nll_loss = paddle.nn.PoissonNLLLoss( + log_input=log_input, + full=full, + epsilon=epsilon, + reduction=reduction, + ) + out2 = poisson_nll_loss(input, label) + exe = paddle.static.Executor(self.place) + exe.run(startup_prog) + res = exe.run( + prog, + feed={'input': self.input_np, 'label': self.label_np}, + fetch_list=[out1, out2], + ) + out_ref = ref_poisson_nll_loss( + self.input_np, + self.label_np, + log_input=log_input, + full=full, + epsilon=epsilon, + reduction=reduction, + ) + for r in res: + np.testing.assert_allclose(out_ref, r, rtol=1e-5) + + def test_dynamic_case( + self, + dtype="float32", 
log_input=True, + full=False, + epsilon=1e-8, + reduction="mean", + type=None, + ): + self.setUp(dtype) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + out_ref = ref_poisson_nll_loss( + self.input_np, + self.label_np, + log_input=log_input, + full=full, + epsilon=epsilon, + reduction=reduction, + ) + out1 = F.poisson_nll_loss( + input_x, + label, + log_input=log_input, + full=full, + epsilon=epsilon, + reduction=reduction, + ) + if type == 'test_err_reduction': + self.assertRaises( + ValueError, + paddle.nn.functional.poisson_nll_loss, + input=input_x, + label=label, + log_input=log_input, + full=full, + epsilon=epsilon, + reduction="unsupport reduction", + ) + elif type == 'test_err_epsilon': + self.assertRaises( + ValueError, + paddle.nn.functional.poisson_nll_loss, + input=input_x, + label=label, + log_input=log_input, + full=full, + epsilon=-1, + reduction="mean", + ) + poisson_nll_loss = paddle.nn.PoissonNLLLoss( + log_input=log_input, full=full, epsilon=epsilon, reduction=reduction + ) + out2 = poisson_nll_loss(input_x, label) + + for r in [out1, out2]: + np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-5) + paddle.enable_static() + + def test_api(self): + pass + + +class TestPoissonNLLLossErrCase(TestPoissonNLLLossBasicCase): + def test_err_reduction(self): + self.test_dynamic_case(type="test_err_reduction") + + def test_err_epsilon(self): + self.test_dynamic_case(type="test_err_epsilon") + + def test_api(self): + self.test_err_reduction() + self.test_err_epsilon() + + +class TestPoissonNLLLossFloat16Case(TestPoissonNLLLossBasicCase): + def test_api(self): + if core.is_compiled_with_cuda(): + self.test_static_case(dtype="float16") + self.test_dynamic_case(dtype="float16") + + +class TestPoissonNLLLossBfloat16Case(TestPoissonNLLLossBasicCase): + def test_api(self): + if core.is_compiled_with_cuda(): + self.test_static_case(dtype="uint16") + self.test_dynamic_case(dtype="uint16") + + +class TestPoissonNLLLossFloat32Case(TestPoissonNLLLossBasicCase): + def test_api(self): + self.test_static_case(dtype="float32") + self.test_dynamic_case(dtype="float32") + + +class TestPoissonNLLLossFloat64Case(TestPoissonNLLLossBasicCase): + def test_api(self): + self.test_static_case(dtype="float64") + self.test_dynamic_case(dtype="float64") + + +class TestPoissonNLLLossNoLoginputCase(TestPoissonNLLLossBasicCase): + def test_api(self): + self.test_static_case(log_input=False) + self.test_dynamic_case(log_input=False) + + +class TestPoissonNLLLossFulllossCase(TestPoissonNLLLossBasicCase): + def test_api(self): + self.test_static_case(full=True) + self.test_dynamic_case(full=True) + + +class TestPoissonNLLLossSumReductionCase(TestPoissonNLLLossBasicCase): + def test_api(self): + self.test_static_case(reduction="sum") + self.test_dynamic_case(reduction="sum") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1b6cc3ae71ea8d..fe7209ecf46dab 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -100,6 +100,7 @@ from .layer.loss import MSELoss # noqa: F401 from .layer.loss import L1Loss # noqa: F401 from .layer.loss import NLLLoss # noqa: F401 +from .layer.loss import PoissonNLLLoss # noqa: F401 from .layer.loss import BCELoss # noqa: F401 from .layer.loss import KLDivLoss # noqa: F401 from .layer.loss import MarginRankingLoss # noqa: F401 @@ -268,6 +269,7 @@ def weight_norm(*args): 'AdaptiveAvgPool3D', 'AdaptiveMaxPool3D', 'NLLLoss', + 
'PoissonNLLLoss', 'Conv1D', 'Sequential', 'Hardswish', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 31d74225e1a70c..2a9d1390527c86 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -83,6 +83,7 @@ from .loss import margin_ranking_loss # noqa: F401 from .loss import mse_loss # noqa: F401 from .loss import nll_loss # noqa: F401 +from .loss import poisson_nll_loss # noqa: F401 from .loss import npair_loss # noqa: F401 from .loss import sigmoid_focal_loss # noqa: F401 from .loss import smooth_l1_loss # noqa: F401 @@ -214,6 +215,7 @@ 'margin_ranking_loss', 'multi_label_soft_margin_loss', 'nll_loss', + 'poisson_nll_loss', 'npair_loss', 'sigmoid_focal_loss', 'smooth_l1_loss', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 85c96de4bfbc79..4b57c9d936123c 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math + # TODO: define loss functions of neural network import paddle from paddle import _C_ops, _legacy_C_ops, fluid, in_dynamic_mode @@ -1322,10 +1324,16 @@ def l1_loss(input, label, reduction='mean', name=None): return unreduced else: check_variable_and_dtype( - input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss' + input, + 'input', + ['float32', 'float64', 'int32', 'int64'], + 'l1_loss', ) check_variable_and_dtype( - label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss' + label, + 'label', + ['float32', 'float64', 'int32', 'int64'], + 'l1_loss', ) if reduction == 'sum': @@ -1462,6 +1470,116 @@ def nll_loss( return out + +def poisson_nll_loss( + input, + label, + log_input=True, + full=False, + epsilon=1e-8, + reduction="mean", + name=None, +): + r"""Poisson negative log likelihood loss. + See more detail in :ref:`PoissonNLLLoss ` . + + Parameters: + input (Tensor): + Input tensor, expectation of underlying Poisson distribution. + The shape of input tensor should be `(N, *)` or `(*)` where `(*)` denotes any number of extra dimensions. + Its data type should be float16, bfloat16, float32, float64. + label (Tensor): + Label tensor, randomly sampled from the Poisson distribution :math:`label \sim \text{Poisson}(input)`. + The shape of input tensor should be `(N, *)` or `(*)`, same shape as the input tensor. + Its data type should be float16, bfloat16, float32, float64. + log_input (bool, optional): + Whether to treat the input tensor as log input. + If ``True`` the loss is computed as, :math:`\exp(\text{input}) - \text{label} * \text{input}` . + If ``False`` then loss is :math:`\text{input} - \text{label} * \log(\text{input}+\text{epsilon})` . + Default: ``True``. + full (bool, optional): + Whether to compute full loss. + If ``True``, the Stirling approximation term is added. + If ``False``, the Stirling approximation is dropped. + Default: ``False``. + epsilon (float, optional): + A small value to avoid evaluation of :math:`\log(0)` when `log_input`\ =\ ``False``. ``epsilon > 0``. + Default: 1e-8. + reduction (str, optional): + Indicate how to reduce the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. + If `reduction` is ``'mean'``, the reduced mean loss is returned; + if `reduction` is ``'sum'``, the reduced sum loss is returned; + if `reduction` is ``'none'``, no reduction will be applied. + Default is ``'mean'``. 
+ name (str, optional): + Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + input = paddle.randn([5, 2], dtype=paddle.float32) + label = paddle.randn([5, 2], dtype=paddle.float32) + loss = F.poisson_nll_loss(input, label, log_input=True, reduction='none') + print(loss) + loss = F.poisson_nll_loss(input, label, reduction='mean') + print(loss) + + """ + # check parameter values + if epsilon <= 0: + raise ValueError( + "The value of `epsilon` in poisson_nll_loss should be positive, but received %f, which is not allowed" + % epsilon + ) + + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in poisson_nll_loss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." % reduction + ) + # check input dtype and dimension + check_variable_and_dtype( + input, + 'input', + ['float16', 'uint16', 'float32', 'float64'], + 'poisson_nll_loss', + ) + check_variable_and_dtype( + label, + 'label', + ['float16', 'uint16', 'float32', 'float64'], + 'poisson_nll_loss', + ) + + if not (input.shape == label.shape): + raise ValueError("input's shape must equal label's shape") + + label = paddle.cast(label, input.dtype) + loss_out = 0 + if log_input: + loss_out = paddle.exp(input) - label * input + else: + loss_out = input - label * paddle.log(input + epsilon) + if full: + stirling_approx = ( + label * paddle.log(label) + - label + + 0.5 * paddle.log(2 * math.pi * label) + ) + loss_out += paddle.where( + stirling_approx <= 1, + paddle.zeros_like(stirling_approx), + stirling_approx, + ) + if reduction == 'mean': + loss_out = paddle.mean(loss_out) + elif reduction == 'sum': + loss_out = paddle.sum(loss_out) + return loss_out + + def kl_div(input, label, reduction='mean', name=None): r""" Calculate the Kullback-Leibler divergence loss diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index b435a73e1d88a3..09b491b900d5ca 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -72,6 +72,7 @@ from .loss import MSELoss # noqa: F401 from .loss import L1Loss # noqa: F401 from .loss import NLLLoss # noqa: F401 +from .loss import PoissonNLLLoss # noqa: F401 from .loss import BCELoss # noqa: F401 from .loss import KLDivLoss # noqa: F401 from .loss import MarginRankingLoss # noqa: F401 diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 39c68120b6a83c..57ee00608cce10 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -882,6 +882,99 @@ def forward(self, input, label): ) + +class PoissonNLLLoss(Layer): + r"""Generate a callable object of 'PoissonNLLLoss' to calculate the + Poisson negative log likelihood loss between Input(input) and + Input(label). Note that Input(input) is the expectation of the underlying + Poisson distribution and Input(label) contains the random samples from the + Poisson distribution + + + Poisson negative log likelihood loss is calculated as follows: + + .. math:: + \text{loss}(\text{input}, \text{label}) = \text{input} - \text{label} * \log(\text{input}) + \log(\text{label!}) + + The last term can be approximated with the Stirling formula. This approximation term is used when :attr:`full` is ``True``. + The approximation is added when label values are more than 1 and omitted when the labels are less than or equal to 1. 
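+ + For reference, the Stirling approximation used for the last term (mirroring the computation in the code below) is: + + .. math:: + \log(\text{label!}) \approx \text{label} * \log(\text{label}) - \text{label} + 0.5 * \log(2 * \pi * \text{label})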
+ + Parameters: + log_input (bool, optional): + Whether to treat the input tensor as log input. + If ``True`` the loss is computed as, :math:`\exp(\text{input}) - \text{label} * \text{input}` . + If ``False`` then loss is :math:`\text{input} - \text{label} * \log(\text{input}+\text{epsilon})` . + Default: ``True``. + full (bool, optional): + Whether to compute full loss. + If ``True``, the Stirling approximation term is added. + If ``False``, the Stirling approximation is dropped. + Default: ``False``. + epsilon (float, optional): + A small value to avoid evaluation of :math:`\log(0)` when ``log_input`` = ``False``. ``epsilon > 0``. + Default: 1e-8. + reduction (str, optional): + Indicate how to reduce the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. + If `reduction` is ``'mean'``, the reduced mean loss is returned; + if `reduction` is ``'sum'``, the reduced sum loss is returned; + if `reduction` is ``'none'``, no reduction will be applied. + Default is ``'mean'``. + name (str, optional): + Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input (Tensor): The shape of input tensor should be `(N, *)` or `(*)` where `(*)` denotes any number of extra dimensions. + - label (Tensor): The shape of input tensor should be `(N, *)` or `(*)`, same shape as the input tensor. + - output (Tensor): scalar if :attr:`reduction` is ``'mean'`` (default) or ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same shape as the input + + Examples: + .. code-block:: python + + import paddle + + poisson_nll_loss = paddle.nn.PoissonNLLLoss() + input = paddle.randn([5, 2], dtype=paddle.float32) + label = paddle.randn([5, 2], dtype=paddle.float32) + loss = poisson_nll_loss(input, label) + + """ + + def __init__( + self, + log_input=True, + full=False, + epsilon=1e-8, + reduction="mean", + name=None, + ): + if epsilon <= 0: + raise ValueError( + "The value of `epsilon` in PoissonNLLLoss should be positive, but received %f, which is not allowed" + % epsilon + ) + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in PoissonNLLLoss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." 
% reduction + ) + super().__init__() + self._log_input = log_input + self._full = full + self._epsilon = epsilon + self._reduction = reduction + self._name = name + + def forward(self, input, label): + return F.poisson_nll_loss( + input, + label, + log_input=self._log_input, + full=self._full, + epsilon=self._epsilon, + reduction=self._reduction, + name=self._name, + ) + + class KLDivLoss(Layer): r""" diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7c01c310917983..ba7efb7956f77d 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -159,7 +159,7 @@ def log(x, name=None): return _C_ops.log(x) else: check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], "log" + x, 'x', ['uint16', 'float16', 'float32', 'float64'], "log" ) inputs = {'X': [x]} helper = LayerHelper('log', **locals()) diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 7fef070b42c233..c8cdb4e4b4e305 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -565,6 +565,7 @@ def exp(x, name=None): [ 'int32', 'int64', + 'uint16', 'float16', 'float32', 'float64', From 03afb41c697d8ba47ddb7ca3a7ee1acab747be47 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Mon, 10 Apr 2023 17:02:40 +0800 Subject: [PATCH 025/156] [Auto Parallel] Randomness Control for Distributed Training (#52554) * unique id for mesh * rng ctrl * support dropout * register op * adopt for recompute * update unittest * support pp --- .../distributed/auto_parallel/__init__.py | 1 + .../distributed/auto_parallel/engine.py | 9 +- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_dropout.py | 186 ++++++++++++ .../auto_parallel/operators/dist_embedding.py | 2 +- .../auto_parallel/parallelizer_v2.py | 4 + .../distributed/auto_parallel/process_mesh.py | 41 +++ .../distributed/auto_parallel/random.py | 138 +++++++++ .../passes/auto_parallel_recompute.py | 4 +- .../unittests/auto_parallel/CMakeLists.txt | 3 + .../unittests/auto_parallel/get_gpt_model.py | 6 +- .../auto_parallel/random_control_unittest.py | 273 ++++++++++++++++++ .../auto_parallel/test_random_ctrl.py | 55 ++++ 13 files changed, 716 insertions(+), 7 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_dropout.py create mode 100644 python/paddle/distributed/auto_parallel/random.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/random_control_unittest.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_random_ctrl.py diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 269a0ec644dbd2..835ca68df2d1c1 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -19,5 +19,6 @@ from .interface import shard_op from .interface import recompute from .interface import fetch +from .random import parallel_manual_seed __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 4a6181758d114d..a84bea42d538fc 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -45,7 +45,7 @@ from .dist_op import DistributedOperator from .dist_saver import DistributedSaver from .helper import ProgramHelper -from .interface import CollectionNames, get_collection +from .interface import CollectionNames, fetch, get_collection from .parallelizer_v2 
import Parallelizer from .planner_v2 import Planner from .process_group import get_all_process_groups, new_process_group @@ -410,6 +410,8 @@ def _prepare_fetch(self, user_fetches, mode): ), "user_fetches must be a list, but receive {}".format( type(user_fetches).__name__ ) + else: + user_fetches = [] fetch_names = [] fetch_indices = [] @@ -434,10 +436,13 @@ def _process_fetch_group(group_name, var_list): _process_fetch_group("metrics_" + str(i), var_list) if mode == "predict": _process_fetch_group("outputs", self._fetch_vars[mode]["outputs"]) + for usr_fetch in user_fetches: + var_name = _to_name_str(usr_fetch) + fetch(var_name) user_fetches_collection = [ item[1] for item in get_collection(CollectionNames.FETCHES) ] - var_list = (user_fetches_collection or []) + (user_fetches or []) + var_list = user_fetches_collection or [] _process_fetch_group("fetches", var_list) return fetch_names, fetch_indices diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 406ec4d8b36da0..bc5bf4b7379e72 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -36,3 +36,4 @@ from . import dist_shape from . import dist_assign from . import dist_scale +from . import dist_dropout diff --git a/python/paddle/distributed/auto_parallel/operators/dist_dropout.py b/python/paddle/distributed/auto_parallel/operators/dist_dropout.py new file mode 100644 index 00000000000000..e43870b26883ba --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_dropout.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import logging + +import paddle +from paddle.framework import core +from paddle.utils import unique_name + +from ...utils.log_utils import get_logger + +_logger = get_logger(logging.INFO) +from ..random import determinate_rng, is_enable_auto_rand_ctrl +from ..utils import ( + naive_set_dist_op_attr_for_program_by_mesh_and_mapping, + set_var_dist_attr, +) +from .common import ( + DistributedOperatorImplContainer, + register_distributed_operator_impl, + register_distributed_operator_impl_container, +) +from .dist_eltwise import DistributedDefaultImpl0, DistributedElementwiseImpl0 + + +class DistributedDropout(DistributedOperatorImplContainer): + def __init__(self, op_type): + super().__init__(op_type) + + +register_distributed_operator_impl_container(DistributedDropout("dropout")) + + +# Dist Dropout with Random Control +# Dropout re-uses the compatibility and cost functions of elementwise +class DistributedDropoutImpl0(DistributedElementwiseImpl0): + def __init__(self, name): + super().__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + @staticmethod + def forward(ctx, *args, **kwargs): + + dist_op_context = ctx.dist_op_context + main_block = dist_op_context.work_block + startup_block = dist_op_context.startup_block + src_op = dist_op_context.cur_src_op + rank_id = dist_op_context.rank_id + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + + if is_enable_auto_rand_ctrl() and not op_dist_attr.is_recompute: + + assert ( + op_dist_attr is not None + ), f"forward op [{str(src_op)}] doesn't have dist attribute!" + + # check validation of inputs / outputs + assert 'X' in kwargs, "input [{}] is not given".format('X') + assert ( + len(kwargs['X']) == 1 + ), "input X should be only one tensor but got {}".format( + kwargs['X'] + ) + assert 'Seed' in kwargs, "input [{}] is not given".format('Seed') + + if ( + src_op.has_attr("fix_seed") + and src_op.attr("fix_seed") + and src_op.has_attr("seed") + and src_op.attr("seed") + ): + _logger.info( + "Auto Parallel Random Control skipped since manual seed is set by user: {}".format( + src_op + ) + ) + elif rank_id not in op_dist_attr.process_mesh.process_ids: + pass + # NOTE Adopted for recompute + # If the user already set a seed, we should not modify it. But if the seed was added by the recompute pass, it should be under control. + # TODO in the future the recompute pass should happen after the parallel partition; remove this at that time. 
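+ # A pre-existing Seed input is therefore either user-provided or injected by the + # recompute pass; only the latter (variables prefixed with 'rc_seed') is rewired + # to a deterministic rng in the branch below.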
+ elif len(kwargs['Seed']) > 0 or len(src_op.input("Seed")) > 0: + seed_var_name = kwargs['Seed'][0] + if seed_var_name.startswith('rc_seed'): + pre_op = main_block.ops[-1] + assert ( + pre_op.type == "seed" + and len(pre_op.attr("rng_name")) == 0 + ), f"found exception op {str(pre_op)}" + + # determinate rng + X_var = main_block._var_recursive(kwargs['X'][0]) + X_dims_mapping = op_dist_attr.get_input_dims_mapping( + X_var.name + ) + process_mesh = op_dist_attr.process_mesh + rng_name = determinate_rng( + rank_id, X_dims_mapping, process_mesh + ) + # bring the recompute seed under control + pre_op._set_attr("rng_name", rng_name) + pre_op._set_attr("deterministic", True) + pre_op._set_attr("force_cpu", True) + else: + _logger.info( + "Auto Parallel Random Control skipped since manual seed is set by user: {}".format( + src_op + ) + ) + else: + # determinate rng + X_var = main_block._var_recursive(kwargs['X'][0]) + X_dims_mapping = op_dist_attr.get_input_dims_mapping(X_var.name) + process_mesh = op_dist_attr.process_mesh + + rng_name = determinate_rng( + rank_id, X_dims_mapping, process_mesh + ) + assert rng_name is not None and rng_name != "" + + # insert seed op + seed_var = main_block.create_var( + name=unique_name.generate_with_ignorable_key( + ".".join(["tensor_parallel_seed", 'tmp']) + ), + dtype=paddle.int32, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False, + ) + + # set new seed_var's dist_attr + seed_var_dims_mapping = [-1] + seed_var_dist_attr = set_var_dist_attr( + ctx, seed_var, seed_var_dims_mapping, process_mesh + ) + + # adopted for recompute + # force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang + seed_op = main_block.append_op( + type='seed', + outputs={'Out': seed_var}, + attrs={ + 'deterministic': True, + 'rng_name': rng_name, + 'force_cpu': True, + }, + ) + seed_op._set_attr('op_namescope', 'auto_tensor_parallel_seed') + # set new seed op's dist_attr + naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + seed_op, process_mesh, seed_var_dims_mapping, ctx + ) + + # modify dropout op + src_op.desc.set_input("Seed", [seed_var.name]) + src_op._remove_attr("fix_seed") + src_op._remove_attr("seed") + op_dist_attr.set_input_dist_attr( + seed_var.name, seed_var_dist_attr + ) + kwargs['Seed'] = [seed_var.name] + + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + # dropout backward is deterministic by mask, and does not need random state control + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl( + "dropout", DistributedDropoutImpl0("random_control") +) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 92fc5f31a81eb9..7176341feedfb9 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -362,7 +362,7 @@ def forward(ctx, *args, **kwargs): op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) assert ( op_dist_attr is not None - ), f"backward op [{str(src_op)}] don't have dist attribute !" + ), f"forward op [{str(src_op)}] doesn't have dist attribute!" 
# check validation of inputs / outputs assert 'Ids' in kwargs, "input [{}] is not given".format('Ids') diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 3f7c3999cebef1..a76a3f5dcb9abd 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -23,6 +23,7 @@ from ..utils.log_utils import get_logger from .partitioner import Partitioner from .process_group import get_world_process_group +from .random import init_auto_parallel_rng from .reshard import Resharder from .utils import set_grad_var_shape @@ -83,6 +84,9 @@ def parallel(self, rank): ) = partitioner.partition( serial_main_program, serial_startup_program, params_grads ) + + init_auto_parallel_rng() + self._logger.debug( "within parallel partitioner time: {}, mode {}".format( time.time() - time0, self._mode diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index 531de9b545e937..e2ccd16aaaad4b 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -22,6 +22,8 @@ # Use to store the previous and current process mesh _g_previous_process_mesh = None _g_current_process_mesh = None +# {shape_process_ids : unique_id} +_g_unique_process_mesh_map = {} def get_current_process_mesh(): @@ -42,6 +44,30 @@ def reset_current_process_mesh(): _g_current_process_mesh = _g_previous_process_mesh +def get_unique_id_for_process_mesh(shape, process_ids): + key = f"shape {shape}, process_ids {process_ids}" + global _g_unique_process_mesh_map + if key in _g_unique_process_mesh_map: + unique_id = _g_unique_process_mesh_map[key] + else: + unique_id = len(_g_unique_process_mesh_map) + 1 + _g_unique_process_mesh_map[key] = unique_id + + return unique_id + + +def retrive_unique_id_for_process_mesh(shape, process_ids): + key = f"shape {shape}, process_ids {process_ids}" + global _g_unique_process_mesh_map + assert key in _g_unique_process_mesh_map + return _g_unique_process_mesh_map[key] + + +def get_unique_process_mesh_map(): + global _g_unique_process_mesh_map + return _g_unique_process_mesh_map + + class ProcessMesh(core.ProcessMesh): """ The `ProcessMesh` object describes the Cartesian topology of the used processes. @@ -124,6 +150,11 @@ def __init__(self, mesh=None, dim_names=None, shape=None, process_ids=None): pg0 = get_process_group(0) pg0.add_ranks(self.process_ids) + # Unique mesh id + self._unique_id = get_unique_id_for_process_mesh( + self._shape, self._process_ids + ) + @property def mesh(self): """ @@ -131,6 +162,16 @@ def mesh(self): """ return self._mesh + @property + def unique_id(self): + """ + Get the unique id of ProcessMesh. + NOTE + The unique id only takes process_ids and shape into account. + Different ProcessMesh objects with the same process_ids and shape share the same unique id. + """ + return self._unique_id + def __getitem__(self, index): if isinstance(index, tuple): new_dim_names = [] diff --git a/python/paddle/distributed/auto_parallel/random.py b/python/paddle/distributed/auto_parallel/random.py new file mode 100644 index 00000000000000..5ca6d9e9ea0696 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/random.py @@ -0,0 +1,138 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging + +import paddle + +from ..utils.log_utils import get_logger +from .process_mesh import retrive_unique_id_for_process_mesh +from .utils import _get_idx_in_axis + +_logger = get_logger(logging.INFO) + +_rng_name_to_seed = {} +_inited_rng_name_to_seed = {} +_enable_random_control = False +_basic_seed = 42 + +# use prime numbers as offsets to avoid conflicts +_mesh_offset = 173 +_dim_offsets = [11, 23, 37, 73] + + +def is_enable_auto_rand_ctrl(): + global _enable_random_control + return _enable_random_control + + +def enable_auto_rand_ctrl(): + global _enable_random_control + _enable_random_control = True + + +def parallel_manual_seed(seed): + """Enable auto parallel random control. + Random control maintains the randomness when a tensor is distributed across devices on a Mesh (in any order). + * Independency: If a tensor is **Sharded** on a Mesh dimension, devices along that Mesh dimension should have Different randomness. + + * Consistency: Meanwhile if the tensor is **Replicated** on another Mesh dimension, randomness of devices along that Mesh dimension should be Consistent. + + For instance: rank0 ~ rank7 constitute a Mesh of shape [2, 4]; a 2D tensor is distributed in that Mesh using dims_mapping [-1, 1]. + Randomness for rank0-rank1-rank2-rank3 (rank4-rank5-rank6-rank7) should be Independent; + Randomness for rank0 and rank4 (rank1 and rank5, ...) should be Consistent. + + This function should be called only once before auto parallel compiles the computation graph (e.g. auto_parallel.engine.prepare() or fit()). + + This seed only affects how randomness-related **operators** (dropout, fused ops with dropout inside, etc.) are executed among the mesh, and does NOT affect other processes like parameter initialization. + + Examples: + # seed relative to training step + parallel_manual_seed((step + 13) * 257) + ... + engine.prepare() + """ + + enable_auto_rand_ctrl() + global _basic_seed + _basic_seed = seed + + +def determinate_rng(rank, dims_mapping, process_mesh): + + # TODO(JZ-LIANG) Support Mesh with any high rank + # TODO use a string-to-unique-integer hashing algorithm for seed computation + # instead of using offsets to coordinate seeds across devices. + if len(process_mesh.shape) > 4: + raise NotImplementedError( + "Auto Parallel Random Control for Mesh's rank > 4 is NOT supported! 
Got {}".format( + str(process_mesh) + ) + ) + global _basic_seed + seed_ = _basic_seed + + # FIXME + # unique_id = process_mesh.unique_id + unique_id = retrive_unique_id_for_process_mesh( + process_mesh.shape, process_mesh.process_ids + ) + sharding_expr = f'mesh:{unique_id}' + seed_ += _mesh_offset * (unique_id + 1) + + for i in range(len(process_mesh.shape)): + if i not in dims_mapping: + relative_idx = -1 + else: + relative_idx = _get_idx_in_axis( + process_mesh.process_ids, + process_mesh.shape, + i, + rank, + ) + + sharding_expr += f"_dim{i}:{relative_idx}" + seed_ += _dim_offsets[i] * (relative_idx + 1) + + global _rng_name_to_seed + if sharding_expr in _rng_name_to_seed: + assert _rng_name_to_seed[sharding_expr] == seed_ + else: + assert ( + seed_ not in _rng_name_to_seed.values() + ), "Seed Confilt! current seed: {}, current sharding expr: {}, generated seed: {}".format( + seed_, sharding_expr, _rng_name_to_seed + ) + _rng_name_to_seed[sharding_expr] = seed_ + + return sharding_expr + + +def init_auto_parallel_rng(): + + if not is_enable_auto_rand_ctrl(): + return + + global _rng_name_to_seed + # NOTE init rng maybe call multiple times, avoid init same rng twice + global _inited_rng_name_to_seed + + for rng_name, seed in _rng_name_to_seed.items(): + if rng_name in _inited_rng_name_to_seed: + assert _inited_rng_name_to_seed[rng_name] == seed + else: + _logger.info( + f"Init Auto Parallel RNG: {rng_name}, with seed {seed}" + ) + paddle.framework.random.set_random_seed_generator(rng_name, seed) + _inited_rng_name_to_seed[rng_name] = seed diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index b6a13540caf098..cb3dd480fbf47c 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -136,7 +136,9 @@ def modify_forward_desc_for_recompute(self, dist_context): cur_op_dist_attr = dist_context.get_op_dist_attr_for_program(cur_op) # insert seed op to guarantee that two dropout op have the same outputs - op_unique_name = unique_name.generate("seed") + # NOTE Hack for adopt recompute for random control, for more info see dist_dropout.py + # new seed added by recompute should have a prefix to distinguish with seed added by user or other moudule. 
+ op_unique_name = unique_name.generate("rc_seed") var_unique_name = unique_name.generate_with_ignorable_key( ".".join([op_unique_name, 'tmp']) ) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 7db4d58bd8b917..d1ba09ee8b47de 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -37,6 +37,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_pass_grad_clip MODULES test_pass_grad_clip) set_tests_properties(test_pass_grad_clip PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_random_ctrl MODULES test_random_ctrl) + set_tests_properties(test_random_ctrl PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" + TIMEOUT 50) py_test_modules(test_pass_gradient_merge MODULES test_pass_gradient_merge) set_tests_properties(test_pass_gradient_merge PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py index 35bf1a323d15c4..f23b3faf8dfe61 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py @@ -75,7 +75,7 @@ def create_data_holder(batch_size, vocab_size=1000, sequence_len=512): return [tokens, position_ids, attention_mask], [labels, loss_mask] -def generate_model(strategy): +def generate_model(strategy, dropout_prob=0.0): modeling.init_global() ranks = list(range(paddle.distributed.get_world_size())) modeling._global_process_mesh = auto.ProcessMesh( @@ -97,8 +97,8 @@ def generate_model(strategy): num_attention_heads=8, intermediate_size=256, hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, + hidden_dropout_prob=dropout_prob, + attention_probs_dropout_prob=dropout_prob, max_position_embeddings=1024, type_vocab_size=1, initializer_range=0.02, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/random_control_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/random_control_unittest.py new file mode 100644 index 00000000000000..52e6e216074fdb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/random_control_unittest.py @@ -0,0 +1,273 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import unittest + +import numpy as np +from get_gpt_model import FakeDataset, generate_model + +import paddle + +paddle.enable_static() +from paddle import _legacy_C_ops +from paddle.distributed.fleet import auto + + +def dy_broadcast_helper(tensor): + _legacy_C_ops.c_broadcast( + tensor, tensor, 'root', 1, 'use_calc_stream', True, 'ring_id', 0 + ) + _legacy_C_ops.c_sync_calc_stream(tensor, tensor) + + +def apply_pass(use_recompute=False, no_recompute_segments=[]): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_recompute: + recompute = strategy.recompute + recompute.enable = True + recompute.no_recompute_segments = no_recompute_segments + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestRandomControl(unittest.TestCase): + def setUp(self): + self.rtol = 1e-6 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + paddle.distributed.auto_parallel.parallel_manual_seed(100) + + def init(self, engine): + paddle.seed(2022) + np.random.seed(2022) + random.seed(2022) + place = paddle.fluid.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_recompute=False, no_recompute_segments=[]): + reset_prog() + + strategy = apply_pass(use_recompute, no_recompute_segments) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("mp", dropout_prob=0.1) + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def compare_mask_between_ranks( + self, rank, mask_np_list, compare_idx, equal + ): + + for np_mask in [mask_np_list[i] for i in compare_idx]: + mask_tensor_local = paddle.to_tensor(np_mask.astype("float32")) + if rank == 0: + mask_tensor_remote = paddle.ones_like(mask_tensor_local) + dy_broadcast_helper(mask_tensor_remote) + if equal: + assert np.array_equal( + mask_tensor_remote.numpy(), mask_tensor_local.numpy() + ) + else: + assert not np.array_equal( + mask_tensor_remote.numpy(), + mask_tensor_local.numpy(), + ) + else: + dy_broadcast_helper(mask_tensor_local) + + def test_random_ctrl_vanilla(self): + # mp2 training + rc_engine = self.get_engine(False) + train_dataloader = rc_engine.dataloader( + self.dataset, + batch_size=self.batch_size, + mode="train", + sample_split=3, + ) + + rc_engine.prepare(mode="train") + mask_name_list = [f'dropout_{i}.tmp_1' for i in range(7)] + mask_var_list = [ + rc_engine.main_program.global_block().var(varname) + for varname in mask_name_list + ] + + for data in train_dataloader: + outs = rc_engine.run(data, fetch_list=mask_var_list, mode="train") + mask_np_list = [outs['fetches'][varname] for varname in mask_name_list] + + paddle.disable_static() + rank = paddle.distributed.get_rank() + # check global mask is consistent across ranks + global_index = [0, 2, 3, 5, 6] + self.compare_mask_between_ranks( + rank, mask_np_list, global_index, equal=True + ) + local_index = [1, 4] + # check local mask differs across ranks + self.compare_mask_between_ranks( + rank, mask_np_list, local_index, equal=False + ) + paddle.enable_static() + + # check program + ops = rc_engine.main_program.global_block().ops + rng_names = [] + seed_var_names = [] + for op 
in ops: + if op.type == "seed": + rng_names.append(op.attr('rng_name')) + if op.type == "dropout": + seed_var_names.append(op.input("Seed")[0]) + rank = paddle.distributed.get_rank() + + self.assertEqual( + rng_names, + [ + 'mesh:1_dim0:-1', + f'mesh:1_dim0:{rank}', + 'mesh:1_dim0:-1', + 'mesh:1_dim0:-1', + f'mesh:1_dim0:{rank}', + 'mesh:1_dim0:-1', + 'mesh:1_dim0:-1', + ], + ) + self.assertEqual( + seed_var_names, + [ + 'tensor_parallel_seed.tmp_0', + 'tensor_parallel_seed.tmp_1', + 'tensor_parallel_seed.tmp_2', + 'tensor_parallel_seed.tmp_3', + 'tensor_parallel_seed.tmp_4', + 'tensor_parallel_seed.tmp_5', + 'tensor_parallel_seed.tmp_6', + ], + ) + + def test_random_ctrl_with_recompute(self): + # mp2 recompute training + rc_engine = self.get_engine(True) + train_dataloader = rc_engine.dataloader( + self.dataset, + batch_size=self.batch_size, + mode="train", + sample_split=3, + ) + + rc_engine.prepare(mode="train") + mask_name_list = [f'dropout_{i}.tmp_1' for i in range(7)] + recompute_mask_name_list = [ + 'dropout_0.tmp_1.subprog_1', + 'dropout_1.tmp_1.subprog_1', + 'dropout_2.tmp_1.subprog_1', + 'dropout_3.tmp_1.subprog_1', + 'dropout_4.tmp_1.subprog_0', + 'dropout_5.tmp_1.subprog_0', + 'dropout_6.tmp_1.subprog_0', + ] + mask_var_list = [ + rc_engine.main_program.global_block().var(varname) + for varname in mask_name_list + recompute_mask_name_list + ] + + for data in train_dataloader: + outs = rc_engine.run(data, fetch_list=mask_var_list, mode="train") + mask_np_list = [ + outs['fetches'][varname] + for varname in mask_name_list + recompute_mask_name_list + ] + + # check the recompute mask is the same within the local device + for i in range(7): + mask_fw = mask_np_list[i].astype("float32") + mask_rc = mask_np_list[i + 7].astype("float32") + assert np.array_equal( + mask_fw, + mask_rc, + ) + + paddle.disable_static() + # check global mask is consistent across ranks + rank = paddle.distributed.get_rank() + global_index = [0, 2, 3, 5, 6] + self.compare_mask_between_ranks( + rank, mask_np_list, global_index, equal=True + ) + local_index = [1, 4] + # check local mask differs across ranks + self.compare_mask_between_ranks( + rank, mask_np_list, local_index, equal=False + ) + paddle.enable_static() + + # check program + rank = paddle.distributed.get_rank() + ops = rc_engine.main_program.global_block().ops + rng_names = [] + seed_var_names = [] + for op in ops: + if op.type == "seed": + rng_names.append(op.attr('rng_name')) + if op.type == "dropout": + seed_var_names.append(op.input("Seed")[0]) + + self.assertEqual( + rng_names, + [ + 'mesh:1_dim0:-1', + f'mesh:1_dim0:{rank}', + 'mesh:1_dim0:-1', + 'mesh:1_dim0:-1', + f'mesh:1_dim0:{rank}', + 'mesh:1_dim0:-1', + 'mesh:1_dim0:-1', + ], + ) + self.assertEqual( + seed_var_names, + [ + 'rc_seed_0.tmp_0', + 'rc_seed_1.tmp_0', + 'rc_seed_2.tmp_0', + 'rc_seed_3.tmp_0', + 'rc_seed_4.tmp_0', + 'rc_seed_5.tmp_0', + 'rc_seed_6.tmp_0', + 'rc_seed_4.tmp_0', + 'rc_seed_5.tmp_0', + 'rc_seed_6.tmp_0', + 'rc_seed_0.tmp_0', + 'rc_seed_1.tmp_0', + 'rc_seed_2.tmp_0', + 'rc_seed_3.tmp_0', + ], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_random_ctrl.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_random_ctrl.py new file mode 100644 index 00000000000000..6162db5e93ee7f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_random_ctrl.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import sys +import tempfile +import unittest + + +class TestRandomCtrlPass(unittest.TestCase): + def test_mp2_with_recompute(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "random_control_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = ( + [sys.executable, "-u"] + + coverage_args + + [ + "-m", + "paddle.distributed.launch", + "--devices", + "0,1", + "--log_dir", + tmp_dir.name, + launch_model_path, + ] + ) + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() From d7a1a178e89dcadd5f3ef2caf59717c0e29de2ea Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Mon, 10 Apr 2023 17:19:36 +0800 Subject: [PATCH 026/156] delete paddle/fluid/operators/amp/*_npu.* (#52673) * delete paddle/fluid/operators/*_npu.* * try pass code-style --- .gitignore | 1 + .../amp/alloc_float_status_op_npu.cc | 46 --- .../amp/check_finite_and_unscale_op_npu.cc | 111 ------- .../check_finite_and_unscale_op_npu_test.cc | 131 -------- .../amp/clear_float_status_op_npu.cc | 53 ---- .../operators/amp/get_float_status_op_npu.cc | 53 ---- .../amp/update_loss_scaling_op_npu.cc | 293 ------------------ 7 files changed, 1 insertion(+), 687 deletions(-) delete mode 100644 paddle/fluid/operators/amp/alloc_float_status_op_npu.cc delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc delete mode 100644 paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc delete mode 100644 paddle/fluid/operators/amp/clear_float_status_op_npu.cc delete mode 100644 paddle/fluid/operators/amp/get_float_status_op_npu.cc delete mode 100644 paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc diff --git a/.gitignore b/.gitignore index c0bdf7e4bf5cbc..047d9684b4cd0f 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,7 @@ tools/nvcc_lazy paddle/fluid/pybind/eager_op_function.cc tools/nvcc_lazy + # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op*.cc paddle/fluid/operators/generated_sparse_op.cc diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc deleted file mode 100644 index 424c2326ab2010..00000000000000 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AllocFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* float_status = ctx.Output("FloatStatus"); - float_status->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - alloc_float_status, - ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc deleted file mode 100644 index 63e16fb3570588..00000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. -// On NPU, we do not really check the data of input tensors, -// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, -// and clear it after this op. -// Which may leads to wrong result if the input tensors is not calculated -// on NPU device, but got from other way, for example, feeding. 
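// For reference, a scalar sketch of the status check performed below
// (illustrative pseudo-code only, not a real device API; the ReduceSumD and
// GreaterEqual calls in the kernel body are authoritative):
//
//   sum = reduce_sum(float_status);  // eight status words, sticky flags
//   found_inf = (sum >= 1.0);        // any non-zero word => nan/inf occurred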
-template -class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - const auto* float_status = ctx.Input("FloatStatus"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - found_inf->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // step1: inverse scale - phi::DenseTensor const_tensor; - const_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); - - // Inverse(1.0/scale) - phi::DenseTensor* tmp_inverse_out = const_cast(scale); - phi::DenseTensor inverse_out(scale->type()); - inverse_out.Resize(scale->dims()); - inverse_out.mutable_data(ctx.GetPlace()); - const auto& runner_inverse = - NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); - runner_inverse.Run(stream); - tmp_inverse_out = &inverse_out; - - // NOTE(zhiqiu): - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. - const auto& runner_float_status = - NpuOpRunner("NPUGetFloatStatus", - {*float_status}, - {tmp}, - {{"message", std::string("check_nan_and_inf")}}); - runner_float_status.Run(stream); - - phi::DenseTensor sum; - sum.mutable_data({1}, ctx.GetPlace()); - const auto& runner_reduce_sum = - NpuOpRunner("ReduceSumD", - {*float_status}, - {sum}, - {{"axes", std::vector{0}}, {"keep_dims", true}}); - runner_reduce_sum.Run(stream); - - const auto& runner_greater = - NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); - runner_greater.Run(stream); - - // NOTE(zhiqiu): The normal logic is : - // out = in, if found_inf = true - // out = in/scale, if found_inf = false - // However, on NPU, in order to avoid stream sync, we do not copy the - // found_inf data to cpu to check whether to unscale or not. - // Instead, we do the Mul no matter found_inf or not. - // And, a fact is, only few steps contains nan/inf during training. - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(ctx.GetPlace()); - const auto& runner_mul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_mul.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, - ops::CheckFiniteAndUnscaleNPUKernel, - ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc deleted file mode 100644 index bf7272ba8b8786..00000000000000 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(check_finite_and_unscale); -USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); - -struct InputVars { - std::string name; - phi::DenseTensor *tensor; -}; - -template -void Compare(f::Scope *scope, const p::DeviceContext &ctx) { - const f::DDim dims = phi::make_ddim({2, 2}); - auto place = ctx.GetPlace(); - - // init input - std::vector input_names = { - {"x", scope->Var("x")->GetMutable()}, - {"x1", scope->Var("x1")->GetMutable()}}; - - auto *scale = scope->Var("scale")->GetMutable(); - - // init output - auto *out = scope->Var("out")->GetMutable(); - auto *out1 = scope->Var("out1")->GetMutable(); - auto *found_inf = scope->Var("found_inf")->GetMutable(); - - // Initialize input data - const int num_inputs = input_names.size(); - size_t numel = static_cast(phi::product(dims)); - - for (int i = 0; i < num_inputs; ++i) { - std::vector init_xs; - for (size_t j = 0; j < numel; ++j) { - if (j == 0) { - init_xs.push_back(static_cast(NAN)); - } else { - init_xs.push_back(static_cast(j + 1)); - } - } - f::TensorFromVector(init_xs, ctx, input_names[i].tensor); - input_names[i].tensor->Resize(dims); - } - - f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - "check_finite_and_unscale", - {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, - {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - // out0 - std::vector out_vec; - f::TensorToVector(*out, ctx, &out_vec); - EXPECT_EQ(out_vec.size(), static_cast(4)); - for (size_t j = 0; j < out_vec.size(); ++j) { - VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; - } - - ctx.Wait(); - - // out0 - std::vector out1_vec; - f::TensorToVector(*out1, ctx, &out1_vec); - EXPECT_EQ(out1_vec.size(), static_cast(4)); - for (size_t j = 0; j < out1_vec.size(); ++j) { - VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; - } - - ctx.Wait(); - - // out found_inf - phi::DenseTensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool *found_inf_data = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_TRUE(*found_inf_data); - - ctx.Wait(); -} - -TEST(check_finite_and_unscale, NPU_fp32) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(check_finite_and_unscale, NPU_fp16) { - f::Scope scope; - auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc b/paddle/fluid/operators/amp/clear_float_status_op_npu.cc deleted file mode 100644 index 1f3e54421f0204..00000000000000 --- a/paddle/fluid/operators/amp/clear_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClearFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // NOTE(zhiqiu): NPUClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - const auto& runner = - NpuOpRunner("NPUClearFloatStatus", {tmp}, {*float_status_out}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - clear_float_status, - ops::ClearFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/get_float_status_op_npu.cc b/paddle/fluid/operators/amp/get_float_status_op_npu.cc deleted file mode 100644 index 5d8f88cc85f26b..00000000000000 --- a/paddle/fluid/operators/amp/get_float_status_op_npu.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class GetFloatStatusKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* float_status = ctx.Input("FloatStatus"); - auto* float_status_out = ctx.Output("FloatStatusOut"); - // GetClearFloatStatus modifies the input. - PADDLE_ENFORCE_EQ(float_status_out, - float_status, - platform::errors::PreconditionNotMet( - "The input(FloatStatus) and Output(FloatStatusOut) " - "should be the same.")); - phi::DenseTensor tmp; - tmp.mutable_data({8}, ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - // NPUGetFloatStatus updates data on input in-place. - // tmp is only placeholder. 
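// For context, one plausible sequencing of the float-status ops within an
// AMP step (inferred from the NOTE comments in this patch, not from a
// documented contract):
//
//   alloc_float_status    // create the 8-word status buffer once
//   ... compute ops that may raise inf/nan ...
//   get_float_status      // refresh the sticky flags in place (this kernel)
//   clear_float_status    // reset the flags before the next step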
- NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}).Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - get_float_status, - ops::GetFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc deleted file mode 100644 index d4565c1780928e..00000000000000 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" - -DECLARE_int32(min_loss_scaling); - -namespace paddle { -namespace operators { - -template -void Update(const platform::NPUDeviceContext& ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) { - auto place = ctx.GetPlace(); - auto stream = ctx.stream(); - if (found_inf_vec[0]) { - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - // bad_out_data = bad_in_data + 1 - phi::DenseTensor factor_tensor(bad_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*bad_in_tensor, factor_tensor}, {*bad_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector bad_out_data; - paddle::framework::TensorToVector(*bad_out_tensor, ctx, &bad_out_data); - if (bad_out_data[0] >= decr_every_n_nan_or_inf) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", decr_ratio}, - {"shift", static_cast(0)}}); - - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - float min_value = 1.0; - if (FLAGS_min_loss_scaling > 1) { - min_value = static_cast(FLAGS_min_loss_scaling); - } - - if (new_loss_scaling[0] < min_value) { - // updated_loss_scaling_data = 1 - const auto& runner_p4 = - NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(min_value)}}); - - runner_p4.Run(stream); - } - - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * 
sizeof(int), - stream); - } - } else { - // bad_out_data = 0 - auto b = bad_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(b), - 0, - bad_out_tensor->numel() * sizeof(int), - stream); - - // good_out_data = good_in_data + 1 - phi::DenseTensor factor_tensor(good_out_tensor->dtype()); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - const auto& runner_p2 = NpuOpRunner( - "Add", {*good_in_tensor, factor_tensor}, {*good_out_tensor}, {}); - runner_p2.Run(stream); - - std::vector good_out_data; - paddle::framework::TensorToVector(*good_out_tensor, ctx, &good_out_data); - - if (good_out_data[0] >= incr_every_n_steps) { - const auto& runner_p3 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", incr_ratio}, - {"shift", static_cast(0)}}); - runner_p3.Run(stream); - - std::vector new_loss_scaling; - paddle::framework::TensorToVector( - *updated_loss_scaling_tensor, ctx, &new_loss_scaling); - if (!std::isfinite(new_loss_scaling[0])) { - // updated_loss_scaling_data = pre_loss_scaling_data - const auto& runner_p4 = NpuOpRunner("Power", - {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(1)}, - {"shift", static_cast(0)}}); - - runner_p4.Run(stream); - } - // good_out_data = 0 - auto g = good_out_tensor->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), - 0, - good_out_tensor->numel() * sizeof(int), - stream); - } - } -} - -template -class UpdateLossScalingFunctor { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const phi::DenseTensor* pre_loss_scaling_tensor, - const phi::DenseTensor* good_in_tensor, - const phi::DenseTensor* bad_in_tensor, - const int incr_every_n_steps, - const int decr_every_n_nan_or_inf, - const float incr_ratio, - const float decr_ratio, - phi::DenseTensor* updated_loss_scaling_tensor, - phi::DenseTensor* good_out_tensor, - phi::DenseTensor* bad_out_tensor) const { - Update(dev_ctx, - found_inf_vec, - pre_loss_scaling_tensor, - good_in_tensor, - bad_in_tensor, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling_tensor, - good_out_tensor, - bad_out_tensor); - } -}; - -template -class LazyZerosNPU { - public: - void operator()(const platform::NPUDeviceContext& dev_ctx, - const std::vector found_inf_vec, - const std::vector& xs, - const std::vector& outs) const { - if (!xs.size()) { - return; - } - auto place = dev_ctx.GetPlace(); - auto stream = dev_ctx.stream(); - phi::DenseTensor* zero_tensor = nullptr; - void* zero_ptr = nullptr; - if (found_inf_vec[0]) { - int max_num = -1; - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - int num = out->numel(); - if (max_num < num) { - max_num = num; - zero_tensor = out; - } - } - - zero_tensor->mutable_data(place); - const auto& runner_zeros = - NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); - runner_zeros.Run(stream); - zero_tensor->check_memory_size(); - zero_ptr = zero_tensor->data(); - } - - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - auto* x = xs[i]; - auto dst_ptr = out->mutable_data(place); - if (!found_inf_vec[0]) { - framework::TensorCopy(*x, place, dev_ctx, out); - } else if (zero_ptr != dst_ptr) { - auto size = out->numel() * phi::SizeOf(out->dtype()); - memory::Copy(place, dst_ptr, place, zero_ptr, size, stream); - } - } - } -}; - -template 
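// The Update() routine above implements the usual dynamic loss-scaling
// policy applied by this kernel. In scalar form (a sketch; the kernel
// performs the same steps with NPU tensor ops):
//
//   if (found_inf) {
//     good = 0; bad += 1;
//     if (bad >= decr_every_n_nan_or_inf) {
//       scale = max(scale * decr_ratio, min_loss_scaling); bad = 0;
//     }
//   } else {
//     bad = 0; good += 1;
//     if (good >= incr_every_n_steps) {
//       if (isfinite(scale * incr_ratio)) scale *= incr_ratio;
//       good = 0;
//     }
//   }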
-class UpdateLossScalingNPUKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - - const auto xs = ctx.MultiInput("X"); - auto outs = ctx.MultiOutput("Out"); - const auto* found_inf = ctx.Input("FoundInfinite"); - PADDLE_ENFORCE_EQ(found_inf->numel(), - 1, - platform::errors::InvalidArgument( - "FoundInfinite must has only one element.")); - - std::vector found_inf_vec; - paddle::framework::TensorToVector( - *found_inf, ctx.device_context(), &found_inf_vec); - - LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); - const bool stop_update = ctx.Attr("stop_update"); - if (stop_update) { - return; - } - - const auto* pre_loss_scaling = - ctx.Input("PrevLossScaling"); - const auto* good_in = ctx.Input("InGoodSteps"); - const auto* bad_in = ctx.Input("InBadSteps"); - auto* updated_loss_scaling = ctx.Output("LossScaling"); - auto* good_out = ctx.Output("OutGoodSteps"); - auto* bad_out = ctx.Output("OutBadSteps"); - - updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); - good_out->mutable_data(dev_ctx.GetPlace()); - bad_out->mutable_data(dev_ctx.GetPlace()); - - const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); - const int decr_every_n_nan_or_inf = - ctx.Attr("decr_every_n_nan_or_inf"); - const float incr_ratio = ctx.Attr("incr_ratio"); - const float decr_ratio = ctx.Attr("decr_ratio"); - UpdateLossScalingFunctor{}(dev_ctx, - found_inf_vec, - pre_loss_scaling, - good_in, - bad_in, - incr_every_n_steps, - decr_every_n_nan_or_inf, - incr_ratio, - decr_ratio, - updated_loss_scaling, - good_out, - bad_out); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - update_loss_scaling, - ops::UpdateLossScalingNPUKernel, - ops::UpdateLossScalingNPUKernel); From 2b0fffc2ae0a8b9ee808b5f69ea27d45ac6df17d Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Mon, 10 Apr 2023 17:26:19 +0800 Subject: [PATCH 027/156] =?UTF-8?q?=E3=80=90Hackathon=20No57=E3=80=91=20ad?= =?UTF-8?q?d=20fp16=20&=20bf16=20for=20flip,=20fp16=20for=20gaussian=20(#5?= =?UTF-8?q?2380)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add_fp_bf_for_flip_gaussian_random * forget convert uint * fix some error * fix some error --- paddle/phi/kernels/gpu/flip_kernel.cu | 1 + .../paddle/fluid/tests/unittests/test_flip.py | 95 ++++++++++++++++++- .../unittests/test_gaussian_random_op.py | 43 +++++++++ 3 files changed, 135 insertions(+), 4 deletions(-) diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index d48ecc4dfac4cc..812d68df92d932 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -145,6 +145,7 @@ PD_REGISTER_KERNEL(flip, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t, bool, diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py index 197766a5563e4e..a06ef10ca06130 100644 --- a/python/paddle/fluid/tests/unittests/test_flip.py +++ b/python/paddle/fluid/tests/unittests/test_flip.py @@ -17,7 +17,7 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid @@ -74,9 +74,27 @@ def setUp(self): self.op_type = 'flip' 
self.python_api = paddle.tensor.flip self.init_test_case() - self.inputs = {'X': np.random.random(self.in_shape).astype('float64')} self.init_attrs() - self.outputs = {'Out': self.calc_ref_res()} + self.init_dtype() + + if self.is_bfloat16_op(): + self.input = np.random.random(self.in_shape).astype(np.float32) + else: + self.input = np.random.random(self.in_shape).astype(self.dtype) + + output = self.calc_ref_res() + + if self.is_bfloat16_op(): + output = output.astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(self.input)} + self.outputs = {'Out': convert_float_to_uint16(output)} + else: + self.inputs = {'X': self.input.astype(self.dtype)} + output = output.astype(self.dtype) + self.outputs = {'Out': output} + + def init_dtype(self): + self.dtype = np.float64 def init_attrs(self): self.attrs = {"axis": self.axis} @@ -92,7 +110,7 @@ def init_test_case(self): self.axis = [0, 1] def calc_ref_res(self): - res = self.inputs['X'] + res = self.input if isinstance(self.axis, int): return np.flip(res, self.axis) for axis in self.axis: @@ -136,6 +154,75 @@ def init_test_case(self): self.axis = [-1] +# ----------------flip_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestFlipFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, ["X"], "Out") + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestFlipFP16.__name__ = cls_name + globals()[cls_name] = TestFlipFP16 + + +create_test_fp16_class(TestFlipOp) +create_test_fp16_class(TestFlipOpAxis1) +create_test_fp16_class(TestFlipOpAxis2) +create_test_fp16_class(TestFlipOpAxis3) +create_test_fp16_class(TestFlipOpAxis4) +create_test_fp16_class(TestFlipOpEmptyAxis) +create_test_fp16_class(TestFlipOpNegAxis) + + +# ----------------flip_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestFlipBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place(place, ["X"], "Out") + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestFlipBF16.__name__ = cls_name + globals()[cls_name] = TestFlipBF16 + + +create_test_bf16_class(TestFlipOp) +create_test_bf16_class(TestFlipOpAxis1) +create_test_bf16_class(TestFlipOpAxis2) +create_test_bf16_class(TestFlipOpAxis3) +create_test_bf16_class(TestFlipOpAxis4) +create_test_bf16_class(TestFlipOpEmptyAxis) +create_test_bf16_class(TestFlipOpNegAxis) + + class TestFlipDoubleGradCheck(unittest.TestCase): def flip_wrapper(self, x): return paddle.flip(x[0], [0, 1]) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 3b8bf23e6003bd..f735835cb58804 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -64,6 +64,49 @@ def verify_output(self, outs): np.testing.assert_allclose(hist, hist2, rtol=0, atol=0.01) +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestGaussianRandomFP16Op(OpTest): + def setUp(self): + self.op_type = "gaussian_random" + self.python_api = paddle.normal + self.set_attrs() + self.inputs = {} + self.use_mkldnn = False + self.attrs = { + "shape": [123, 92], + "mean": self.mean, + "std": self.std, + "seed": 10, + "dtype": paddle.fluid.core.VarDesc.VarType.FP16, + "use_mkldnn": self.use_mkldnn, + } + paddle.seed(10) + + self.outputs = {'Out': np.zeros((123, 92), dtype='float16')} + + def set_attrs(self): + self.mean = 1.0 + self.std = 2.0 + + def test_check_output(self): + self.check_output_with_place_customized( + self.verify_output, place=core.CUDAPlace(0) + ) + + def verify_output(self, outs): + self.assertEqual(outs[0].shape, (123, 92)) + hist, _ = np.histogram(outs[0], range=(-3, 5)) + hist = hist.astype("float16") + hist /= float(outs[0].size) + data = np.random.normal(size=(123, 92), loc=1, scale=2) + hist2, _ = np.histogram(data, range=(-3, 5)) + hist2 = hist2.astype("float16") + hist2 /= float(outs[0].size) + np.testing.assert_allclose(hist, hist2, rtol=0, atol=0.015) + + @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) From a7707efbb8ec33a91b5ed38e4fdd4fcc69b7d1fe Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Mon, 10 Apr 2023 18:22:59 +0800 Subject: [PATCH 028/156] delete paddle/fluid/operators/*_npu.* (#52678) * delete paddle/fluid/operators/*_npu.* * try pass CI * try pass CI --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/abs_op_npu.cc | 76 -- paddle/fluid/operators/activation_op_npu.cc | 1116 ----------------- paddle/fluid/operators/argsort_op_npu.cc | 286 ----- paddle/fluid/operators/assign_op_npu.cc | 58 - paddle/fluid/operators/assign_op_npu_test.cc | 79 -- paddle/fluid/operators/assign_value_op_npu.cc | 23 - paddle/fluid/operators/batch_norm_op_npu.cc | 261 ---- paddle/fluid/operators/bce_loss_op_npu.cc | 81 -- paddle/fluid/operators/beam_search_op_npu.cc | 25 - paddle/fluid/operators/cast_op_npu.cc | 98 -- paddle/fluid/operators/clip_by_norm_op_npu.cc | 93 -- paddle/fluid/operators/clip_op_npu.cc | 122 -- paddle/fluid/operators/concat_op_npu.cc | 136 -- paddle/fluid/operators/conv_op_npu.cc | 688 ---------- .../fluid/operators/conv_transpose_op_npu.cc | 317 ----- paddle/fluid/operators/crop_op_npu.cc | 113 -- paddle/fluid/operators/cumsum_op_npu.cc | 110 -- paddle/fluid/operators/dropout_op_npu.cc | 212 ---- paddle/fluid/operators/expand_as_v2_op_npu.cc | 104 -- paddle/fluid/operators/expand_op_npu.cc | 124 -- paddle/fluid/operators/expand_op_npu_test.cc | 75 -- paddle/fluid/operators/expand_v2_op_npu.cc | 235 ---- paddle/fluid/operators/eye_op_npu.cc | 57 - .../fluid/operators/fill_any_like_op_npu.cc | 88 -- .../fill_constant_batch_size_like_op_npu.cc | 109 -- .../fluid/operators/fill_constant_op_npu.cc | 113 -- .../fluid/operators/fill_zeros_like_op_npu.cc | 49 - paddle/fluid/operators/flatten_op_npu.cc | 148 --- paddle/fluid/operators/gather_nd_op_npu.cc | 120 -- paddle/fluid/operators/gather_op_npu.cc | 98 -- paddle/fluid/operators/gather_op_npu_test.cc | 171 --- .../fluid/operators/gaussian_random_op_npu.cc | 60 - paddle/fluid/operators/gelu_op_npu.cc | 90 -- paddle/fluid/operators/gelu_op_npu_test.cc | 167 --- 
paddle/fluid/operators/group_norm_op_npu.cc | 327 ----- paddle/fluid/operators/huber_loss_op_npu.cc | 144 --- paddle/fluid/operators/increment_op_npu.cc | 55 - .../fluid/operators/increment_op_npu_test.cc | 81 -- paddle/fluid/operators/index_sample_op_npu.cc | 138 -- paddle/fluid/operators/index_select_op_npu.cc | 161 --- .../fluid/operators/instance_norm_op_npu.cc | 91 -- paddle/fluid/operators/interpolate_op_npu.cc | 226 ---- .../fluid/operators/interpolate_v2_op_npu.cc | 812 ------------ paddle/fluid/operators/is_empty_op_npu.cc | 23 - paddle/fluid/operators/kldiv_loss_op_npu.cc | 170 --- paddle/fluid/operators/label_smooth_op_npu.cc | 114 -- paddle/fluid/operators/layer_norm_op_npu.cc | 449 ------- paddle/fluid/operators/load_combine_op_npu.cc | 25 - paddle/fluid/operators/load_op_npu.cc | 134 -- paddle/fluid/operators/log_loss_op_npu.cc | 130 -- paddle/fluid/operators/log_softmax_op_npu.cc | 78 -- .../fluid/operators/lookup_table_v2_op_npu.cc | 176 --- .../fluid/operators/masked_select_op_npu.cc | 202 --- paddle/fluid/operators/matmul_op_npu.cc | 561 --------- paddle/fluid/operators/matmul_v2_op_npu.cc | 480 ------- paddle/fluid/operators/mean_op_npu.cc | 102 -- paddle/fluid/operators/meshgrid_op_npu.cc | 88 -- paddle/fluid/operators/mul_op_npu.cc | 274 ---- paddle/fluid/operators/multinomial_op_npu.cc | 57 - paddle/fluid/operators/norm_op_npu.cc | 107 -- paddle/fluid/operators/one_hot_op_npu.cc | 81 -- paddle/fluid/operators/one_hot_v2_op_npu.cc | 80 -- paddle/fluid/operators/p_norm_op_npu.cc | 228 ---- paddle/fluid/operators/pad3d_op_npu.cc | 147 --- paddle/fluid/operators/pad_op_npu.cc | 96 -- paddle/fluid/operators/pool_op_npu.cc | 334 ----- paddle/fluid/operators/randperm_op_npu.cc | 23 - paddle/fluid/operators/range_op_npu.cc | 81 -- paddle/fluid/operators/range_op_npu_test.cc | 93 -- paddle/fluid/operators/reshape_op_npu.cc | 167 --- paddle/fluid/operators/roi_align_op_npu.cc | 200 --- paddle/fluid/operators/run_program_op_npu.cc | 13 - paddle/fluid/operators/sampling_id_op_npu.cc | 20 - paddle/fluid/operators/save_combine_op_npu.cc | 24 - paddle/fluid/operators/save_op_npu.cc | 29 - paddle/fluid/operators/scale_op_npu.cc | 114 -- paddle/fluid/operators/scatter_op_npu.cc | 13 - paddle/fluid/operators/seed_op_npu.cc | 47 - paddle/fluid/operators/set_value_op_npu.cc | 198 --- paddle/fluid/operators/shape_op_npu.cc | 56 - paddle/fluid/operators/shard_index_op_npu.cc | 121 -- ...igmoid_cross_entropy_with_logits_op_npu.cc | 107 -- paddle/fluid/operators/size_op_npu.cc | 54 - paddle/fluid/operators/slice_op_npu.cc | 254 ---- .../fluid/operators/smooth_l1_loss_op_npu.cc | 218 ---- paddle/fluid/operators/softmax_op_npu.cc | 103 -- paddle/fluid/operators/softmax_op_npu_test.cc | 171 --- .../softmax_with_cross_entropy_op_npu.cc | 141 --- paddle/fluid/operators/split_op_npu.cc | 80 -- .../fluid/operators/squared_l2_norm_op_npu.cc | 101 -- paddle/fluid/operators/squeeze_op_npu.cc | 59 - paddle/fluid/operators/squeeze_op_npu_test.cc | 88 -- paddle/fluid/operators/stack_op_npu.cc | 101 -- .../fluid/operators/strided_slice_op_npu.cc | 480 ------- paddle/fluid/operators/sum_op_npu.cc | 124 -- .../fluid/operators/sync_batch_norm_op_npu.cc | 1105 ---------------- .../fluid/operators/take_along_axis_op_npu.cc | 86 -- paddle/fluid/operators/tile_op_npu.cc | 138 -- paddle/fluid/operators/top_k_op_npu.cc | 101 -- paddle/fluid/operators/top_k_v2_op_npu.cc | 100 -- paddle/fluid/operators/transpose_op_npu.cc | 91 -- .../fluid/operators/transpose_op_npu_test.cc | 141 --- 
paddle/fluid/operators/tril_triu_op_npu.cc | 90 -- .../truncated_gaussian_random_op_npu.cc | 112 -- .../fluid/operators/uniform_random_op_npu.cc | 115 -- paddle/fluid/operators/unsqueeze_op_npu.cc | 13 - .../fluid/operators/unsqueeze_op_npu_test.cc | 88 -- paddle/fluid/operators/unstack_op_npu.cc | 86 -- paddle/fluid/operators/where_index_op_npu.cc | 105 -- paddle/fluid/operators/where_op_npu.cc | 96 -- 111 files changed, 1 insertion(+), 17621 deletions(-) delete mode 100644 paddle/fluid/operators/abs_op_npu.cc delete mode 100644 paddle/fluid/operators/activation_op_npu.cc delete mode 100644 paddle/fluid/operators/argsort_op_npu.cc delete mode 100644 paddle/fluid/operators/assign_op_npu.cc delete mode 100644 paddle/fluid/operators/assign_op_npu_test.cc delete mode 100644 paddle/fluid/operators/assign_value_op_npu.cc delete mode 100644 paddle/fluid/operators/batch_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/bce_loss_op_npu.cc delete mode 100644 paddle/fluid/operators/beam_search_op_npu.cc delete mode 100644 paddle/fluid/operators/cast_op_npu.cc delete mode 100644 paddle/fluid/operators/clip_by_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/clip_op_npu.cc delete mode 100644 paddle/fluid/operators/concat_op_npu.cc delete mode 100644 paddle/fluid/operators/conv_op_npu.cc delete mode 100644 paddle/fluid/operators/conv_transpose_op_npu.cc delete mode 100644 paddle/fluid/operators/crop_op_npu.cc delete mode 100644 paddle/fluid/operators/cumsum_op_npu.cc delete mode 100644 paddle/fluid/operators/dropout_op_npu.cc delete mode 100644 paddle/fluid/operators/expand_as_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/expand_op_npu.cc delete mode 100644 paddle/fluid/operators/expand_op_npu_test.cc delete mode 100644 paddle/fluid/operators/expand_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/eye_op_npu.cc delete mode 100644 paddle/fluid/operators/fill_any_like_op_npu.cc delete mode 100644 paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc delete mode 100644 paddle/fluid/operators/fill_constant_op_npu.cc delete mode 100644 paddle/fluid/operators/fill_zeros_like_op_npu.cc delete mode 100644 paddle/fluid/operators/flatten_op_npu.cc delete mode 100644 paddle/fluid/operators/gather_nd_op_npu.cc delete mode 100644 paddle/fluid/operators/gather_op_npu.cc delete mode 100644 paddle/fluid/operators/gather_op_npu_test.cc delete mode 100644 paddle/fluid/operators/gaussian_random_op_npu.cc delete mode 100644 paddle/fluid/operators/gelu_op_npu.cc delete mode 100644 paddle/fluid/operators/gelu_op_npu_test.cc delete mode 100644 paddle/fluid/operators/group_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/huber_loss_op_npu.cc delete mode 100644 paddle/fluid/operators/increment_op_npu.cc delete mode 100644 paddle/fluid/operators/increment_op_npu_test.cc delete mode 100644 paddle/fluid/operators/index_sample_op_npu.cc delete mode 100644 paddle/fluid/operators/index_select_op_npu.cc delete mode 100644 paddle/fluid/operators/instance_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/interpolate_op_npu.cc delete mode 100644 paddle/fluid/operators/interpolate_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/is_empty_op_npu.cc delete mode 100644 paddle/fluid/operators/kldiv_loss_op_npu.cc delete mode 100644 paddle/fluid/operators/label_smooth_op_npu.cc delete mode 100644 paddle/fluid/operators/layer_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/load_combine_op_npu.cc delete mode 100644 paddle/fluid/operators/load_op_npu.cc delete mode 100644 
paddle/fluid/operators/log_loss_op_npu.cc delete mode 100644 paddle/fluid/operators/log_softmax_op_npu.cc delete mode 100644 paddle/fluid/operators/lookup_table_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/masked_select_op_npu.cc delete mode 100644 paddle/fluid/operators/matmul_op_npu.cc delete mode 100644 paddle/fluid/operators/matmul_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/mean_op_npu.cc delete mode 100644 paddle/fluid/operators/meshgrid_op_npu.cc delete mode 100644 paddle/fluid/operators/mul_op_npu.cc delete mode 100644 paddle/fluid/operators/multinomial_op_npu.cc delete mode 100644 paddle/fluid/operators/norm_op_npu.cc delete mode 100644 paddle/fluid/operators/one_hot_op_npu.cc delete mode 100644 paddle/fluid/operators/one_hot_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/p_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/pad3d_op_npu.cc delete mode 100644 paddle/fluid/operators/pad_op_npu.cc delete mode 100644 paddle/fluid/operators/pool_op_npu.cc delete mode 100644 paddle/fluid/operators/randperm_op_npu.cc delete mode 100644 paddle/fluid/operators/range_op_npu.cc delete mode 100644 paddle/fluid/operators/range_op_npu_test.cc delete mode 100644 paddle/fluid/operators/reshape_op_npu.cc delete mode 100644 paddle/fluid/operators/roi_align_op_npu.cc delete mode 100644 paddle/fluid/operators/run_program_op_npu.cc delete mode 100644 paddle/fluid/operators/sampling_id_op_npu.cc delete mode 100644 paddle/fluid/operators/save_combine_op_npu.cc delete mode 100644 paddle/fluid/operators/save_op_npu.cc delete mode 100644 paddle/fluid/operators/scale_op_npu.cc delete mode 100644 paddle/fluid/operators/scatter_op_npu.cc delete mode 100644 paddle/fluid/operators/seed_op_npu.cc delete mode 100644 paddle/fluid/operators/set_value_op_npu.cc delete mode 100644 paddle/fluid/operators/shape_op_npu.cc delete mode 100644 paddle/fluid/operators/shard_index_op_npu.cc delete mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc delete mode 100644 paddle/fluid/operators/size_op_npu.cc delete mode 100644 paddle/fluid/operators/slice_op_npu.cc delete mode 100644 paddle/fluid/operators/smooth_l1_loss_op_npu.cc delete mode 100644 paddle/fluid/operators/softmax_op_npu.cc delete mode 100644 paddle/fluid/operators/softmax_op_npu_test.cc delete mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc delete mode 100644 paddle/fluid/operators/split_op_npu.cc delete mode 100644 paddle/fluid/operators/squared_l2_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/squeeze_op_npu.cc delete mode 100644 paddle/fluid/operators/squeeze_op_npu_test.cc delete mode 100644 paddle/fluid/operators/stack_op_npu.cc delete mode 100644 paddle/fluid/operators/strided_slice_op_npu.cc delete mode 100644 paddle/fluid/operators/sum_op_npu.cc delete mode 100644 paddle/fluid/operators/sync_batch_norm_op_npu.cc delete mode 100644 paddle/fluid/operators/take_along_axis_op_npu.cc delete mode 100644 paddle/fluid/operators/tile_op_npu.cc delete mode 100644 paddle/fluid/operators/top_k_op_npu.cc delete mode 100644 paddle/fluid/operators/top_k_v2_op_npu.cc delete mode 100644 paddle/fluid/operators/transpose_op_npu.cc delete mode 100644 paddle/fluid/operators/transpose_op_npu_test.cc delete mode 100644 paddle/fluid/operators/tril_triu_op_npu.cc delete mode 100644 paddle/fluid/operators/truncated_gaussian_random_op_npu.cc delete mode 100644 paddle/fluid/operators/uniform_random_op_npu.cc delete mode 100644 paddle/fluid/operators/unsqueeze_op_npu.cc delete mode 100644 
paddle/fluid/operators/unsqueeze_op_npu_test.cc delete mode 100644 paddle/fluid/operators/unstack_op_npu.cc delete mode 100644 paddle/fluid/operators/where_index_op_npu.cc delete mode 100644 paddle/fluid/operators/where_op_npu.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 1d6e75c4b5d71c..a30909322eccfe 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -96,7 +96,7 @@ register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 genera recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS}) -op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc run_program_op_npu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) target_link_libraries(run_program_op cuda_graph_with_memory_pool) op_library(quantize_linear_op DEPS phi) op_library(save_combine_op DEPS string_array phi) diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc deleted file mode 100644 index 0a859d1f564a91..00000000000000 --- a/paddle/fluid/operators/abs_op_npu.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AbsNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Abs", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class AbsGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - abs, - ops::AbsNPUKernel, - ops::AbsNPUKernel); - -REGISTER_OP_NPU_KERNEL( - abs_grad, - ops::AbsGradNPUKernel, - ops::AbsGradNPUKernel); diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc deleted file mode 100644 index 9f3392f2eabc57..00000000000000 --- a/paddle/fluid/operators/activation_op_npu.cc +++ /dev/null @@ -1,1116 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. 
*/ - -#include -#include - -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class PowNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto factor = ctx.Attr("factor"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Power", - {*x}, - {*out}, - {{"power", factor}, - {"scale", static_cast(1.0)}, - {"shift", static_cast(0.0)}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PowGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto factor = ctx.Attr("factor"); - - auto x_dims = x->dims(); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(liym27): dx = dout * factor * x.pow(factor-1) - - // Step1: Compute x_pow = x.pow(factor-1) - phi::DenseTensor x_pow(x->type()); - x_pow.mutable_data(x->dims(), place); - const auto& runner_pow = NpuOpRunner( - "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); - runner_pow.Run(stream); - - // Step 2: Construct a broadcast factor, which has the same shape with x. - - // 2.1 Get a factor tensor with shape [1]. - phi::DenseTensor factor_tensor(phi::DataType::FLOAT32); - factor_tensor.mutable_data({1}, place); - FillNpuTensorWithConstant(&factor_tensor, factor); - - // 2.2 Get the factor which has the shape with x and the same value with - // factor. 
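// (The one-element factor_tensor is materialized into a full tensor of x's
// shape via FillD, apparently so that the Mul below operates on two
// same-shaped operands instead of relying on implicit broadcasting.)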
- phi::DenseTensor factor_bc_tensor(phi::DataType::FLOAT32); - factor_bc_tensor.mutable_data(x_dims, place); - const auto& runner_bc = NpuOpRunner("FillD", - {factor_tensor}, - {factor_bc_tensor}, - {{"dims", phi::vectorize(x_dims)}}); - runner_bc.Run(stream); - - // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) - phi::DenseTensor x_power_mul_factor(x->type()); - x_power_mul_factor.mutable_data(x->dims(), place); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); - runner_mul_1.Run(stream); - - // Step 4: Compute dx = dout * factor * x.pow(factor-1) - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class ReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class Relu6NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Relu6", - { - *x, - }, - {*out}, - {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Relu6GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); - - runner.Run(stream); - } -}; - -template -class SqrtNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class LeakyReluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto alpha = ctx.Attr("alpha"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("LeakyRelu", {*x}, {*out}, {{"negative_slope", alpha}}); - runner.Run(stream); - } -}; - -template -class 
LeakyReluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto alpha = ctx.Attr("alpha"); - - auto stream = - ctx.template device_context() - .stream(); - - dx->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "LeakyReluGrad", {*dout, *x}, {*dx}, {{"negative_slope", alpha}}); - - runner.Run(stream); - } -}; - -template -class SqrtGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class LogNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor one(x->type()); - one.mutable_data(x->dims(), place); - const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); - runner_one.Run(stream); - - phi::DenseTensor sub(x->type()); - sub.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - runner_sub.Run(stream); - - const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); - runner_out.Run(stream); - } -}; - -template -class LogGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class TanhNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class TanhGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class SquareNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto 
place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SquareGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto factor = static_cast(2.0); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - // Step 1: Compute x_muls_factor = factor * x - phi::DenseTensor x_muls_factor(x->type()); - x_muls_factor.mutable_data(x->dims(), place); - const auto& runner_muls_1 = - NpuOpRunner("Muls", {*x}, {x_muls_factor}, {{"value", factor}}); - runner_muls_1.Run(stream); - - // Step 2: Compute dx = dout * factor * x - dx->mutable_data(place); - const auto& runner_mul_2 = - NpuOpRunner("Mul", {*dout, x_muls_factor}, {*dx}, {}); - runner_mul_2.Run(stream); - } -}; - -template -class SigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sigmoid", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class SigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("SigmoidGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -// Swish = x * sigmoid(beta * x) -template -class SwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float beta = ctx.Attr("beta"); - - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {*out}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = NpuOpRunner("Sigmoid", {*out}, {*out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner("Mul", {*x, *out}, {*out}); - mul_runner.Run(stream); - } -}; - -template -class SwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - float beta = ctx.Attr("beta"); - - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor beta_x, sigmoid_out, swish_out; - beta_x.mutable_data(x->dims(), ctx.GetPlace()); - sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); - swish_out.mutable_data(x->dims(), ctx.GetPlace()); - const auto& muls_runner = - NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); - muls_runner.Run(stream); - - const auto& sigmoid_runner = - 
NpuOpRunner("Sigmoid", {beta_x}, {sigmoid_out}, {}); - sigmoid_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); - mul_runner.Run(stream); - const auto& muls_runner2 = - NpuOpRunner("Muls", {swish_out}, {swish_out}, {{"value", beta}}); - muls_runner2.Run(stream); - - const auto& mul_runner1 = - NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); - mul_runner1.Run(stream); - - const auto& sub_runner = NpuOpRunner("Sub", {swish_out, *dx}, {*dx}, {}); - sub_runner.Run(stream); - - const auto& add_runner = NpuOpRunner("Add", {sigmoid_out, *dx}, {*dx}, {}); - add_runner.Run(stream); - - const auto& mul_runner2 = NpuOpRunner("Mul", {*dout, *dx}, {*dx}, {}); - mul_runner2.Run(stream); - } -}; - -// HardSwish = min(max(0, x+offset), threshold) * x / scale -template -class HardSwishNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold, static_cast(threshold)); - - phi::DenseTensor tensor_zero(x->type()); - tensor_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_zero, static_cast(0.0)); - - phi::DenseTensor clip_val(x->type()); - clip_val.mutable_data(x->dims(), place); - const auto& runner_clip = - NpuOpRunner("ClipByValue", - {add_offset_val, tensor_zero, tensor_threshold}, - {clip_val}); - runner_clip.Run(stream); - - phi::DenseTensor tensor_scale_tmp(x->type()); - tensor_scale_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_scale_tmp, static_cast(scale)); - phi::DenseTensor tensor_scale(x->type()); - tensor_scale.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_scale_tmp}, - {tensor_scale}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor div_val(x->type()); - div_val.mutable_data(x->dims(), place); - const auto& runner_div = - NpuOpRunner("Div", {clip_val, tensor_scale}, {div_val}); - runner_div.Run(stream); - - const auto& runner_mul = NpuOpRunner("Mul", {*x, div_val}, {*out}); - runner_mul.Run(stream); - } -}; - -template -class HardSwishGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - float threshold = ctx.Attr("threshold"); - float scale = ctx.Attr("scale"); - float offset = ctx.Attr("offset"); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_offset(x->type()); - tensor_offset.mutable_data({1}, place); - 
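// NOTE (sketch): the op sequence below reconstructs the hard_swish
// derivative piecewise. With u = x + offset and
// hard_swish(x) = x * clip(u, 0, threshold) / scale, the derivative is
//   0                       for u <= 0,
//   (2x + offset) / scale   for 0 < u < threshold,
//   threshold / scale       for u >= threshold.
// Power(scale=2, shift=offset) builds 2x + offset, HardtanhGrad masks it to
// the middle region, Power(scale=1/scale, shift=1) rescales and biases, and
// subtracting the casted (u < threshold) mask fixes the outer regions. The
// saturated branch then comes out as exactly 1, which matches
// threshold / scale only because the defaults use threshold == scale == 6.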
FillNpuTensorWithConstant(&tensor_offset, static_cast(offset)); - - phi::DenseTensor add_offset_val(x->type()); - add_offset_val.mutable_data(x->dims(), place); - const auto& runner_add = - NpuOpRunner("AddV2", {*x, tensor_offset}, {add_offset_val}); - runner_add.Run(stream); - - phi::DenseTensor tmp1(x->type()); - tmp1.mutable_data(x->dims(), place); - const auto& runner_pow1 = NpuOpRunner( - "Power", {*x}, {tmp1}, {{"scale", 2.0f}, {"shift", offset}}); - runner_pow1.Run(stream); - - phi::DenseTensor tmp2(x->type()); - tmp2.mutable_data(x->dims(), place); - const auto& runner_ht_grad = - NpuOpRunner("HardtanhGrad", - {add_offset_val, tmp1}, - {tmp2}, - {{"min_val", 0.0f}, {"max_val", threshold}}); - runner_ht_grad.Run(stream); - - phi::DenseTensor tmp3(x->type()); - tmp3.mutable_data(x->dims(), place); - const auto& runner_pow2 = NpuOpRunner( - "Power", {tmp2}, {tmp3}, {{"scale", 1.0f / scale}, {"shift", 1.0f}}); - runner_pow2.Run(stream); - - phi::DenseTensor tensor_threshold_tmp(x->type()); - tensor_threshold_tmp.mutable_data({1}, place); - FillNpuTensorWithConstant(&tensor_threshold_tmp, - static_cast(threshold)); - phi::DenseTensor tensor_threshold(x->type()); - tensor_threshold.mutable_data(x->dims(), place); - const auto& runner_fill = - NpuOpRunner("FillD", - {tensor_threshold_tmp}, - {tensor_threshold}, - {{"dims", phi::vectorize(x->dims())}}); - runner_fill.Run(stream); - - phi::DenseTensor tmp_bool(phi::DataType::BOOL); - tmp_bool.mutable_data(x->dims(), place); - const auto& runner_less = - NpuOpRunner("Less", {add_offset_val, tensor_threshold}, {tmp_bool}); - runner_less.Run(stream); - phi::DenseTensor tmp4(x->type()); - tmp4.mutable_data(x->dims(), place); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast = - NpuOpRunner("Cast", - {tmp_bool}, - {tmp4}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast.Run(stream); - - phi::DenseTensor tmp5(x->type()); - tmp5.mutable_data(x->dims(), place); - const auto& runner_sub = NpuOpRunner("Sub", {tmp3, tmp4}, {tmp5}); - runner_sub.Run(stream); - - const auto& runner_final = NpuOpRunner("Mul", {tmp5, *dout}, {*dx}); - runner_final.Run(stream); - } -}; - -template -class HardSigmoidNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("HardSigmoid", {*x}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class HardSigmoidGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = ctx.Input("Out"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - float slope = ctx.Attr("slope"); - float offset = ctx.Attr("offset"); - - dx->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {{"alpha", slope}, - {"beta", offset}}; - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner_dx = - NpuOpRunner("HardSigmoidGrad", {*dout, *out}, {*dx}, attr_input); - runner_dx.Run(stream); - } -}; - -template -class ReciprocalNPUKernel : public framework::OpKernel { - 
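// NOTE (sketch): for the HardSigmoid kernels above, Paddle defines
// hard_sigmoid(x) = max(0, min(1, slope * x + offset)), and slope/offset are
// forwarded as the CANN attributes "alpha"/"beta". A scalar reference,
// illustrative only and not the NPU code path:
//   float hard_sigmoid(float x, float slope, float offset) {
//     float y = slope * x + offset;
//     return y < 0.f ? 0.f : (y > 1.f ? 1.f : y);
//   }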
public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Reciprocal", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ReciprocalGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = - NpuOpRunner("ReciprocalGrad", {*out, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class CosNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class CosGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - phi::DenseTensor sin_out(x->type()); // Temporary phi::DenseTensor - sin_out.Resize(x->dims()); - sin_out.mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); - runner.Run(stream); - - const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); - runner_dx.Run(stream); - - phi::DenseTensor tmp(x->type()); // Temporary phi::DenseTensor - tmp.Resize(phi::make_ddim({1, 1})); - tmp.mutable_data(place); - float factor = -1.; - FillNpuTensorWithConstant(&tmp, static_cast(factor)); - - const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); - runner_dx_.Run(stream); - // dx = -dout * Sine(x); - } -}; - -template -class AtanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - out->mutable_data(place); - const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class AtanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto place = ctx.GetPlace(); - dx->mutable_data(place); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -template -class ExpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - 
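// NOTE (sketch): two gradient shortcuts nearby, spelled out. CosGrad above
// forms dout * sin(x) and then divides by a constant -1 tensor via Xdivy,
// i.e. dx = -(dout * sin(x)), matching d/dx cos(x) = -sin(x). ExpGrad below
// is a single Mul, dx = dout * out, since d/dx exp(x) = exp(x) and the
// forward output already holds exp(x).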
out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Exp", {*x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class ExpGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("Mul", {*dout, *out}, {*dx}, {}); - runner.Run(stream); - } -}; - -template -class SinNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Sin", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - pow, - ops::PowNPUKernel, - ops::PowNPUKernel); - -REGISTER_OP_NPU_KERNEL( - pow_grad, - ops::PowGradNPUKernel, - ops::PowGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu, - ops::ReluNPUKernel, - ops::ReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu_grad, - ops::ReluGradNPUKernel, - ops::ReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6, - ops::Relu6NPUKernel, - ops::Relu6NPUKernel); - -REGISTER_OP_NPU_KERNEL( - relu6_grad, - ops::Relu6GradNPUKernel, - ops::Relu6GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu, - ops::LeakyReluNPUKernel, - ops::LeakyReluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - leaky_relu_grad, - ops::LeakyReluGradNPUKernel, - ops::LeakyReluGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt, - ops::SqrtNPUKernel, - ops::SqrtNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sqrt_grad, - ops::SqrtGradNPUKernel, - ops::SqrtGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log, - ops::LogNPUKernel, - ops::LogNPUKernel); - -REGISTER_OP_NPU_KERNEL( - log_grad, - ops::LogGradNPUKernel, - ops::LogGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh, - ops::TanhNPUKernel, - ops::TanhNPUKernel); - -REGISTER_OP_NPU_KERNEL( - tanh_grad, - ops::TanhGradNPUKernel, - ops::TanhGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square, - ops::SquareNPUKernel, - ops::SquareNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - square_grad, - ops::SquareGradNPUKernel, - ops::SquareNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid, - ops::SigmoidNPUKernel, - ops::SigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sigmoid_grad, - ops::SigmoidGradNPUKernel, - ops::SigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish, - ops::SwishNPUKernel, - ops::SwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(swish_grad, - ops::SwishGradNPUKernel, - ops::SwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish, - ops::HardSwishNPUKernel, - ops::HardSwishNPUKernel); - -REGISTER_OP_NPU_KERNEL(hard_swish_grad, - ops::HardSwishGradNPUKernel, - ops::HardSwishGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid, - ops::HardSigmoidNPUKernel, - ops::HardSigmoidNPUKernel); - -REGISTER_OP_NPU_KERNEL( - hard_sigmoid_grad, - ops::HardSigmoidGradNPUKernel, - ops::HardSigmoidGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - reciprocal, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel, - ops::ReciprocalNPUKernel); - 
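// NOTE (sketch): the reciprocal_grad registration below pairs with the
// ReciprocalGradNPUKernel defined earlier, which feeds (out, dout) into the
// CANN "ReciprocalGrad" op. That signature suffices because for out = 1/x,
//   d(out)/dx = -1/x^2 = -out^2,  so  dx = -out^2 * dout,
// i.e. the gradient is expressible from the forward output alone and x need
// not be kept around.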
-REGISTER_OP_NPU_KERNEL( - reciprocal_grad, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel, - ops::ReciprocalGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos, - ops::CosNPUKernel, - ops::CosNPUKernel); - -REGISTER_OP_NPU_KERNEL( - cos_grad, - ops::CosGradNPUKernel, - ops::CosGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan, - ops::AtanNPUKernel, - ops::AtanNPUKernel); - -REGISTER_OP_NPU_KERNEL( - atan_grad, - ops::AtanGradNPUKernel, - ops::AtanGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp, - ops::ExpNPUKernel, - ops::ExpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - exp_grad, - ops::ExpGradNPUKernel, - ops::ExpGradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - sin, - ops::SinNPUKernel, - ops::SinNPUKernel, - ops::SinNPUKernel); diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc deleted file mode 100644 index 18915ee4f3d79b..00000000000000 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ /dev/null @@ -1,286 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void TranposeNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - std::vector* perm, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(in) - .AddInput(std::move(*perm)) - .AddOutput(*out) - .Run(stream); -} - -static void CastToInt64(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_INT64) - .Run(stream); -} - -static void CastToFP32(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Cast") - .AddInput(in) - .AddOutput(*out) - .AddAttr("dst_type", ACL_FLOAT) - .Run(stream); -} - -template -class ArgsortNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
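// NOTE (sketch): this ternary is the usual negative-axis normalization: a
// Python-style axis in [-rank, rank) is folded into [0, rank), e.g. for a
// rank-4 input, axis = -1 becomes 3. The same idiom reappears in the argsort
// grad kernel further down.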
(in_dims.size() + axis) : axis; - - auto stream = ctx.template device_context().stream(); - framework::NPUAttributeMap attr = {{"axis", -1}, - {"descending", descending}}; - - phi::DenseTensor indices_tmp(phi::DataType::INT32); - indices_tmp.Resize(indices->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor input_fp32(phi::DataType::FLOAT32); - input_fp32.Resize(input->dims()); - CastToFP32(ctx, stream, *input, &input_fp32); - - phi::DenseTensor output_fp32(phi::DataType::FLOAT32); - output_fp32.Resize(output->dims()); - - if (axis == -1 || axis + 1 == in_dims.size()) { - output_fp32.mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {input_fp32}, {output_fp32, indices_tmp}, attr); - runner.Run(stream); - - CastToInt64(ctx, stream, output_fp32, output); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input_fp32.type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, input_fp32, &trans_input); - - phi::DenseTensor trans_output(input_fp32.type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, &output_fp32); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - - CastToInt64(ctx, stream, output_fp32, output); - } - } else { - if (axis == -1 || axis + 1 == in_dims.size()) { - output->mutable_data(ctx.GetPlace()); - indices_tmp.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); - runner.Run(stream); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_input(input->type()); - trans_input.Resize(trans_dims); - TranposeNPU(ctx, stream, &perm, *input, &trans_input); - - phi::DenseTensor trans_output(input->type()); - phi::DenseTensor trans_indices(phi::DataType::INT32); - trans_output.mutable_data(trans_dims, ctx.GetPlace()); - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "Sort", {trans_input}, {trans_output, trans_indices}, attr); - runner.Run(stream); - - TranposeNPU(ctx, stream, &perm, trans_output, output); - TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); - } - } - - CastToInt64(ctx, stream, indices_tmp, indices); - } -}; - -template -static void FullAssignNPU(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const framework::DDim in_dims, - const phi::DenseTensor& input, - const phi::DenseTensor& indices, - phi::DenseTensor* t_out) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - phi::DenseTensor 
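// NOTE (sketch): FullAssignNPU flattens the gradient to 1-D and linearizes
// the sort indices before scattering: for row i and in-row index idx, the
// global position is i * input_width + idx. The per-row offsets
// i * input_width are built as a column tensor and added to the indices, so
// a single TensorScatterUpdate can route every gradient value back to its
// pre-sort position. Index math only (hypothetical helper, not in this
// file):
//   int64_t global_index(int64_t row, int64_t idx, int64_t width) {
//     return row * width + idx;
//   }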
input_tmp; - input_tmp.ShareDataWith(input); - input_tmp.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - phi::DenseTensor indices_tmp; - indices_tmp.ShareDataWith(indices); - indices_tmp.Resize( - phi::make_ddim(std::vector{input_height, input_width})); - - std::vector indexs_value; - for (Type i = 0; i < input_height; i++) { - indexs_value.push_back(i * input_width); - } - phi::DenseTensor indexs_tmp(indices.type()); - framework::TensorFromVector( - indexs_value, ctx.device_context(), &indexs_tmp); - indexs_tmp.Resize(phi::make_ddim(std::vector{input_height, 1})); - - phi::DenseTensor indices_index(indices.type()); - indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); - const auto& runner_add = - NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); - runner_add.Run(stream); - - indices_index.Resize( - phi::make_ddim(std::vector{input_height * input_width})); - - t_out->mutable_data(ctx.GetPlace()); - phi::DenseTensor out_tmp(t_out->type()); - out_tmp.ShareDataWith(*t_out); - - const auto& runner = NpuOpRunner("TensorScatterUpdate", - {input_tmp, indices_index, input_tmp}, - {out_tmp}, - {}); - runner.Run(stream); -} - -template -class ArgsortGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - if (dO->numel() == 0) return; - - auto stream = ctx.template device_context().stream(); - - if (axis == -1 || axis + 1 == in_dims.size()) { - FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); - } else { - std::vector perm; - for (int64_t i = 0; i < in_dims.size(); i++) { - perm.emplace_back(i); - } - std::swap(perm[axis], perm[in_dims.size() - 1]); - - std::vector shape; - for (size_t i = 0; i < perm.size(); i++) { - shape.emplace_back(in_dims[perm[i]]); - } - auto trans_dims = phi::make_ddim(shape); - - phi::DenseTensor trans_dout(dO->type()); - phi::DenseTensor trans_ids(indices->type()); - trans_dout.Resize(trans_dims); - trans_ids.Resize(trans_dims); - - TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); - TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); - - phi::DenseTensor trans_dx(dO->type()); - trans_dx.Resize(trans_dims); - FullAssignNPU( - ctx, stream, trans_dims, trans_dout, trans_ids, &trans_dx); - - TranposeNPU(ctx, stream, &perm, trans_dx, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(argsort, - ops::ArgsortNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ArgsortNPUKernel, -#endif - ops::ArgsortNPUKernel); - -REGISTER_OP_NPU_KERNEL(argsort_grad, - ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc deleted file mode 100644 index ff88427c123368..00000000000000 --- a/paddle/fluid/operators/assign_op_npu.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/operators/assign_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace framework { -class OpDesc; -class Variable; -} // namespace framework -namespace imperative { -class OpBase; -} // namespace imperative -} // namespace paddle - -namespace paddle { -namespace operators { -template -class AssignNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - assign, - ops::AssignNPUKernel, - ops::AssignNPUKernel, - ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc deleted file mode 100644 index 25d8d07802ad1e..00000000000000 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(assign); -USE_OP_DEVICE_KERNEL(assign, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - init.push_back(static_cast(1.0)); - init.push_back(static_cast(2.0)); - init.push_back(static_cast(3.0)); - init.push_back(static_cast(4.0)); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({4}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - auto op = - f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); - EXPECT_EQ(out_vec[0], static_cast(1.0)); - EXPECT_EQ(out_vec[1], static_cast(2.0)); - EXPECT_EQ(out_vec[2], static_cast(3.0)); - EXPECT_EQ(out_vec[3], static_cast(4.0)); -} - -TEST(assign, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "assign"); -} diff --git a/paddle/fluid/operators/assign_value_op_npu.cc b/paddle/fluid/operators/assign_value_op_npu.cc deleted file mode 100644 index 5354f26d6fa73a..00000000000000 --- a/paddle/fluid/operators/assign_value_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/assign_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(assign_value, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel, - ops::AssignValueKernel); diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc deleted file mode 100644 index 15774d5712fff4..00000000000000 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/batch_norm_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class NPUBatchNormOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - bool test_mode = is_test && (!trainable_stats); - bool training = !test_mode && !use_global_stats; - - const std::string data_layout_str = ctx.Attr("data_layout"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - (x_dims.size() == 4UL || x_dims.size() == 3UL), - true, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 3 or 4. " - " But got X's shape = [%s], X's dimension = [%d].", - x_dims.to_str(), - x_dims.size())); - - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto y_tesnor = - ctx.AllocateTmpTensor(y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - y_tesnor.ShareDataWith(*y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - y_tesnor.set_layout(DataLayout::kNHWC); - } - - auto stream = ctx.template device_context().stream(); - if (!training) { - const auto &runner_infer = - NpuOpRunner("BNInfer", - {x_tensor, *scale, *bias, *running_mean, *running_var}, - {y_tesnor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); - saved_mean->mutable_data(ctx.GetPlace()); - saved_variance->mutable_data(ctx.GetPlace()); - - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - phi::DenseTensor sum, square_sum; - sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); - - // BNTrainingReduce ONLY support rank = 4 - if (x->dims().size() == 3) { - auto x_shape_vec = phi::vectorize(x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - x_tensor.Resize(x_new_shape); - x_tensor.Resize(x_new_shape); - } - const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", - {x_tensor}, - {sum, square_sum}, - {{"epsilon", 
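// NOTE (sketch): training-mode batch norm is split across two CANN ops
// here, assuming the usual BNTrainingReduce/BNTrainingUpdate contract:
// BNTrainingReduce emits per-channel sum and square_sum (hence the rank-4
// reshape just above, since it only accepts 4-D input), and
// BNTrainingUpdate below derives the batch mean/variance from them,
// normalizes y, and folds the batch statistics into the running
// mean/variance with "factor" = momentum.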
epsilon}}); - runner_reduce.Run(stream); - - const auto &runner_update = NpuOpRunner( - "BNTrainingUpdate", - {x_tensor, - sum, - square_sum, - *scale, - *bias, - *running_mean, - *running_var}, - {y_tesnor, *mean_out, *variance_out, *saved_mean, *saved_variance}, - {{"factor", momentum}, {"epsilon", epsilon}}); - runner_update.Run(stream); - } - } -}; - -template -class NPUBatchNormGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = - ctx.Input("SavedVariance"); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - const float epsilon = ctx.Attr("epsilon"); - DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - use_global_stats = is_test || use_global_stats; - - auto &dev_ctx = ctx.template device_context(); - auto x_tensor = - ctx.AllocateTmpTensor(x->dims(), dev_ctx); - auto dy_tensor = - ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); - x_tensor.ShareDataWith(*x); - dy_tensor.ShareDataWith(*d_y); - if (data_layout == DataLayout::kNHWC) { - x_tensor.set_layout(DataLayout::kNHWC); - dy_tensor.set_layout(DataLayout::kNHWC); - } - - auto scale_grad_tmp = - ctx.AllocateTmpTensor(scale->dims(), dev_ctx); - auto bias_grad_tmp = - ctx.AllocateTmpTensor(bias->dims(), dev_ctx); - if (d_scale == nullptr) { - d_scale = &scale_grad_tmp; - } - if (d_bias == nullptr) { - d_bias = &bias_grad_tmp; - } - - auto stream = ctx.template device_context().stream(); - if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); - if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *running_mean, *running_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } else { - const auto &runner_update = - NpuOpRunner("BNTrainingUpdateGrad", - {dy_tensor, x_tensor, *saved_mean, *saved_inv_variance}, - {*d_scale, *d_bias}, - {{"epsilon", epsilon}}); - runner_update.Run(stream); - } - } - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto dx_tensor = - ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); - dx_tensor.ShareDataWith(*d_x); - if (data_layout == DataLayout::kNHWC) { - dx_tensor.set_layout(DataLayout::kNHWC); - } - if (use_global_stats) { - if (x->dims().size() == 3) { - // BNInferGrad only support x rank = 4, - auto x_shape_vec = phi::vectorize(d_x->dims()); - if (data_layout == DataLayout::kNCHW) { - x_shape_vec.push_back(1); // expand NCL -> NCL1 - } else { - x_shape_vec.insert(x_shape_vec.begin() + 2, - 1); // expand NLC -> NL1C - } - auto x_new_shape = phi::make_ddim(x_shape_vec); - dx_tensor.Resize(x_new_shape); - dy_tensor.Resize(x_new_shape); - } - const auto *running_var = ctx.Input("Variance"); - const auto &runner_infer = - NpuOpRunner("BNInferGrad", - {dy_tensor, 
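// NOTE (sketch): dx is computed by one of two CANN kernels. With
// use_global_stats (inference-style normalization) it is BNInferGrad, which
// only needs dy, scale and the running variance; otherwise it is
// BNTrainingReduceGrad below, which additionally consumes the saved batch
// mean and inverse variance recorded by the forward pass.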
*scale, *running_var}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_infer.Run(stream); - } else { - const auto &runner_reduce = NpuOpRunner("BNTrainingReduceGrad", - {dy_tensor, - x_tensor, - *d_scale, - *d_bias, - *scale, - *saved_mean, - *saved_inv_variance}, - {dx_tensor}, - {{"epsilon", epsilon}}); - runner_reduce.Run(stream); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(batch_norm, - ops::NPUBatchNormOpKernel, - ops::NPUBatchNormOpKernel); -REGISTER_OP_NPU_KERNEL(batch_norm_grad, - ops::NPUBatchNormGradOpKernel, - ops::NPUBatchNormGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc deleted file mode 100644 index ed8872d90ef6f5..00000000000000 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class BCELossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropy", - {*x, *labels}, - {*out}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -template -class BCELossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = - NpuOpRunner("BinaryCrossEntropyGrad", - {*x, *labels, *dout}, - {*dx}, - {{"reduction", static_cast("none")}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - bce_loss, - ops::BCELossNPUKernel, - ops::BCELossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - bce_loss_grad, - ops::BCELossGradNPUKernel, - ops::BCELossGradNPUKernel); diff --git a/paddle/fluid/operators/beam_search_op_npu.cc b/paddle/fluid/operators/beam_search_op_npu.cc deleted file mode 100644 index 147d1be2262556..00000000000000 --- a/paddle/fluid/operators/beam_search_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/beam_search_op.h" - -namespace ops = paddle::operators; -using NPUCtx = paddle::platform::NPUDeviceContext; - -REGISTER_OP_NPU_KERNEL(beam_search, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel, - ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc deleted file mode 100644 index 411e112318d12c..00000000000000 --- a/paddle/fluid/operators/cast_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -namespace paddle { -namespace operators { - -static std::map - DTYPE_2_ACL_DTYPE = { - {framework::proto::VarType::BOOL, ACL_BOOL}, - {framework::proto::VarType::INT16, ACL_INT16}, - {framework::proto::VarType::INT32, ACL_INT32}, - {framework::proto::VarType::INT64, ACL_INT64}, - {framework::proto::VarType::FP16, ACL_FLOAT16}, - {framework::proto::VarType::FP32, ACL_FLOAT}, - {framework::proto::VarType::FP64, ACL_DOUBLE}, -}; - -template -class CastNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); - - if (framework::TransToProtoVarType(x->dtype()) == dtype) { - // NOTE(zhiqiu): NPU cast op may result in wrong value, so - // add special case here. 
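// NOTE (sketch): the special case below sidesteps the accuracy issue noted
// above by replacing a same-dtype Cast with a device-side TensorCopy. For
// genuinely different dtypes, the framework dtype is translated through
// DTYPE_2_ACL_DTYPE; note the find() result is used without checking
// against end(), so a dtype absent from that map (e.g. uint8) would be
// undefined behavior rather than a clean error.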
- VLOG(4) << "cast to same dtype:" << dtype; - out->mutable_data(place, x->type()); - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - return; - } - - auto iter = DTYPE_2_ACL_DTYPE.find( - static_cast(dtype)); - int aclDtype = iter->second; - - if (dtype == framework::proto::VarType::FP32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT16) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT32) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::INT64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::FP64) { - out->mutable_data(place); - } else if (dtype == framework::proto::VarType::BOOL) { - out->mutable_data(place); - } - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - cast, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel, - ops::CastNPUKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc deleted file mode 100644 index f22f58d1769ea1..00000000000000 --- a/paddle/fluid/operators/clip_by_norm_op_npu.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/clip_by_norm_op.h" - -namespace paddle { -namespace operators { - -template -class NPUClipByNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto max_norm = context.Attr("max_norm"); - auto in_var = context.InputVar("X"); - - if (!(in_var->IsType())) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid input variable type, only support LodTensor" - "type, but got type is %s.", - framework::ToTypeName(in_var->Type()))); - } - - auto place = context.GetPlace(); - auto& dev_ctx = - context.template device_context(); - auto stream = dev_ctx.stream(); - - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(place); - - PADDLE_ENFORCE_NOT_NULL(input, - platform::errors::InvalidArgument( - "Input(X) of ClipByNormOp should not be null. 
" - "Please check if it is created correctly.")); - - phi::DenseTensor square_sum(input->type()); - square_sum.mutable_data(framework::DDim({1}), place); - const auto& x_dims = input->dims(); - std::vector axis; - for (int i = 0; i < x_dims.size(); ++i) { - axis.push_back(i); - } - const auto& square_sum_runner = - NpuOpRunner("SquareSumV1", - {*input}, - {square_sum}, - {{"axis", axis}, {"keep_dims", false}}); - square_sum_runner.Run(stream); - - phi::DenseTensor x_norm(input->type()); - x_norm.mutable_data(framework::DDim({1}), place); - const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {}); - x_norm_runner.Run(stream); - - phi::DenseTensor x_norm_t; - framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t); - auto x_norm_v = static_cast(*x_norm_t.data()); - if (x_norm_v <= max_norm) { - framework::TensorCopy(*input, place, dev_ctx, output); - } else { - auto epsilon = x_norm_v <= static_cast(1e-30) - ? static_cast(1e-6) - : static_cast(0); - float scaling = max_norm / (x_norm_v + epsilon); - const auto& muls_runner = - NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}}); - muls_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - clip_by_norm, - ops::NPUClipByNormKernel, - ops::NPUClipByNormKernel); diff --git a/paddle/fluid/operators/clip_op_npu.cc b/paddle/fluid/operators/clip_op_npu.cc deleted file mode 100644 index 8977bd250e8685..00000000000000 --- a/paddle/fluid/operators/clip_op_npu.cc +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ClipNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto max_tensor = - ctx.HasInput("Max") ? 
ctx.Input("Max") : nullptr; - - phi::DenseTensor min_tensor_temp(x->type()); - phi::DenseTensor max_tensor_temp(x->type()); - if (min_tensor == nullptr) { - auto min_value = static_cast(ctx.Attr("min")); - min_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&min_tensor_temp, min_value); - min_tensor = &min_tensor_temp; - } - - if (max_tensor == nullptr) { - auto max_value = static_cast(ctx.Attr("max")); - max_tensor_temp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&max_tensor_temp, max_value); - max_tensor = &max_tensor_temp; - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("ClipByValue", {*x, *min_tensor, *max_tensor}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class ClipGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto* min_tensor = - ctx.HasInput("Min") ? ctx.Input("Min") : nullptr; - auto* max_tensor = - ctx.HasInput("Max") ? ctx.Input("Max") : nullptr; - - auto min_val = ctx.Attr("min"); - if (min_tensor) { - phi::DenseTensor min_data; - framework::TensorCopy( - *min_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &min_data); - ctx.template device_context().Wait(); - min_val = static_cast(min_data.data()[0]); - } - - auto max_val = ctx.Attr("max"); - if (max_tensor) { - phi::DenseTensor max_data; - framework::TensorCopy( - *max_tensor, - platform::CPUPlace(), - ctx.template device_context(), - &max_data); - ctx.template device_context().Wait(); - max_val = static_cast(max_data.data()[0]); - } - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("HardtanhGrad", - {*x, *dout}, - {*dx}, - {{"min_val", min_val}, {"max_val", max_val}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - clip, - ops::ClipNPUKernel, - ops::ClipNPUKernel); - -REGISTER_OP_NPU_KERNEL( - clip_grad, - ops::ClipGradNPUKernel, - ops::ClipGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc deleted file mode 100644 index 491d44efa7261e..00000000000000 --- a/paddle/fluid/operators/concat_op_npu.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/concat_op.h" - -namespace paddle { -namespace operators { - -template -class ConcatNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - phi::DenseTensor* out = ctx.Output("Out"); - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - auto axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - PADDLE_THROW(platform::errors::NotFound( - "The AxisTensor is not supported on NPU now.")); - } - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - std::vector inputs; - std::vector names; - for (size_t i = 0; i < ins.size(); ++i) { - if (ins[i] && ins[i]->numel() > 0) { - inputs.push_back(*ins[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{ - "ConcatD", - {inputs}, - {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; - runner.AddInputNames(names); - runner.Run(stream); - } -}; - -template -class ConcatGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - auto outs = ctx.MultiOutput(framework::GradVarName("X")); - - PADDLE_ENFORCE_NOT_NULL(ins[0], - platform::errors::NotFound( - "The first input tensor is not initalized.")); - - auto axis = ctx.Attr("axis"); - - axis = ComputeAxis(static_cast(axis), - static_cast(ins[0]->dims().size())); - - int offset = 0; - auto stream = - ctx.template device_context() - .stream(); - for (size_t j = 0; j < outs.size(); ++j) { - // For stop gradient - // get output tensor that the name is not kEmptyVarName - if (out_var_names[j] != framework::kEmptyVarName && - outs[j]->numel() != 0UL) { - outs[j]->mutable_data(ctx.GetPlace()); - std::vector offsets; - std::vector sizes; - for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { - if (dim == axis) { - offsets.push_back(offset); - sizes.push_back(ins[j]->dims()[dim]); - } else { - offsets.push_back(0); - sizes.push_back(ins[j]->dims()[dim]); - } - } - const auto& runner = - NpuOpRunner("SliceD", - {*out_grad}, - {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); - runner.Run(stream); - } - if (ins[j]->numel() != 0UL) { - offset += ins[j]->dims()[axis]; - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(concat, - ops::ConcatNPUKernel, - ops::ConcatNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatNPUKernel, -#endif - ops::ConcatNPUKernel); - -REGISTER_OP_NPU_KERNEL(concat_grad, - ops::ConcatGradNPUKernel, - ops::ConcatGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::ConcatGradNPUKernel, -#endif - ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc deleted file mode 100644 index 44fb1aa5a17595..00000000000000 --- a/paddle/fluid/operators/conv_op_npu.cc +++ /dev/null @@ -1,688 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/conv_op.h"
-
-namespace paddle {
-namespace operators {
-
-using NPUDeviceContext = platform::NPUDeviceContext;
-
-static void CastToFP16(const framework::ExecutionContext& ctx,
-                       const aclrtStream& stream,
-                       const phi::DenseTensor& in,
-                       phi::DenseTensor* out) {
-  out->mutable_data<paddle::platform::float16>(ctx.GetPlace());
-  NpuOpRunner runner;
-  runner.SetType("Cast")
-      .AddInput(in)
-      .AddOutput(*out)
-      .AddAttr("dst_type", ACL_FLOAT16)
-      .Run(stream);
-}
-
-static void CastToFP32(const framework::ExecutionContext& ctx,
-                       const aclrtStream& stream,
-                       const phi::DenseTensor& in,
-                       phi::DenseTensor* out) {
-  out->mutable_data<float>(ctx.GetPlace());
-  NpuOpRunner runner;
-  runner.SetType("Cast")
-      .AddInput(in)
-      .AddOutput(*out)
-      .AddAttr("dst_type", ACL_FLOAT)
-      .Run(stream);
-}
-
-template <typename T>
-class DepthwiseConvNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
-    const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
-    phi::DenseTensor* output = ctx.Output<phi::DenseTensor>("Output");
-    output->mutable_data<T>(ctx.GetPlace());
-
-    const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const std::string padding_algorithm =
-        ctx.Attr<std::string>("padding_algorithm");
-
-    const bool channel_last = data_format == "NHWC";
-    if (channel_last) {
-      PADDLE_ENFORCE_EQ(
-          output->dims()[output->dims().size() - 1],
-          input->dims()[input->dims().size() - 1],
-          platform::errors::InvalidArgument(
-              "ShapeError: The output channels must be equal to the "
-              "input channels. But received output channel number is %d "
-              "and input channel number is %d",
-              output->dims()[output->dims().size() - 1],
-              input->dims()[input->dims().size() - 1]));
-    } else {
-      PADDLE_ENFORCE_EQ(
-          output->dims()[1],
-          input->dims()[1],
-          platform::errors::InvalidArgument(
-              "ShapeError: The output channels must be equal to the "
-              "input channels. But received output channel number is %d "
-              "and input channel number is %d",
-              output->dims()[1],
-              input->dims()[1]));
-    }
-
-    auto in_dims = input->dims();
-    auto filter_dims = filter->dims();
-    framework::DDim in_data_dims;
-    framework::DDim filter_data_dims;
-
-    if (channel_last) {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    }
-    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
-
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(
-        &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
-
-    std::vector<int> strides(4, 1);
-    std::vector<int> dilations(4, 1);
-
-    phi::DenseTensor input_tensor, output_tensor;
-    input_tensor.ShareDataWith(*input);
-    output_tensor.ShareDataWith(*output);
-
-    if (channel_last) {
-      input_tensor.set_layout(DataLayout::kNHWC);
-      output_tensor.set_layout(DataLayout::kNHWC);
-      strides[1] = stride[0];
-      strides[2] = stride[1];
-      dilations[1] = dilation[0];
-      dilations[2] = dilation[1];
-    } else {
-      strides[2] = stride[0];
-      strides[3] = stride[1];
-      dilations[2] = dilation[0];
-      dilations[3] = dilation[1];
-    }
-
-    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-
-    // Transform filter (n, 1, h, w) --> (1, n, h, w)
-    phi::DenseTensor transformed_filter(filter->type());
-    transformed_filter.mutable_data<T>({filter->dims()[1],
-                                        filter->dims()[0],
-                                        filter->dims()[2],
-                                        filter->dims()[3]},
-                                       ctx.device_context().GetPlace());
-    std::vector<int> perm = {1, 0, 2, 3};
-    const auto& runner_trans = NpuOpRunner(
-        "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}});
-    runner_trans.Run(stream);
-
-    const auto& runner = NpuOpRunner("DepthwiseConv2D",
-                                     {input_tensor, transformed_filter},
-                                     {output_tensor},
-                                     {{"strides", strides},
-                                      {"dilations", dilations},
-                                      {"pads", padding},
-                                      {"data_format", data_format}});
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class DepthwiseConvGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const phi::DenseTensor* input = ctx.Input<phi::DenseTensor>("Input");
-    const phi::DenseTensor* filter = ctx.Input<phi::DenseTensor>("Filter");
-    auto output_grad =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Output"));
-    auto input_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Input"));
-    auto filter_grad =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Filter"));
-
-    const std::vector<int> stride = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int> padding = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int> dilation = ctx.Attr<std::vector<int>>("dilations");
-    const std::string data_format = ctx.Attr<std::string>("data_format");
-    const std::string padding_algorithm =
-        ctx.Attr<std::string>("padding_algorithm");
-
-    const bool channel_last = data_format == "NHWC";
-
-    // update padding and dilation
-    auto in_dims = input->dims();
-    auto filter_dims = filter->dims();
-    framework::DDim in_data_dims;
-    framework::DDim filter_data_dims;
-
-    if (channel_last) {
-      in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    }
-    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
-
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    UpdatePaddingAndDilation(
-        &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
-
-    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
-
-    // Transform filter (n, 1, h, w) --> (1, n, h, w)
-    phi::DenseTensor transformed_filter(filter->type());
-    transformed_filter.mutable_data<T>({filter->dims()[1],
filter->dims()[0], - filter->dims()[2], - filter->dims()[3]}, - ctx.device_context().GetPlace()); - std::vector perm = {1, 0, 2, 3}; - const auto& runner_trans = NpuOpRunner( - "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}}); - runner_trans.Run(stream); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_EQ( - (dilations[2] == 1 && dilations[3] == 1), - true, - platform::errors::InvalidArgument( - "dilation_h and dilation_w in DepthwiseConv2DBackpropFilterD " - "must be equal to 1, but got dilation_h %d, dilation_w %d", - dilation[2], - dilation[3])); - - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropFilterD") - .AddInput(input_tensor) - .AddInput(output_grad_tensor) - .AddOutput(*filter_grad) - .AddAttr("filter_size", phi::vectorize(transformed_filter.dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - NpuOpRunner runner; - runner.SetType("DepthwiseConv2DBackpropInputD") - .AddInput(transformed_filter) - .AddInput(output_grad_tensor) - .AddOutput(input_grad_tensor) - .AddAttr("input_size", phi::vectorize(input->dims())) - .AddAttr("strides", strides) - .AddAttr("dilations", dilations) - .AddAttr("pads", padding) - .AddAttr("data_format", data_format) - .Run(stream); - } - } -}; - -template -class NPUConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor 
input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2D", - {input_tensor, *filter}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = - ctx.Input(framework::GradVarName("Output")); - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - auto filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_fp32(phi::DataType::FLOAT32); - filter_grad_fp32.Resize(filter_grad->dims()); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP32(ctx, stream, *filter_grad, &filter_grad_fp32); - } else { - filter_grad_fp32.ShareDataWith(*filter_grad); - } - - const auto& runner = NpuOpRunner("Conv2DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_fp32}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", 
dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - - if (framework::TransToProtoVarType(input->dtype()) == - framework::proto::VarType::FP16) { - CastToFP16(ctx, stream, filter_grad_fp32, filter_grad); - } - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2DBackpropInputD", - {*filter, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class NPUConv3dKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - output->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_tensor = - ctx.AllocateTmpTensor(output->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_tensor.ShareDataWith(*output); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv3D", - {input_tensor, filter_tensor}, - {output_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class NPUConv3dGradKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - PADDLE_ENFORCE_EQ(data_format, - "NCDHW", - platform::errors::Unimplemented( - "the data_format must be NCDHW in " - "the npu kernel of conv3d, but got data_format " - "= [%s]", - data_format)); - - PADDLE_ENFORCE_EQ(groups, - 1, - platform::errors::Unimplemented( - "the groups must be 1 in " - "the npu kernel of conv3d, but got groups " - "= [%d]", - groups)); - - auto& dev_ctx = ctx.template device_context(); - auto input_tensor = - ctx.AllocateTmpTensor(input->dims(), dev_ctx); - auto filter_tensor = - ctx.AllocateTmpTensor(filter->dims(), dev_ctx); - auto output_grad_tensor = ctx.AllocateTmpTensor( - output_grad->dims(), dev_ctx); - - input_tensor.ShareDataWith(*input); - filter_tensor.ShareDataWith(*filter); - output_grad_tensor.ShareDataWith(*output_grad); - - input_tensor.set_layout(DataLayout::kNCDHW); - filter_tensor.set_layout(DataLayout::kNCDHW); - output_grad_tensor.set_layout(DataLayout::kNCDHW); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(5, 1); - std::vector dilations_vec(5, 1); - - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - strides_vec[4] = strides[2]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - dilations_vec[4] = dilations[2]; - - auto stream = ctx.template device_context().stream(); - - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - std::vector filter_shape_vec = phi::vectorize(filter->dims()); - - phi::DenseTensor filter_grad_tensor = - ctx.AllocateTmpTensor(filter_grad->dims(), - dev_ctx); - filter_grad_tensor.ShareDataWith(*filter_grad); - filter_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = NpuOpRunner("Conv3DBackpropFilterD", - {input_tensor, output_grad_tensor}, - {filter_grad_tensor}, - {{"filter_size", filter_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - std::vector input_shape_vec = phi::vectorize(input->dims()); - - phi::DenseTensor input_grad_tensor = - ctx.AllocateTmpTensor(input_grad->dims(), - dev_ctx); - input_grad_tensor.ShareDataWith(*input_grad); - input_grad_tensor.set_layout(DataLayout::kNCDHW); - - const auto& runner = 
NpuOpRunner("Conv3DBackpropInputD", - {filter_tensor, output_grad_tensor}, - {input_grad_tensor}, - {{"input_size", input_shape_vec}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d, - ops::DepthwiseConvNPUKernel, - ops::DepthwiseConvNPUKernel); - -REGISTER_OP_NPU_KERNEL(depthwise_conv2d_grad, - ops::DepthwiseConvGradNPUKernel, - ops::DepthwiseConvGradNPUKernel); - -REGISTER_OP_NPU_KERNEL(conv2d, - ops::NPUConvOpKernel, - ops::NPUConvOpKernel); - -REGISTER_OP_NPU_KERNEL(conv2d_grad, - ops::NPUConvGradOpKernel, - ops::NPUConvGradOpKernel); - -REGISTER_OP_NPU_KERNEL(conv3d, - ops::NPUConv3dKernel, - ops::NPUConv3dKernel); - -REGISTER_OP_NPU_KERNEL(conv3d_grad, - ops::NPUConv3dGradKernel, - ops::NPUConv3dGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op_npu.cc b/paddle/fluid/operators/conv_transpose_op_npu.cc deleted file mode 100644 index f9da50848df2af..00000000000000 --- a/paddle/fluid/operators/conv_transpose_op_npu.cc +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/phi/kernels/cpu/conv_util.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class Conv2DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - const std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize); - - // construct NPU attr - std::vector strides(4, 1); - std::vector dilations(4, 1); - - phi::DenseTensor input_tensor, output_tensor; - input_tensor.ShareDataWith(*input); - output_tensor.ShareDataWith(*output); - - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_tensor.set_layout(DataLayout::kNHWC); - strides[1] = stride[0]; - strides[2] = stride[1]; - dilations[1] = dilation[0]; - dilations[2] = dilation[1]; - } else { - strides[2] = stride[0]; - strides[3] = stride[1]; - dilations[2] = dilation[0]; - dilations[3] = dilation[1]; - } - - for (auto i = output_padding.size(); i < 4; ++i) { - output_padding.insert(output_padding.begin(), 0); - } - auto output_dim_vec = phi::vectorize(output_tensor.dims()); - - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Conv2DTransposeD", - {input_tensor, *filter}, - {output_tensor}, - {{"input_size", output_dim_vec}, - {"strides", strides}, - {"dilations", dilations}, - {"output_padding", output_padding}, - {"groups", groups}, - {"pads", padding}, - {"data_format", data_format}}); - runner.Run(stream); - } -}; - -template -class Conv2DTransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - const phi::DenseTensor* output_grad = - ctx.Input(framework::GradVarName("Output")); - phi::DenseTensor* input_grad = - ctx.Output(framework::GradVarName("Input")); - phi::DenseTensor* filter_grad = - ctx.Output(framework::GradVarName("Filter")); - - if ((!input_grad) && (!filter_grad)) return; - - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const int groups = ctx.Attr("groups"); - std::string 
padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const phi::DataLayout data_layout = phi::StringToDataLayout(data_format); - - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - // auto out_grad_dims = output_grad->dims(); - // const int batch_size = static_cast(input->dims()[0]); - - const bool channel_last = (data_layout == phi::DataLayout::kNHWC); - - framework::DDim in_data_dims; - if (channel_last) { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - phi::UpdatePaddingAndDilation( - &paddings, &dilations, padding_algorithm, in_data_dims, strides, ksize); - - std::vector strides_vec(4, 1); - std::vector dilations_vec(4, 1); - - phi::DenseTensor input_tensor, output_grad_tensor; - input_tensor.ShareDataWith(*input); - output_grad_tensor.ShareDataWith(*output_grad); - if (channel_last) { - input_tensor.set_layout(DataLayout::kNHWC); - output_grad_tensor.set_layout(DataLayout::kNHWC); - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - dilations_vec[1] = dilations[0]; - dilations_vec[2] = dilations[1]; - } else { - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - dilations_vec[2] = dilations[0]; - dilations_vec[3] = dilations[1]; - } - - auto stream = ctx.template device_context().stream(); - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("Conv2DBackpropFilterD", - {output_grad_tensor, input_tensor}, - {*filter_grad}, - {{"filter_size", phi::vectorize(filter_dims)}, - {"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - phi::DenseTensor input_grad_tensor; - input_grad_tensor.ShareDataWith(*input_grad); - if (channel_last) { - input_grad_tensor.set_layout(DataLayout::kNHWC); - } - const auto& runner = NpuOpRunner("Conv2D", - {output_grad_tensor, *filter}, - {input_grad_tensor}, - {{"strides", strides_vec}, - {"pads", paddings}, - {"dilations", dilations_vec}, - {"groups", groups}, - {"data_format", data_format}}); - runner.Run(stream); - } - } -}; - -template -class Conv3DTransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const phi::DenseTensor* input = ctx.Input("Input"); - const phi::DenseTensor* filter = ctx.Input("Filter"); - phi::DenseTensor* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - std::vector output_padding = - ctx.Attr>("output_padding"); - const std::vector stride = ctx.Attr>("strides"); - std::vector padding = ctx.Attr>("paddings"); - std::vector dilation = ctx.Attr>("dilations"); - std::string data_format = ctx.Attr("data_format"); - int groups = ctx.Attr("groups"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - // check dimension - const bool channel_last = data_format == "NHWC"; - - if (data_format == "NHWC") { - data_format = "NDHWC"; - } else { - data_format = "NCDHW"; - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter->dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (channel_last) { - 
in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
-    } else {
-      in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
-    }
-    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());
-
-    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
-    phi::UpdatePaddingAndDilation(
-        &padding, &dilation, padding_algorithm, in_data_dims, stride, ksize);
-
-    // construct NPU attr
-    std::vector<int> strides(5, 1);
-    std::vector<int> dilations(5, 1);
-
-    phi::DenseTensor input_tensor, output_tensor, filter_tensor;
-    input_tensor.Resize(input->dims());
-    input_tensor.ShareDataWith(*input);
-    output_tensor.Resize(output->dims());
-    output_tensor.ShareDataWith(*output);
-    filter_tensor.Resize(filter->dims());
-    filter_tensor.ShareDataWith(*filter);
-
-    PADDLE_ENFORCE_EQ(
-        dilation[0],
-        1,
-        platform::errors::InvalidArgument(
-            "dilation[0] must be equal to 1, but received %d.", dilation[0]));
-
-    if (channel_last) {
-      input_tensor.set_layout(DataLayout::kNDHWC);
-      output_tensor.set_layout(DataLayout::kNDHWC);
-      strides[1] = stride[0];
-      strides[2] = stride[1];
-      strides[3] = stride[2];
-      dilations[2] = dilation[1];
-      dilations[3] = dilation[2];
-    } else {
-      input_tensor.set_layout(DataLayout::kNCDHW);
-      output_tensor.set_layout(DataLayout::kNCDHW);
-      strides[2] = stride[0];
-      strides[3] = stride[1];
-      strides[4] = stride[2];
-      dilations[3] = dilation[1];
-      dilations[4] = dilation[2];
-    }
-    filter_tensor.set_layout(DataLayout::kNCDHW);
-
-    auto output_dim_vec = phi::vectorize(output_tensor.dims());
-
-    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
-
-    NpuOpRunner runner;
-    runner.SetType("Conv3DBackpropInputD")
-        .AddInput(filter_tensor)
-        .AddInput(input_tensor)
-        .AddAttr("input_size", output_dim_vec)
-        .AddAttr("strides", strides)
-        .AddAttr("pads", padding)
-        .AddAttr("dilations", dilations)
-        .AddAttr("groups", groups)
-        .AddAttr("data_format", data_format)
-        .AddOutput(output_tensor);
-    runner.Run(dev_ctx.stream());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(conv2d_transpose,
-                       ops::Conv2DTransposeNPUKernel<float>,
-                       ops::Conv2DTransposeNPUKernel<plat::float16>);
-
-REGISTER_OP_NPU_KERNEL(conv2d_transpose_grad,
-                       ops::Conv2DTransposeGradNPUKernel<float>,
-                       ops::Conv2DTransposeGradNPUKernel<plat::float16>);
-
-REGISTER_OP_NPU_KERNEL(conv3d_transpose,
-                       ops::Conv3DTransposeNPUKernel<float>,
-                       ops::Conv3DTransposeNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc
deleted file mode 100644
index 5aaa832ce3383b..00000000000000
--- a/paddle/fluid/operators/crop_op_npu.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/crop_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class CropNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-
-    std::vector<int> offset_list;
-    if (ctx.HasInput("Offsets")) {
-      auto* offsets_tensor = ctx.Input<phi::DenseTensor>("Offsets");
-      paddle::framework::TensorToVector(
-          *offsets_tensor, ctx.device_context(), &offset_list);
-      if (offset_list.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      }
-    } else {
-      auto res = ctx.Attr<std::vector<int>>("offsets");
-      if (res.empty()) {
-        offset_list.resize(x->dims().size(), 0);
-      } else {
-        offset_list.insert(offset_list.end(), res.begin(), res.end());
-      }
-    }
-
-    PADDLE_ENFORCE_EQ(
-        static_cast<int64_t>(offset_list.size()),
-        x->dims().size(),
-        platform::errors::InvalidArgument(
-            "The size (%d) of CropOp's "
-            "'offsets' attribute should be equal to the rank "
-            "(%d) of the Input(X).",
-            offset_list.size(),
-            x->dims().size()));
-
-    int axis_int = 0;
-    framework::NPUAttributeMap attr_input = {{"offsets", offset_list},
-                                             {"axis", axis_int}};
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-
-    if (ctx.HasInput("Y")) {
-      auto* shape = ctx.Input<phi::DenseTensor>("Y");
-      PADDLE_ENFORCE_EQ(shape->dims().size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The rank (%d) of CropOp's "
-                            "Input(Y) should be equal to the rank "
-                            "(%d) of the Input(X).",
-                            shape->dims().size(),
-                            x->dims().size()));
-
-      // The shape tensor's memory may have been garbage collected,
-      // so work on a copy.
-      phi::DenseTensor tmp_shape(*shape);
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    } else {
-      auto shape_size = ctx.Attr<std::vector<int>>("shape");
-      PADDLE_ENFORCE_EQ(shape_size.size(),
-                        x->dims().size(),
-                        platform::errors::InvalidArgument(
-                            "The size (%d) of CropOp's "
-                            "'shape' attribute should be equal to the rank "
-                            "(%d) of the Input(X).",
-                            shape_size.size(),
-                            x->dims().size()));
-      phi::DenseTensor tmp_shape(x->dtype());
-      tmp_shape.Resize(phi::make_ddim(shape_size));
-      tmp_shape.mutable_data<T>(ctx.GetPlace());
-      const auto& runner =
-          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
-      auto stream =
-          ctx.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    crop,
-    ops::CropNPUKernel<float>,
-    ops::CropNPUKernel<int>,
-    ops::CropNPUKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc
deleted file mode 100644
index a5c77922054da5..00000000000000
--- a/paddle/fluid/operators/cumsum_op_npu.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -static void CumsumImp(const phi::DenseTensor& input, - phi::DenseTensor* output, - const framework::NPUAttributeMap& attr_input, - const framework::ExecutionContext& ctx) { - auto stream = - ctx.template device_context() - .stream(); - if (framework::TransToProtoVarType(input.dtype()) == - framework::proto::VarType::INT64) { - phi::DenseTensor tmp_input; - tmp_input.mutable_data(input.dims(), ctx.GetPlace()); - auto dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(tmp_input.type())); - const auto& cast_runner_1 = - NpuOpRunner("Cast", - {input}, - {tmp_input}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_1.Run(stream); - - phi::DenseTensor tmp_output; - tmp_output.mutable_data(output->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); - runner.Run(stream); - - dst_acl_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(output->type())); - const auto& cast_runner_2 = - NpuOpRunner("Cast", - {tmp_output}, - {*output}, - {{"dst_type", static_cast(dst_acl_dtype)}}); - cast_runner_2.Run(stream); - } else { - const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); - runner.Run(stream); - } -} - -template -class CumSumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - bool exclusive = ctx.Attr("exclusive"); - bool reverse = ctx.Attr("reverse"); - - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = { - {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - - bool flatten = ctx.Attr("flatten"); - if (flatten) { - PADDLE_ENFORCE_EQ( - axis, - -1, - platform::errors::InvalidArgument( - "when flatten is true, attr axis must be default %d, but got %d", - -1, - axis)); - - phi::DenseTensor new_x(x->type()); - new_x.ShareDataWith(*x); - - new_x.Resize(phi::make_ddim({x->numel()})); - - CumsumImp(new_x, out, attr_input, ctx); - } else { - CumsumImp(*x, out, attr_input, ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - cumsum, - ops::CumSumNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::CumSumNPUKernel, -#endif - ops::CumSumNPUKernel, - ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc deleted file mode 100644 index 9c84961f611c0e..00000000000000 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ /dev/null @@ -1,212 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/phi/core/ddim.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class DropoutNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* seed_tensor =
-        ctx.HasInput("Seed") ? ctx.Input<phi::DenseTensor>("Seed") : nullptr;
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    auto* mask = ctx.Output<phi::DenseTensor>("Mask");
-
-    auto dropout_prob = ctx.Attr<float>("dropout_prob");
-    auto is_test = ctx.Attr<bool>("is_test");
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    if (dropout_prob == 1.) {
-      const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out});
-      runner_zeros_out.Run(stream);
-      mask->mutable_data<uint8_t>(ctx.GetPlace());
-      const auto& runner_zeros_mask =
-          NpuOpRunner("ZerosLike", {*mask}, {*mask});
-      runner_zeros_mask.Run(stream);
-      return;
-    }
-
-    // Only the default `upscale_in_train` method is implemented.
-    if (!is_test) {
-      phi::DenseTensor tmp_x(x->dtype());
-      phi::DenseTensor tmp_out(out->dtype());
-      tmp_x.ShareDataWith(*x);
-      tmp_out.ShareDataWith(*out);
-      if (x->dims().size() == 1) {
-        // DropOutDoMask produces wrong results when the input is 1-D,
-        // so reshape it to 2-D.
-        std::vector<int> vec_dim = phi::vectorize<int>(x->dims());
-        tmp_x.Resize(phi::make_ddim({vec_dim[0], 1}));
-        tmp_out.Resize(phi::make_ddim({vec_dim[0], 1}));
-      }
-
-      int seed = 0;
-      int seed2 = 0;
-      float keep_prob = 1. - dropout_prob;
-      if (seed_tensor) {
-        std::vector<int> seed_data;
-        paddle::framework::TensorToVector(
-            *seed_tensor, ctx.device_context(), &seed_data);
-        seed = seed_data[0];
-      } else {
-        seed = ctx.Attr<bool>("fix_seed") ? ctx.Attr<int>("seed") : 0;
-      }
-
-      phi::DenseTensor keep_prob_tensor(x->dtype());
-      keep_prob_tensor.mutable_data<T>({1}, ctx.GetPlace());
-      FillNpuTensorWithConstant<T>(&keep_prob_tensor,
-                                   static_cast<T>(keep_prob));
-
-      mask->mutable_data<uint8_t>(ctx.GetPlace());
-
-      // The mask used by the `DropOutGenMask` NPU OP is different from
-      // the output `Mask`.
-      phi::DenseTensor npu_mask(phi::DataType::UINT8);
-      uint32_t length = (x->numel() + 128 - 1) / 128 * 128;
-      npu_mask.Resize(phi::make_ddim({length / 8}));
-      npu_mask.mutable_data<uint8_t>(ctx.GetPlace());
-
-      // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
-      // OP must be a scalar with shape[0]. At present, the shape
-      // of the `prob` phi::DenseTensor of this OP is forced to be set to 0
-      // in `npu_op_runner.cc`, which needs to be optimized later.
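      // A rough illustration of the sizing above, assuming DropOutGenMask
      // emits one mask bit per input element, padded up to a multiple of
      // 128 bits, with the bits stored as uint8 bytes (hence `length / 8`
      // elements in `npu_mask`):
      //
      //   int64_t numel = 1000;                            // x->numel()
      //   uint32_t bits = (numel + 128 - 1) / 128 * 128;   // -> 1024 bits
      //   uint32_t bytes = bits / 8;                       // -> 128 bytes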
- NpuOpRunner runner_gen_mask; - runner_gen_mask.SetType("DropOutGenMask") - .AddInput(phi::vectorize(tmp_out.dims())) - .AddInput(keep_prob_tensor) - .AddOutput(npu_mask) - .AddAttr("seed", seed) - .AddAttr("seed2", seed2); - runner_gen_mask.Run(stream); - - NpuOpRunner runner_dropout; - runner_dropout.SetType("DropOutDoMask") - .AddInput(tmp_x) - .AddInput(npu_mask) - .AddInput(keep_prob_tensor) - .AddOutput(tmp_out); - runner_dropout.Run(stream); - - // cast `out` from float/float16 to bool - phi::DenseTensor cast_mask(phi::DataType::BOOL); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype_bool = - ConvertToNpuDtype(framework::TransToProtoVarType(cast_mask.dtype())); - const auto& runner_cast_mask_bool = - NpuOpRunner("Cast", - {*out}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype_bool)}}); - runner_cast_mask_bool.Run(stream); - - // cast cast_mask from bool to uint8 - auto dst_dtype_uint8 = - ConvertToNpuDtype(framework::TransToProtoVarType(mask->dtype())); - const auto& runner_cast_mask_uint8 = - NpuOpRunner("Cast", - {cast_mask}, - {*mask}, - {{"dst_type", static_cast(dst_dtype_uint8)}}); - runner_cast_mask_uint8.Run(stream); - } else { - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - } - } -}; - -template -class DropoutGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* mask = ctx.Input("Mask"); - - auto dropout_prob = ctx.Attr("dropout_prob"); - auto is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ(is_test, - false, - platform::errors::PreconditionNotMet( - "GradOp is only callable when is_test is false")); - - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - if (dropout_prob == 1.) { - const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx}); - runner_zeros.Run(stream); - return; - } - - // cast mask from uint8 to float32/float16 - phi::DenseTensor cast_mask(dx->dtype()); - cast_mask.Resize(mask->dims()); - cast_mask.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dx->dtype())); - const auto& runner_cast_mask = - NpuOpRunner("Cast", - {*mask}, - {cast_mask}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mask.Run(stream); - - const auto& runner = - NpuOpRunner("MaskedScale", - {*dout, cast_mask}, - {*dx}, - {{"value", static_cast(1. / (1 - dropout_prob))}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - dropout, - ops::DropoutNPUKernel, - ops::DropoutNPUKernel); - -REGISTER_OP_NPU_KERNEL( - dropout_grad, - ops::DropoutGradNPUKernel, - ops::DropoutGradNPUKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc deleted file mode 100644 index 77f12f17ce2586..00000000000000 --- a/paddle/fluid/operators/expand_as_v2_op_npu.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandAsV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, - rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, - rank)); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument("The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, - MAX_RANK_SUPPORTED)); - ExpandAs(context); - } - - protected: - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], - 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in " - "target tensor for expand_as_v2 op.", - vec_in_dims[i], - target_shape[i])); - } - } - auto* out0 = context.Output("Out"); - - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - const auto& runner = - NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); - - auto stream = - context.template device_context() - .stream(); - - runner.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel, - ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc deleted file mode 100644 index d7e553b83bb67b..00000000000000 --- a/paddle/fluid/operators/expand_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be greater than or equal to 1, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'x' for Op(expand) " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - switch (rank) { - case 1: - Expand<1>(context); - break; - case 2: - Expand<2>(context); - break; - case 3: - Expand<3>(context); - break; - case 4: - Expand<4>(context); - break; - case 5: - Expand<5>(context); - break; - case 6: - Expand<6>(context); - break; - } - } - - protected: - template - void Expand(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto expand_times = get_expand_times(context); - PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), - expand_times.size(), - platform::errors::InvalidArgument( - "The number of elements (%d) of 'expand_times' for " - "Op(expand) must be equal to the number " - "of dimensions (%d) of the input.", - expand_times.size(), - static_cast(in_dims.size()))); - auto* out0 = context.Output("Out"); - framework::DDim out_dims(in_dims); - - for (size_t i = 0; i < expand_times.size(); ++i) { - out_dims[i] *= expand_times[i]; - } - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - out0->Resize(out_dims); - out0->mutable_data(place); - - bool is_expand_times_all_one = - (out0->numel() == in0->numel()) ? true : false; - - if (is_expand_times_all_one) { - memory::Copy(place, - out0->mutable_data(place), - place, - in0->data(), - in0->numel() * sizeof(T), - stream); - if (out_dims != in_dims) { - out0->Resize(out_dims); - } - } else { - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel, - ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc deleted file mode 100644 index e9d12beaa78dea..00000000000000 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(expand); -USE_OP_DEVICE_KERNEL(expand, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto in = scope->Var("X"); - auto expand_times = scope->Var("ExpandTimes"); - auto out = scope->Var("Out"); - auto in_t = in->GetMutable(); - auto out_t = out->GetMutable(); - auto expand_times_t = expand_times->GetMutable(); - - auto place = ctx.GetPlace(); - paddle::framework::TensorFromVector(std::vector(3 * 1 * 7, 1), ctx, in_t); - paddle::framework::TensorFromVector( - std::vector({1, 10, 1}), ctx, expand_times_t); - - in_t->Resize(phi::make_ddim({3, 1, 7})); - expand_times_t->Resize(phi::make_ddim({3})); - out_t->Resize(phi::make_ddim({3, 10, 7})); - out_t->mutable_data(place); - - f::AttributeMap attrs = {{}}; - auto op = - f::OpRegistry::CreateOp("expand", - {{"X", {"X"}}, {"ExpandTimes", {"ExpandTimes"}}}, - {{"Out", {"Out"}}}, - attrs); - op->Run(*scope, place); - ctx.Wait(); - - auto out_dim = out_t->dims(); - EXPECT_EQ(out_dim.at(0), 3); - EXPECT_EQ(out_dim.at(1), 10); - EXPECT_EQ(out_dim.at(2), 7); -} - -TEST(expand, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc deleted file mode 100644 index 7f37fc67d529de..00000000000000 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ /dev/null @@ -1,235 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace paddle { -namespace operators { - -template -class ExpandV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - - auto in_dims = X->dims(); - auto expand_shape = get_expand_shape(ctx); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = expand_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector final_expand_shape(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size cannot be zero.")); - if (i < diff) { // expand_shape = [3,4,-1,-1], X = [10,2] --> - // final_expand_shape = [3,4,10,2] - PADDLE_ENFORCE_GT( - expand_shape[i], - 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_v2 op.", - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else if (expand_shape[i] > 0) { // expand_shape = [3,4,10,4], X = - // [10,1] --> final_expand_shape = - // [3,4,10,4] - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], - expand_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_v2 op.", - vec_in_dims[i], - expand_shape[i])); - final_expand_shape[i] = expand_shape[i]; - } else { - final_expand_shape[i] = expand_shape[i]; - } - } else { // expand_shape = [3,4,-1,-1], X = [10,2] --> final_expand_shape - // = [3,4,10,2] - PADDLE_ENFORCE_EQ( - expand_shape[i], - -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_v2 op, " - "only -1 is supported, but the value received is %d.", - expand_shape[i])); - final_expand_shape[i] = vec_in_dims[i]; - } - } - - framework::NPUAttributeMap attr_input = {{"shape", final_expand_shape}}; - - auto rank = X->dims().size(); - - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be positive, " - "but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'X' for expand_v2_npu op must be less than " - "or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto shape_size = final_expand_shape.size(); - PADDLE_ENFORCE_GE( - shape_size, - rank, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for expand_v2_npu op must " - "be " - "greater than or equal to the rank (%d) of the input 'X'.", - shape_size, - rank)); - PADDLE_ENFORCE_LE(shape_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number (%d) of elements of 'shape' for " - "expand_v2_npu op must be " - "less than or equal to %d.", - shape_size, - MAX_RANK_SUPPORTED)); - - framework::DDim out_dims = phi::make_ddim(final_expand_shape); - Out->Resize(out_dims); - Out->mutable_data(ctx.GetPlace()); - - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("ExpandD", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - 
}; - - if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::BOOL) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else if (framework::TransToProtoVarType(X->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*X}, - {*Out}, - attr_input, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("ExpandD", {*X}, {*Out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class ExpandV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - // case 1: reduce dout dims to dx dims - // For example: [2, 120] --> [120] - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - - phi::DenseTensor tmp_dout(dout->dtype()); - phi::DenseTensor reduced_dout(dx->dtype()); - tmp_dout.ShareDataWith(*dout); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); - } - tmp_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.Resize(phi::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", - {*dout}, - {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = reduced_dout; - } - - // case 2: reduce axis of dout in which dim is 1 - // For example: [12, 140] --> [1, 140] - - // case 3: copy dout to dx when shape is totally same, and dim in dx != 1 - // For example: [2, 10, 5] --> [2, 10, 5] - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); - } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", - {tmp_dout}, - {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); - } else { - framework::TensorCopySync(tmp_dout, ctx.GetPlace(), dx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - expand_v2, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel, - ops::ExpandV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - expand_v2_grad, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel, - ops::ExpandV2NPUGradKernel); diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc deleted file mode 100644 index ee71ebee9b0665..00000000000000 --- a/paddle/fluid/operators/eye_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class EyeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - - auto d_nums = ctx.Attr("dtype"); - auto dtype = - ConvertToNpuDtype(static_cast(d_nums)); - - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - framework::NPUAttributeMap attr_input = { - {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - eye, - ops::EyeNPUKernel, - ops::EyeNPUKernel, - ops::EyeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc deleted file mode 100644 index 62d3e5a82f5a32..00000000000000 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
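For reference, the single "Eye" op dispatched above fills a num_rows by num_columns matrix with ones on the main diagonal, with num_columns defaulting to num_rows. A host-side sketch of the expected result:

#include <cstdint>
#include <vector>

// Host-side reference for the NPU "Eye" dispatch above. As in the kernel,
// num_columns == -1 falls back to num_rows.
std::vector<float> EyeReference(int64_t num_rows, int64_t num_columns) {
  if (num_columns == -1) num_columns = num_rows;
  std::vector<float> out(num_rows * num_columns, 0.0f);
  for (int64_t i = 0; i < num_rows && i < num_columns; ++i)
    out[i * num_columns + i] = 1.0f;
  return out;
}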
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeNPUKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto data_type = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), - false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); - - auto stream = - context.template device_context() - .stream(); - - auto shape = out->dims(); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_tmp) - .AddOutput(*out) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_any_like, - ops::FillAnyLikeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::FillAnyLikeNPUKernel, -#endif - ops::FillAnyLikeNPUKernel, - ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc deleted file mode 100644 index fed75fc018a0c4..00000000000000 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto float_value = ctx.Attr("value"); - auto str_value = ctx.Attr("str_value"); - auto force_cpu = ctx.Attr("force_cpu"); - - auto *out = ctx.Output("Out"); - auto *in = ctx.Input("Input"); - if (in->lod().size() && ctx.Attr("input_dim_idx") == 0) { - // set the correct batch size for the phi::DenseTensor. 
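The range guard in fill_any_like above promotes the requested value and the element type to a common type before comparing against the element type's representable range. A standalone restatement of that check (the real kernel additionally maps float16 to float, omitted here):

#include <cstdio>
#include <limits>
#include <type_traits>

// Reject fill values that would overflow the element type T, mirroring
// the PADDLE_ENFORCE_EQ range check in the fill_any_like kernel above.
template <typename T>
bool FitsInTargetType(float value) {
  using CommonType = typename std::common_type<float, T>::type;
  const auto v = static_cast<CommonType>(value);
  return v >= static_cast<CommonType>(std::numeric_limits<T>::lowest()) &&
         v <= static_cast<CommonType>(std::numeric_limits<T>::max());
}

int main() {
  std::printf("%d %d\n",
              FitsInTargetType<int>(1.0f),     // 1: in range
              FitsInTargetType<int>(3.0e10f)); // 0: would overflow int
}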
- auto odims = out->dims(); - int output_dim_idx = ctx.Attr("output_dim_idx"); - odims[output_dim_idx] = static_cast(in->lod().back().size()) - 1; - out->mutable_data(odims, ctx.GetPlace()); - } - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); - if (cpu_place) { - auto &dev_ctx = *pool.Get(platform::CPUPlace()); - phi::funcs::SetConstant functor; - out->mutable_data(platform::CPUPlace(), - framework::TransToPhiDataType(data_type)); - functor(reinterpret_cast(dev_ctx), - out, - static_cast(value)); - } else { - out->mutable_data(ctx.GetPlace(), - framework::TransToPhiDataType(data_type)); - phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(data_type)); - tensor_tmp.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_tmp, value); - - auto stream = - ctx.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FillD", - {tensor_tmp}, - {*out}, - {{"dims", phi::vectorize(out->dims())}}); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - int>, - ops::FillConstantBatchSizeLikeOpNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc deleted file mode 100644 index 0724caf32793e0..00000000000000 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
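Both this kernel and fill_constant below parse the constant from either the float attribute or str_value, special-casing "inf"/"-inf"/"nan" because those cannot be read from a stringstream, and routing int64 through an integer read to avoid a lossy double round-trip. A self-contained sketch of that shared logic:

#include <cstdint>
#include <limits>
#include <sstream>
#include <string>
#include <type_traits>

// Shared value-parsing logic of the two fill_constant kernels: named
// NaN/Inf values are handled explicitly, int64_t avoids a double
// round-trip, everything else goes through a double.
template <typename T>
T ParseFillValue(const std::string& str_value, float float_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  if (str_value == "inf")
    return static_cast<T>(std::numeric_limits<double>::infinity());
  if (str_value == "-inf")
    return static_cast<T>(-std::numeric_limits<double>::infinity());
  if (str_value == "nan")
    return static_cast<T>(std::numeric_limits<double>::quiet_NaN());
  std::stringstream ss(str_value);
  if (std::is_same<T, int64_t>::value) {
    int64_t v = 0;
    ss >> v;
    return static_cast<T>(v);
  }
  double v = 0.0;
  ss >> v;
  return static_cast<T>(v);
}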
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -template -class FillConstantNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - auto str_value = ctx.Attr("str_value"); - auto float_value = ctx.Attr("value"); - - auto *out_var = ctx.Output("Out"); - auto stream = - ctx.template device_context() - .stream(); - - T value; - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream. - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - } - auto shape = GetShape(ctx); - - out_var->mutable_data(shape, ctx.GetPlace()); - if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPhiDataType(data_type)); - tensor_value.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, value); - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(*out_var) - .Run(stream); - } else { - const auto &dev_ctx = - ctx.template device_context(); - auto op_func = [&shape, &value]( - const std::vector &inputs, - const std::vector &outputs, - const NPUAttributeMap &attrs, - const platform::NPUDeviceContext &dev_ctx) { - Tensor tensor_value; - tensor_value.mutable_data({1}, dev_ctx.GetPlace()); - FillNpuTensorWithConstant(&tensor_value, - static_cast(value)); - - NpuOpRunner runner; - runner.SetType("Fill") - .AddInput(phi::vectorize(shape)) - .AddInput(tensor_value) - .AddOutput(outputs[0]) - .Run(dev_ctx.stream()); - }; - NpuOpRunner::TypeAdapter({}, - {*out_var}, - {}, - dev_ctx, - op_func, - {}, - {framework::proto::VarType::UINT8}); - } - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - fill_constant, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, - paddle::operators::FillConstantNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::FillConstantNPUKernel, -#endif - paddle::operators::FillConstantNPUKernel); diff --git a/paddle/fluid/operators/fill_zeros_like_op_npu.cc b/paddle/fluid/operators/fill_zeros_like_op_npu.cc deleted file mode 100644 index 6cedc658f76f5d..00000000000000 --- a/paddle/fluid/operators/fill_zeros_like_op_npu.cc +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/fill_zeros_like_op.h" - -namespace paddle { -namespace operators { - -template -class FillZerosLikeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - - out->mutable_data(context.GetPlace()); - auto stream = - context.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*out}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - fill_zeros_like, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel, - ops::FillZerosLikeNPUKernel); diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc deleted file mode 100644 index 2e43c33efd575b..00000000000000 --- a/paddle/fluid/operators/flatten_op_npu.cc +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
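The fill-style kernels above (fill_any_like, fill_constant, and the ZerosLike dispatch just shown) all reduce to broadcasting one scalar over a target shape; the NPU "Fill" op receives that shape as data plus a one-element value tensor. A host-side reference for that pattern:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Host-side reference for the "Fill" dispatch pattern used above: the op
// takes the target shape and a single-element value tensor, and the
// result is that value broadcast over the whole shape.
template <typename T>
std::vector<T> FillReference(const std::vector<int64_t>& shape, T value) {
  const int64_t numel = std::accumulate(
      shape.begin(), shape.end(), int64_t{1}, std::multiplies<int64_t>());
  return std::vector<T>(numel, value);
}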
-#include "paddle/fluid/operators/flatten_op.h" - -namespace paddle { -namespace operators { - -template -class Flatten2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *in = context.Input("X"); - auto *out = context.Output("Out"); - auto &axis = context.Attr("axis"); - out->mutable_data(context.GetPlace(), in->type()); - framework::NPUAttributeMap attr_input = {{"axis", axis}}; - - auto stream = - context.template device_context() - .stream(); - const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); - runner.Run(stream); - } -}; - -template -class Flatten2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -template -class FlattenContiguousRangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *X = ctx.Input("X"); - auto *Out = ctx.Output("Out"); - int start_axis = ctx.Attr("start_axis"); - int stop_axis = ctx.Attr("stop_axis"); - - Out->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("FlattenV2", - {*X}, - {*Out}, - {{"axis", static_cast(start_axis)}, - {"end_axis", static_cast(stop_axis)}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class FlattenContiguousRangeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_out = ctx.Input(framework::GradVarName("Out")); - - auto xshape_dims = ctx.Input("XShape")->dims(); - auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); - framework::TensorCopy( - *d_out, - ctx.GetPlace(), - ctx.template device_context(), - d_x); - d_x->Resize(x_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(flatten2, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel, - ops::Flatten2NPUKernel); -REGISTER_OP_NPU_KERNEL(flatten2_grad, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel, - ops::Flatten2GradNPUKernel); - -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel, - ops::FlattenContiguousRangeNPUKernel); -REGISTER_OP_NPU_KERNEL( - flatten_contiguous_range_grad, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel, - ops::FlattenContiguousRangeGradNPUKernel); diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc 
b/paddle/fluid/operators/gather_nd_op_npu.cc deleted file mode 100644 index feb1567e58d78d..00000000000000 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class GatherNdNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->template mutable_data(ctx.GetPlace()); - - if (x->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*x, ctx.GetPlace(), ctx.device_context(), out); - return; - } - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, - true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = ctx.template device_context().stream(); - runner.Run(stream); - } -}; - -template -class GatherNdGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto *p = dx->mutable_data(ctx.GetPlace()); - - if (dx->numel() == 0) return; - - if (index->numel() == 0) { - framework::TensorCopy(*dout, ctx.GetPlace(), ctx.device_context(), dx); - return; - } - - phi::DenseTensor tmp_tensor(index->type()); - phi::DenseTensor tmp_tensor2(dout->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {1, index_dims[0]}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - - tmp_tensor2.ShareDataWith(*dout); - std::vector new_dim2{1}; - for (int i = index->numel(); i < x->dims().size(); i++) { - new_dim2.push_back(x->dims()[i]); - } - tmp_tensor2.Resize(phi::make_ddim(new_dim2)); - dout = &tmp_tensor2; - } - - auto stream = ctx.template device_context().stream(); - platform::NPUMemsetAsync( - static_cast(p), 0, dx->numel() * sizeof(T), stream); - - const auto &runner_scatter = NpuOpRunner( - "ScatterNdAdd", 
{*dx, *index, *dout}, {*dx}, {{"use_locking", false}}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gather_nd, - ops::GatherNdNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::GatherNdNPUKernel, -#endif - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL(gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc deleted file mode 100644 index ab42d78a0a1d74..00000000000000 --- a/paddle/fluid/operators/gather_op_npu.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -template -class GatherOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - const auto &runner = NpuOpRunner( - "Gather", {*x, *index}, {*out}, {{"validate_indices", true}}); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class GatherGradOpNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *index = ctx.Input("Index"); - auto *x = ctx.Input("X"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - // step1: Unsqueeze index - phi::DenseTensor tmp_tensor(index->type()); - const auto index_dims = index->dims(); - if (index_dims.size() == 1) { - tmp_tensor.ShareDataWith(*index); - std::vector new_dim = {index_dims[0], 1}; - tmp_tensor.Resize(phi::make_ddim(new_dim)); - index = &tmp_tensor; - } - - auto stream = - ctx.template device_context() - .stream(); - - // step2: ZerosLike x in device - Tensor zeroslike_xout(dx->type()); - zeroslike_xout.Resize(x->dims()); - auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync( - static_cast(p), 0, zeroslike_xout.numel() * sizeof(T), stream); - - // step3: scatter(x_grad) - const auto &runner_scatter = NpuOpRunner( - "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); - runner_scatter.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel, - ops::GatherOpNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_grad, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel, - ops::GatherGradOpNPUKernel); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc deleted 
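The gather kernels above copy indexed rows forward and, in the gradient, scatter rows of Out@GRAD into a zero-initialized X@GRAD (zeros-like plus TensorScatterUpdate; gather_nd uses ScatterNdAdd, which is equivalent when indices are unique). A plain-C++ reference for the 2-D, axis-0 case, using the same numbers as the unit test that follows:

#include <cstdio>
#include <vector>

// Forward: copy the indexed rows. Backward: scatter-add the rows of dout
// into a zeroed dx, which is what zeros-like + scatter implements above.
void GatherRows(const std::vector<float>& x, int cols,
                const std::vector<int>& index, std::vector<float>* out) {
  out->clear();
  for (int i : index)
    out->insert(out->end(), x.begin() + i * cols, x.begin() + (i + 1) * cols);
}

void GatherGradRows(const std::vector<float>& dout, int rows, int cols,
                    const std::vector<int>& index, std::vector<float>* dx) {
  dx->assign(rows * cols, 0.0f);
  for (size_t k = 0; k < index.size(); ++k)
    for (int c = 0; c < cols; ++c)
      (*dx)[index[k] * cols + c] += dout[k * cols + c];
}

int main() {
  // x = [[1,2],[3,4],[5,6]], index = [1,2]  -> out = [3,4,5,6]
  // dout = [[5,10],[2,3]], index = [0,1]    -> dx  = [5,10,2,3,0,0]
  std::vector<float> x{1, 2, 3, 4, 5, 6}, out, dx;
  GatherRows(x, 2, {1, 2}, &out);
  GatherGradRows({5, 10, 2, 3}, 3, 2, {0, 1}, &dx);
  for (float v : out) std::printf("%g ", v);
  std::printf("\n");
  for (float v : dx) std::printf("%g ", v);
}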
file mode 100644 index 69d82ecaedeea4..00000000000000 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gather); -USE_OP_DEVICE_KERNEL(gather, NPU); -USE_OP_ITSELF(gather_grad); -USE_OP_DEVICE_KERNEL(gather_grad, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - std::vector init_x; - for (int64_t i = 1; i < 7; ++i) { - // 1,2,3,4,5,6 - init_x.push_back(static_cast(i)); - } - - // [[1, 2],[3, 4],[5, 6]] - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_index = {1, 2}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - ctx.Wait(); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - f::AttributeMap attrs = {{"validate_indices", true}}; - auto op = f::OpRegistry::CreateOp( - op_type, {{"X", {"X"}}, {"Index", {"Index"}}}, {{"Out", {"Out"}}}, attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - // ref:https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/tensor/manipulation/gather_cn.html#gather - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - uint32_t expected_size = 4; - EXPECT_EQ((uint32_t)out_vec.size(), expected_size); - - // {3, 4, 5, 6} - std::vector expected_out_vec; - for (int64_t i = 3; i < 7; ++i) { - expected_out_vec.push_back(static_cast(i)); - } - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], expected_out_vec[i]); - } -} - -template -void CompareGrad(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto index = scope->Var("Index"); - auto tensor_index = index->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector init_index = {0, 1}; - paddle::framework::TensorFromVector(init_index, ctx, tensor_index); - tensor_index->Resize(phi::make_ddim({2})); - - std::vector init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - 
tensor_x->Resize(phi::make_ddim({3, 2})); - - std::vector init_dout = {5.0, 10.0, 2.0, 3.0}; - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize(phi::make_ddim({2, 2})); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - // run - f::AttributeMap attrs; - auto op = f::OpRegistry::CreateOp( - op_type, - {{"X", {"X"}}, {"Index", {"Index"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - ctx.Wait(); - - uint32_t expected_size = 3 * 2; - EXPECT_EQ((uint32_t)dx_vec.size(), expected_size); - - std::vector expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0}; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - VLOG(3) << "dx_vec[i]=" << dx_vec[i]; - EXPECT_EQ(dx_vec[i], expected_dx_vec[i]); - } -} - -TEST(gather, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather, NPU_fp16) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "gather"); -} - -TEST(gather_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx, "gather_grad"); -} diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc deleted file mode 100644 index 9b3c23ad2b9c19..00000000000000 --- a/paddle/fluid/operators/gaussian_random_op_npu.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::normal_distribution dist(mean, std); - - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = dist(*engine); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(gaussian_random, ops::NPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc deleted file mode 100644 index 1b40a6fbb454c1..00000000000000 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace operators { - -template -class GeluNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); - runner.Run(stream); - } -}; - -template -class GeluGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(pangyoki): In the original implementation of GeluGrad op, the input - // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable - // `out` was not actually used. In order to improve performance, the - // useless GELU operation was deleted. 
- // We directly use `*dout` as a placeholder to replace `out`, it will not - // be used in calculations. - const auto& runner_dx = - NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {}); - runner_dx.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - gelu, - ops::GeluNPUKernel, - ops::GeluNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gelu_grad, - ops::GeluGradNPUKernel, - ops::GeluGradNPUKernel); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc deleted file mode 100644 index 9dca0bb8cba0f5..00000000000000 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(gelu); -USE_OP_DEVICE_KERNEL(gelu, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp( - "gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - float expected = 0.841192; - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_FLOAT_EQ(out_vec[i], static_cast(expected)); - } -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init_dout; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_dout.push_back(static_cast(1.0)); - } - - std::vector init_x; - for (int64_t i = 0; i < 10 * 10; ++i) { - init_x.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout); - tensor_dout->Resize({10, 
10}); - paddle::framework::TensorFromVector(init_x, ctx, tensor_x); - tensor_x->Resize({10, 10}); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - f::AttributeMap attrs; - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - - auto op = f::OpRegistry::CreateOp("gelu_grad", - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - op->Run(*scope, place); - - ctx.Wait(); - - // eval time - struct timeval start, end; - gettimeofday(&start, NULL); - - for (int i = 0; i < 100; i++) { - op->Run(*scope, place); - } - - ctx.Wait(); - - gettimeofday(&end, NULL); - int micros = - (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); - printf("used time: %d\n", micros / 100); - - // eval value - std::vector dx_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec); - - float expected = 1.082964; - for (uint32_t i = 0; i < dx_vec.size(); i++) { - EXPECT_FLOAT_EQ(dx_vec[i], static_cast(expected)); - } -} - -TEST(gelu, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(gelu_grad, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc deleted file mode 100644 index 49fdd3566825bc..00000000000000 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ /dev/null @@ -1,327 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
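The expected constants in the gelu tests above, 0.841192 for the forward value at x = 1 and 1.082964 for the gradient, line up with the tanh approximation of GELU; that the NPU "Gelu" op uses the tanh form is an inference from those numbers, not something this patch states. A quick self-check:

#include <cmath>
#include <cstdio>

// Tanh-form GELU and its derivative; at x = 1 these reproduce the
// 0.841192 / 1.082964 constants expected by the tests above.
double gelu(double x) {
  const double c = std::sqrt(2.0 / std::acos(-1.0));  // sqrt(2/pi)
  return 0.5 * x * (1.0 + std::tanh(c * (x + 0.044715 * x * x * x)));
}

double gelu_grad(double x) {
  const double c = std::sqrt(2.0 / std::acos(-1.0));
  const double u = c * (x + 0.044715 * x * x * x);
  const double t = std::tanh(u);
  const double du = c * (1.0 + 3.0 * 0.044715 * x * x);
  return 0.5 * (1.0 + t) + 0.5 * x * (1.0 - t * t) * du;
}

int main() {
  std::printf("%.6f %.6f\n", gelu(1.0), gelu_grad(1.0));  // 0.841192 1.082964
}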
*/ - -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -struct GroupNormFunction { - public: - explicit GroupNormFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - } - void ReduceMean(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceMeanD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - // y should be init first - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Div(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void DivNoNan(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // y should be init first - const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - // y should be init first - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Sqrt(const phi::DenseTensor* x, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - // y should be init first - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - phi::DenseTensor ReduceMeanToNG(const phi::DenseTensor* x, - const DataLayout& data_layout, - const int64_t N, - const int64_t C, - const int64_t H, - const int64_t W, - const int G) { - phi::DenseTensor y(x->type()); - // y.mutable_data( {N,G,1}, place ); - if (data_layout == DataLayout::kNCHW) { - y.mutable_data({N, G, 1}, place); - // shape of x is [N, G, C*H*W/G] - this->ReduceMean(x, &y, std::vector{2}); - } else { - y.mutable_data({N, 1, G}, place); - // shape of x is [N, C*H*W/G, G] - phi::DenseTensor x_trans(x->type()); - x_trans.mutable_data({N, G, C * H * W / G}, place); - this->Transpose(x, &x_trans, std::vector{0, 2, 1}); - this->ReduceMean(&x_trans, &y, std::vector{2}); - } - return y; - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; -}; - -template -class GroupNormNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - auto place = ctx.GetPlace(); - phi::DenseTensor xnorm(x->type()); - xnorm.mutable_data(x->dims(), place); - GroupNormFunction F(ctx); - if (data_layout != DataLayout::kNCHW) { - xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); - F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); - } else { - paddle::framework::TensorCopy(*x, platform::NPUPlace(), &xnorm); - } - auto N = xnorm.dims()[0]; - auto C = xnorm.dims()[1]; - auto H = xnorm.dims()[2]; - auto W = xnorm.dims()[3]; - xnorm.Resize({N * groups, C * H * W / groups}); - std::vector axis = {1}; - auto reduce_dim = mean->dims(); - - mean->mutable_data({N * groups, 1}, place); - var->mutable_data({N * groups, 1}, place); - y->mutable_data(place); - F.ReduceMean(&xnorm, mean, axis); - - F.Sub(&xnorm, mean, &xnorm); - phi::DenseTensor sqr(x->type()); - sqr.mutable_data(xnorm.dims(), place); - - F.Mul(&xnorm, &xnorm, &sqr); - F.ReduceMean(&sqr, var, axis); - phi::DenseTensor std(x->type()); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - y->Resize(xnorm.dims()); - F.Div(&xnorm, &std, y); - y->Resize({N, C, H, W}); - if (scale) { - phi::DenseTensor scale_t(scale->type()); - scale_t.ShareDataWith(*scale); - scale_t.Resize({C, 1, 1}); - F.Mul(y, &scale_t, y); - } - if (bias) { - phi::DenseTensor bias_t(bias->type()); - bias_t.ShareDataWith(*bias); - bias_t.Resize({C, 1, 1}); - F.Add(y, &bias_t, y); - } - if (data_layout != DataLayout::kNCHW) { - F.Transpose(y, y, std::vector{0, 2, 3, 1}); - y->Resize({x->dims()}); - } - mean->Resize(reduce_dim); - var->Resize(reduce_dim); - } -}; - -template -class GroupNormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* y = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto G = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - GroupNormFunction F(ctx); - auto place = ctx.GetPlace(); - auto _type = y->type(); - - phi::DenseTensor xnorm(_type); - xnorm.mutable_data(y->dims(), place); - phi::DenseTensor scale_share(_type); - scale_share.ShareDataWith(*scale); - phi::DenseTensor bias_share(_type); - bias_share.ShareDataWith(*bias); - - int64_t N = y->dims()[0]; - int64_t C, H, W; - framework::DDim scale_bias_dim; - if (data_layout == DataLayout::kNCHW) { - C = y->dims()[1]; - H = y->dims()[2]; - W = y->dims()[3]; - scale_bias_dim = phi::make_ddim({C, 1, 1}); - } else { - C = y->dims()[3]; - H = y->dims()[1]; - W = y->dims()[2]; - scale_bias_dim = phi::make_ddim({1, 1, C}); - } - 
scale_share.Resize(scale_bias_dim); - bias_share.Resize(scale_bias_dim); - F.Sub(y, &bias_share, &xnorm); - F.DivNoNan(&xnorm, &scale_share, &xnorm); - - if (d_bias) { - d_bias->mutable_data(place); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); - } else { - F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); - } - } - if (d_scale) { - d_scale->mutable_data(place); - phi::DenseTensor dy_xnorm(_type); - dy_xnorm.mutable_data(d_y->dims(), place); - F.Mul(d_y, &xnorm, &dy_xnorm); - if (data_layout == DataLayout::kNCHW) { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); - } else { - F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); - } - } - - // std = Sqrt(var+epsilon), init shape = [ N, G ] - phi::DenseTensor std(_type); - std.mutable_data(var->dims(), place); - F.Adds(var, epsilon, &std); - F.Sqrt(&std, &std); - // d_xnorm_std = dy_proc * scale / std - phi::DenseTensor d_xnorm_std(_type); - d_xnorm_std.mutable_data(y->dims(), place); - F.Mul(d_y, &scale_share, &d_xnorm_std); - if (data_layout == DataLayout::kNCHW) { - xnorm.Resize({N, G, C * H * W / G}); - d_xnorm_std.Resize({N, G, C * H * W / G}); - std.Resize({N, G, 1}); - } else { - xnorm.Resize({N, C * H * W / G, G}); - d_xnorm_std.Resize({N, C * H * W / G, G}); - std.Resize({N, 1, G}); - } - F.Div(&d_xnorm_std, &std, &d_xnorm_std); - - // d_x = d_xnorm_std - // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm - // - Mean ( d_xnorm_std, axis=1, keepdim=True ) - d_x->mutable_data(place); - d_x->Resize(xnorm.dims()); - F.Mul(&d_xnorm_std, &xnorm, d_x); - phi::DenseTensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); - F.Mul(&dx1, &xnorm, d_x); - - phi::DenseTensor dx2 = - F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); - - F.Sub(&d_xnorm_std, d_x, d_x); - F.Sub(d_x, &dx2, d_x); - - d_x->Resize(y->dims()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(group_norm, - ops::GroupNormNPUKernel, - ops::GroupNormNPUKernel); -REGISTER_OP_NPU_KERNEL(group_norm_grad, - ops::GroupNormGradNPUKernel, - ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc deleted file mode 100644 index 4812dfa47dfedb..00000000000000 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void HuberLossSub(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - // Calculate z = x - y - z->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); -} - -template -void HuberLossMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - float scalar, - phi::DenseTensor* y) { - // Calculate y = x + scale - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); -} - -template -void HuberLossZerosLike(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - phi::DenseTensor* y) { - y->mutable_data(x->dims(), place); - const auto& runner = NpuOpRunner("ZerosLike", {*x}, {*y}, {}); - runner.Run(stream); -} - -template -void HuberLossSmoothL1Loss(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - float delta, - phi::DenseTensor* z) { - z->mutable_data(x->dims(), place); - const auto& runner = - NpuOpRunner("SmoothL1Loss", {*x, *y}, {*z}, {{"sigma", delta}}); - runner.Run(stream); -} - -template -void HuberLossSmoothL1LossGrad(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* pred, - const phi::DenseTensor* lab, - const phi::DenseTensor* dout, - float sigma, - phi::DenseTensor* grad) { - grad->mutable_data(pred->dims(), place); - const auto& runner = NpuOpRunner( - "SmoothL1LossGrad", {*pred, *lab, *dout}, {*grad}, {{"sigma", sigma}}); - runner.Run(stream); -} - -template -class HuberLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in0 = ctx.Input("X"); - auto* in1 = ctx.Input("Y"); - auto* residual = ctx.Output("Residual"); - auto* out = ctx.Output("Out"); - auto delta = ctx.Attr("delta"); - - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - HuberLossSub(place, stream, in1, in0, residual); - - HuberLossSmoothL1Loss(place, stream, in0, in1, delta, out); - HuberLossMuls(place, stream, out, delta, out); - } -}; - -template -class HuberLossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* residual = ctx.Input("Residual"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto delta = ctx.Attr("delta"); - - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - - phi::DenseTensor t_grad_rd; - if (dx || dy) { - phi::DenseTensor t_zero; - HuberLossZerosLike(place, stream, residual, &t_zero); - HuberLossSmoothL1LossGrad( - place, stream, residual, &t_zero, dout, delta, &t_grad_rd); - } - if (dx) { - HuberLossMuls(place, stream, &t_grad_rd, -delta, dx); - } - if (dy) { - HuberLossMuls(place, stream, &t_grad_rd, delta, dy); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(huber_loss, - ops::HuberLossNPUKernel, - ops::HuberLossNPUKernel); -REGISTER_OP_NPU_KERNEL(huber_loss_grad, - ops::HuberLossGradNPUKernel, - 
ops::HuberLossGradNPUKernel); diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc deleted file mode 100644 index 7188fe38fdc680..00000000000000 --- a/paddle/fluid/operators/increment_op_npu.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class IncrementalNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - out_tensor->mutable_data(context.GetPlace()); - - Tensor step_tensor(x_tensor->dtype()); - - step_tensor.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&step_tensor, static_cast(step)); - - const auto& runner = - NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - increment, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, - paddle::operators::IncrementalNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::IncrementalNPUKernel, -#endif - paddle::operators::IncrementalNPUKernel) diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc deleted file mode 100644 index 2a77ff82d0fa31..00000000000000 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
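The huber_loss kernels above assemble the loss as delta * SmoothL1Loss(X, Y; sigma = delta) on the residual Y - X. Assuming the Ascend SmoothL1Loss is quadratic below sigma and linear above it (inferred from this delta scaling, not stated in the patch), that product is exactly the standard Huber loss; a scalar check:

#include <cmath>
#include <cstdio>

// Scalar check that delta * smooth_l1(r, sigma = delta) equals the Huber
// loss. The smooth_l1 form here is an assumption about the Ascend op.
double smooth_l1(double r, double sigma) {
  const double a = std::fabs(r);
  return a < sigma ? 0.5 * r * r / sigma : a - 0.5 * sigma;
}

double huber(double r, double delta) {
  const double a = std::fabs(r);
  return a <= delta ? 0.5 * r * r : delta * (a - 0.5 * delta);
}

int main() {
  const double delta = 2.0;
  for (double r : {0.3, 2.0, 5.0})
    std::printf("%g %g\n", delta * smooth_l1(r, delta), huber(r, delta));
  // each pair matches: 0.045 0.045, 2 2, 8 8
}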
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(increment); -USE_OP_DEVICE_KERNEL(increment, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - init.push_back(static_cast(1.0)); - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; - auto op = f::OpRegistry::CreateOp( - "increment", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attr_input); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1); - EXPECT_EQ(out_vec[0], static_cast(3.0)); -} - -TEST(increment, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} - -TEST(increment, NPU_fp64) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "increment"); -} diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc deleted file mode 100644 index 64a50041421b3b..00000000000000 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void IndexSampleGather(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* input, - phi::DenseTensor* out) { - auto index_dims = index->dims(); - auto input_dims = input->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector gather_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - gather_index_vec.push_back(i); - gather_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor gather_index; - framework::TensorFromVector(gather_index_vec, dev_ctx, &gather_index); - gather_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("GatherNd") - .AddInput(*input) - .AddInput(gather_index) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGather(dev_ctx, index, input, out); - } else { - IndexSampleGather(dev_ctx, index, input, out); - } - } -}; - -template -void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, - const phi::DenseTensor* index, - const phi::DenseTensor* out_grad, - phi::DenseTensor* x_grad) { - auto index_dims = index->dims(); - auto input_dims = x_grad->dims(); - auto batch_size = input_dims[0]; - auto index_length = index_dims[1]; - - std::vector scatter_index_vec; - std::vector index_vec; - framework::TensorToVector(*index, dev_ctx, &index_vec); - for (auto i = 0; i < batch_size; ++i) { - for (auto j = 0; j < index_length; j++) { - scatter_index_vec.push_back(i); - scatter_index_vec.push_back(index_vec[i * index_length + j]); - } - } - phi::DenseTensor scatter_index; - framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); - scatter_index.Resize({batch_size, index_length, 2}); - - NpuOpRunner runner; - runner.SetType("ScatterNd") - .AddInput(scatter_index) - .AddInput(*out_grad) - .AddInput(phi::vectorize(x_grad->dims())) - .AddOutput(*x_grad); - runner.Run(dev_ctx.stream()); -} - -template -class IndexSampleGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - x_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } else { - IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(index_sample, - ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel, 
- ops::IndexSampleNPUKernel, - ops::IndexSampleNPUKernel); -REGISTER_OP_NPU_KERNEL(index_sample_grad, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel, - ops::IndexSampleGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc deleted file mode 100644 index dd9c5608a0469d..00000000000000 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ /dev/null @@ -1,161 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class IndexSelectNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto dim = ctx.Attr("dim"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*x) - .AddInput(*index) - .AddInput(std::vector{dim}) - .AddOutput(*out); - runner.Run(stream); - } -}; - -template -class IndexSelectGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - - auto stream = - ctx.template device_context() - .stream(); - - auto x_dims = x_grad->dims(); - auto out_dims = out_grad->dims(); - - int dim = ctx.Attr("dim"); - if (dim < 0) { - dim += out_dims.size(); - } - - phi::DenseTensor casted_index; - if (framework::TransToProtoVarType(index->dtype()) != - framework::proto::VarType::INT32) { - casted_index.mutable_data(index->dims(), ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*index}, {casted_index}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_index.ShareDataWith(*index); - } - - if (dim == 0) { - x_grad->mutable_data(ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {*x_grad}, {*x_grad}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(*x_grad); - runner.Run(stream); - } else { - phi::DenseTensor transed_out_grad; - std::vector in_trans_perm; - in_trans_perm.push_back(dim); - for (int i = 0; i < out_dims.size(); ++i) { - if (i == dim) continue; - in_trans_perm.push_back(i); - } - framework::DDim transed_out_dims(out_dims); - for (size_t i = 0; i < in_trans_perm.size(); ++i) { - transed_out_dims[i] = out_dims[in_trans_perm[i]]; - } - transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - NpuOpRunner in_trans_runner; - 
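
// A minimal, self-contained sketch of the two permutations this grad kernel
// builds around UnsortedSegmentSum: `in_trans_perm` moves the reduced axis to
// the front, and `out_trans_perm` moves it back afterwards. The helper names
// here are illustrative only, not Paddle APIs.
#include <vector>

std::vector<int> MoveAxisToFrontPerm(int rank, int axis) {
  std::vector<int> perm;
  perm.reserve(rank);
  perm.push_back(axis);  // the segment axis becomes axis 0
  for (int i = 0; i < rank; ++i) {
    if (i != axis) perm.push_back(i);  // remaining axes keep their order
  }
  return perm;
}

std::vector<int> MoveFrontBackToAxisPerm(int rank, int axis) {
  std::vector<int> perm;
  perm.reserve(rank);
  for (int i = 1; i <= axis; ++i) perm.push_back(i);
  perm.push_back(0);  // axis 0 returns to position `axis`
  for (int i = axis + 1; i < rank; ++i) perm.push_back(i);
  return perm;
}
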
in_trans_runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(in_trans_perm)) - .AddOutput(transed_out_grad); - in_trans_runner.Run(stream); - - phi::DenseTensor sum_out; - framework::DDim sum_dims(x_dims); - sum_dims[0] = x_dims[dim]; - auto idx = 1; - for (int i = 0; i < x_dims.size(); ++i) { - if (i == dim) continue; - sum_dims[idx++] = x_dims[i]; - } - sum_out.mutable_data(sum_dims, ctx.GetPlace()); - const auto& zeros_runner = NpuOpRunner("ZerosLike", {sum_out}, {sum_out}); - zeros_runner.Run(stream); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(transed_out_grad) - .AddInput(casted_index) - .AddInput(std::vector{x_dims[dim]}) - .AddOutput(sum_out); - runner.Run(stream); - - std::vector out_trans_perm; - for (int i = 1; i < 1 + dim; ++i) { - out_trans_perm.push_back(i); - } - out_trans_perm.push_back(0); - for (int i = 1 + dim; i < x_dims.size(); ++i) { - out_trans_perm.push_back(i); - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner out_trans_runner; - out_trans_runner.SetType("Transpose") - .AddInput(sum_out) - .AddInput(std::move(out_trans_perm)) - .AddOutput(*x_grad); - out_trans_runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - index_select, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel, - ops::IndexSelectNPUKernel); -REGISTER_OP_NPU_KERNEL( - index_select_grad, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel, - ops::IndexSelectGradNPUKernel); diff --git a/paddle/fluid/operators/instance_norm_op_npu.cc b/paddle/fluid/operators/instance_norm_op_npu.cc deleted file mode 100644 index 03307895f09e23..00000000000000 --- a/paddle/fluid/operators/instance_norm_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class InstanceNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("SavedMean"); - auto* variance = ctx.Output("SavedVariance"); - auto& dev_ctx = ctx.template device_context(); - - dev_ctx.template Alloc(y); - dev_ctx.template Alloc(mean); - dev_ctx.template Alloc(variance); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - PADDLE_ENFORCE(x_dims.size() <= 5 && x_dims.size() >= 3, - platform::errors::InvalidArgument( - "InstanceNorm only supports the dimension of input " - " less equal to 5 and greater equal to 3. 
the dimension " - "of input is %d.", - x_dims.size())); - - auto tmp_x_dims = phi::vectorize(x_dims); - auto tmp_y_dims = phi::vectorize(y_dims); - if (x_dims.size() < 5) { - for (size_t i = x_dims.size(); i < 5; ++i) { - tmp_x_dims.insert(tmp_x_dims.begin() + 2, 1); - tmp_y_dims.insert(tmp_y_dims.begin() + 2, 1); - } - } - - phi::DenseTensor tmp_x, tmp_y; - tmp_x.ShareDataWith(*x); - - tmp_x.Resize(phi::make_ddim(tmp_x_dims)); - tmp_x.set_layout(phi::DataLayout::NCDHW); - tmp_y.ShareDataWith(*y); - tmp_y.Resize(phi::make_ddim(tmp_y_dims)); - tmp_y.set_layout(phi::DataLayout::NCDHW); - - NpuOpRunner runner; - - runner.SetType("InstanceNorm") - .AddInput(tmp_x) - .AddInput(*scale) - .AddInput(*bias) - .AddAttr("data_format", std::string("NCDHW")) - .AddAttr("epsilon", epsilon) - .AddOutput(tmp_y) - .AddOutput(*mean) - .AddOutput(*variance); - runner.Run(dev_ctx.stream()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - instance_norm, - ops::InstanceNormNPUKernel, - ops::InstanceNormNPUKernel); diff --git a/paddle/fluid/operators/interpolate_op_npu.cc b/paddle/fluid/operators/interpolate_op_npu.cc deleted file mode 100644 index 108efafff683f0..00000000000000 --- a/paddle/fluid/operators/interpolate_op_npu.cc +++ /dev/null @@ -1,226 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/interpolate_op.h" - -namespace paddle { -namespace operators { -using DataLayout = phi::DataLayout; - -inline static void CheckArgument(const framework::ExecutionContext& ctx) { - const std::string interp_method = ctx.Attr("interp_method"); -#if (CANN_VERSION_CODE < 512000) - bool align_corners = ctx.Attr("align_corners"); - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); -#endif - PADDLE_ENFORCE_EQ( - interp_method, - "nearest", - platform::errors::InvalidArgument( - "NPU Interpolate Kernel only support nearest interpolotion.")); -} - -inline static void ExtractNCHW(const framework::DDim& dims, - const DataLayout& data_layout, - int32_t* n, - int32_t* c, - int32_t* h, - int32_t* w) { - *n = dims[0]; - if (data_layout == DataLayout::kNCHW) { - *c = dims[1]; - *h = dims[2]; - *w = dims[3]; - } else { // kNHWC - *h = dims[1]; - *w = dims[2]; - *c = dims[3]; - } -} - -static void CalcOutSize(const framework::ExecutionContext& ctx, - int32_t in_h, - int32_t in_w, - int32_t* out_h, - int32_t* out_w) { - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - *out_h = ctx.Attr("out_h"); - *out_w = ctx.Attr("out_w"); - - auto dev_ctx = platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - - if (list_new_size_tensor.size() > 0) { - std::vector new_size_h(1); - std::vector new_size_w(1); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &new_size_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &new_size_w); - *out_h = new_size_h[0]; - *out_w = new_size_w[0]; - } else { - float scale; - auto scale_tensor = ctx.Input("Scale"); - if (scale_tensor != nullptr) { - std::vector scale_data; - framework::TensorToVector(*scale_tensor, *dev_ctx, &scale_data); - scale = scale_data[0]; - } else { - scale = ctx.Attr("scale"); - } - - if (scale > 0) { - *out_h = static_cast(in_h * scale); - *out_w = static_cast(in_w * scale); - } - - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - std::vector out_size_data; - framework::TensorToVector(*out_size, *dev_ctx, &out_size_data); - *out_h = out_size_data[0]; - *out_w = out_size_data[1]; - } - } - - PADDLE_ENFORCE_GT(*out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(*out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); -} - -template -class InterpolateNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'input' tensor may has no set (or wrong set) of the layout - phi::DenseTensor input_x(input->type()); - 
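
// A scalar sketch of the priority order the CalcOutSize helper above resolves
// (SizeTensor > OutSize > Scale input > scale attribute > out_h/out_w
// attributes). The struct and function names are illustrative only.
#include <optional>
#include <utility>

struct OutSizeSources {
  std::optional<std::pair<int, int>> size_tensor;  // "SizeTensor" inputs
  std::optional<std::pair<int, int>> out_size;     // "OutSize" input
  std::optional<float> scale_input;                // "Scale" input
  float scale_attr;                                // "scale" attribute
  int out_h_attr, out_w_attr;                      // fallback attributes
};

std::pair<int, int> ResolveOutSize(const OutSizeSources& s,
                                   int in_h, int in_w) {
  if (s.size_tensor) return *s.size_tensor;  // highest priority
  int h = s.out_h_attr, w = s.out_w_attr;
  const float scale = s.scale_input ? *s.scale_input : s.scale_attr;
  if (scale > 0) {
    h = static_cast<int>(in_h * scale);
    w = static_cast<int>(in_w * scale);
  }
  if (s.out_size) return *s.out_size;  // OutSize overrides any scale
  return {h, w};
}
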
input_x.ShareDataWith(*input); - input_x.set_layout(data_layout); - - auto* output = ctx.Output("Out"); - framework::DDim output_dims; - if (data_layout == DataLayout::kNCHW) { - output_dims = {n, c, out_h, out_w}; - } else { - output_dims = {n, out_h, out_w, c}; - } - output->set_layout(data_layout); - output->mutable_data(output_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2") - .AddInput(input_x) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -template -class InterpolateGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // NOTE(Ruibiao): - // this kernel only support nearest interpolotion for 2D images - // the Ascend 'ResizeNearestNeighborV2' used in this kernle has diff - // when 'align_corners' is 'true' or data type is 'double' - CheckArgument(ctx); - - auto* input = ctx.Input("X"); - framework::DDim input_dims = input->dims(); - - const std::string data_layout_str = - ctx.Attr("data_layout"); // kNCHW or kNHWC - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - - int32_t n, c, h, w, out_h, out_w; - ExtractNCHW(input_dims, data_layout, &n, &c, &h, &w); - CalcOutSize(ctx, h, w, &out_h, &out_w); - - // the 'output_grad' tensor may has no set (or wrong set) of the layout - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - phi::DenseTensor output_grad_tmp(output_grad->type()); - output_grad_tmp.ShareDataWith(*output_grad); - output_grad_tmp.set_layout(data_layout); - - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - input_grad->set_layout(data_layout); - framework::DDim input_grad_dims; - if (data_layout == DataLayout::kNCHW) { - input_grad_dims = {n, c, h, w}; - } else { - input_grad_dims = {n, h, w, c}; - } - input_grad->mutable_data(input_grad_dims, ctx.GetPlace()); - - NpuOpRunner npu_op_runner; - auto npu_stream = - ctx.template device_context() - .stream(); - npu_op_runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(output_grad_tmp) - .AddInput(std::vector{h, w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", false) - .AddAttr("half_pixel_centers", false) - .Run(npu_stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(nearest_interp, - ops::InterpolateNPUKernel, - ops::InterpolateNPUKernel); -REGISTER_OP_NPU_KERNEL(nearest_interp_grad, - ops::InterpolateGradNPUKernel); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc deleted file mode 100644 index d16494f229e42a..00000000000000 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ /dev/null @@ -1,812 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/interpolate_function.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; -using DDim = framework::DDim; -using fp16 = paddle::platform::float16; - -template -struct InterpolateFunction { - public: - explicit InterpolateFunction(const framework::ExecutionContext& ctx) - : ctx(ctx) { - place = ctx.GetPlace(); - stream = ctx.template device_context() - .stream(); - t0.mutable_data({1}, place); - t1.mutable_data({1}, place); - tn.mutable_data({1}, place); - FillNpuTensorWithConstant(&t0, static_cast(0)); - FillNpuTensorWithConstant(&t1, static_cast(1)); - } - void Arange(int n, phi::DenseTensor* x) { - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); - runner.Run(stream); - } - void ReduceSum(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& dim, - bool keep_dims = true) { - const auto& runner = NpuOpRunner( - "ReduceSumD", {*x}, {*y}, {{"axes", dim}, {"keep_dims", keep_dims}}); - runner.Run(stream); - } - void Add(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Mul(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Sub(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Cast(const phi::DenseTensor* x, phi::DenseTensor* y) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(y->dtype())); - const auto& runner = NpuOpRunner( - "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); - runner.Run(stream); - } - void Gather(const phi::DenseTensor* x, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}}); - runner.Run(stream); - } - void GatherGrad(const phi::DenseTensor* gy, - const phi::DenseTensor* indices, - const int axis, - phi::DenseTensor* gx) { - // 1 gy swapaxis: axis & 0 - int len = (gy->dims()).size(); - std::vector axis_swap(len); - for (int i = 0; i < len; i++) { - axis_swap[i] = i; - } - axis_swap[0] = axis; - axis_swap[axis] = 0; - auto y_new_shape = gy->dims(); - auto yt = y_new_shape[axis]; - y_new_shape[axis] = y_new_shape[0]; - y_new_shape[0] = yt; - phi::DenseTensor gy_t; - gy_t.mutable_data(y_new_shape, place); - Transpose(gy, &gy_t, axis_swap); - // 2 scatter - auto x_new_shape = gx->dims(); - auto xt = x_new_shape[axis]; - x_new_shape[axis] = x_new_shape[0]; - x_new_shape[0] = xt; - phi::DenseTensor gx_zero, gx_t; - gx_zero.mutable_data(x_new_shape, place); - gx_t.mutable_data(x_new_shape, place); - FillNpuTensorWithConstant(&gx_zero, static_cast(0)); - gx_zero.Resize(x_new_shape); - Scatter(&gx_zero, indices, &gy_t, &gx_t); - // 3 gx swapaxis: axis, 0 - Transpose(&gx_t, gx, axis_swap); - } - void Scatter(const phi::DenseTensor* x, - const phi::DenseTensor* index, - const phi::DenseTensor* updates, - phi::DenseTensor* y) { - const auto& runner = - NpuOpRunner("TensorScatterAdd", 
{*x, *index, *updates}, {*y}, {}); - runner.Run(stream); - } - void Transpose(const phi::DenseTensor* x, - phi::DenseTensor* y, - const std::vector& axis) { - const auto& runner = - NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); - runner.Run(stream); - } - void Muls(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); - runner.Run(stream); - } - void Maximum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Minimum(const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* z) { - const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); - runner.Run(stream); - } - void Floor(const phi::DenseTensor* x, phi::DenseTensor* y) { - const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {}); - runner.Run(stream); - } - - private: - platform::Place place; - aclrtStream stream; - const framework::ExecutionContext& ctx; - phi::DenseTensor t0; - phi::DenseTensor t1; - phi::DenseTensor tn; -}; - -template <> -void InterpolateFunction::Arange(int n, phi::DenseTensor* x) { - phi::DenseTensor x_fp32(phi::DataType::FLOAT32); - x_fp32.mutable_data(x->dims(), place); - FillNpuTensorWithConstant(&tn, static_cast(n)); - const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); - runner.Run(stream); - Cast(&x_fp32, x); -} - -void InterpolateParamCompute(const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout, - const DDim& indim, - const DDim& outdim, - int* axis_h, - int* axis_w, - int* in_h, - int* in_w, - int* out_h, - int* out_w, - float* ratio_h, - float* ratio_w) { - if (data_layout == DataLayout::kNCHW) { - *axis_h = 2; - *axis_w = 3; - } else { - *axis_h = 1; - *axis_w = 2; - } - *out_h = outdim[*axis_h]; - *out_w = outdim[*axis_w]; - *in_h = indim[*axis_h]; - *in_w = indim[*axis_w]; - *ratio_h = 0.0f; - *ratio_w = 0.0f; - if (*out_h > 1) { - *ratio_h = - align_corners - ? static_cast(*in_h - 1) / (*out_h - 1) - : (scale_h > 0 ? 1 / scale_h : static_cast(*in_h) / *out_h); - } - if (*out_w > 1) { - *ratio_w = - align_corners - ? static_cast(*in_w - 1) / (*out_w - 1) - : (scale_w > 0 ? 
1 / scale_w : static_cast(*in_w) / *out_w); - } -} - -template -void BilinearParamTensorCompute(const framework::ExecutionContext& ctx, - const DataLayout& data_layout, - int in_h, - int in_w, - int out_h, - int out_w, - bool align_cond, - float ratio_h, - float ratio_w, - phi::DenseTensor* h0, - phi::DenseTensor* h1, - phi::DenseTensor* w0, - phi::DenseTensor* w1, - phi::DenseTensor* coef_h0, - phi::DenseTensor* coef_h1, - phi::DenseTensor* coef_w0, - phi::DenseTensor* coef_w1) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - phi::DenseTensor _h0, _w0; - _h0.mutable_data({out_h}, place); - _w0.mutable_data({out_w}, place); - F.Arange(out_h, &_h0); - F.Arange(out_w, &_w0); - if (align_cond) { - F.Adds(&_h0, static_cast(0.5), &_h0); - F.Adds(&_w0, static_cast(0.5), &_w0); - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - F.Adds(&_h0, static_cast(-0.5), &_h0); - F.Adds(&_w0, static_cast(-0.5), &_w0); - } else { - F.Muls(&_h0, ratio_h, &_h0); - F.Muls(&_w0, ratio_w, &_w0); - } - - phi::DenseTensor zero_t; - phi::DenseTensor one_t; - zero_t.mutable_data({1}, place); - one_t.mutable_data({1}, place); - FillNpuTensorWithConstant(&zero_t, static_cast(0)); - FillNpuTensorWithConstant(&one_t, static_cast(1)); - F.Maximum(&_h0, &zero_t, &_h0); - F.Maximum(&_w0, &zero_t, &_w0); - - phi::DenseTensor _h0_floor, _w0_floor; - _h0_floor.mutable_data({out_h}, place); - _w0_floor.mutable_data({out_w}, place); - F.Floor(&_h0, &_h0_floor); - F.Floor(&_w0, &_w0_floor); - F.Cast(&_h0_floor, h0); - F.Cast(&_w0_floor, w0); - - phi::DenseTensor one_int; - one_int.mutable_data({1}, place); - FillNpuTensorWithConstant(&one_int, static_cast(1)); - F.Add(h0, &one_int, h1); - F.Add(w0, &one_int, w1); - phi::DenseTensor t_max_h, t_max_w; - t_max_h.mutable_data({1}, place); - t_max_w.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_max_h, static_cast(in_h - 1)); - FillNpuTensorWithConstant(&t_max_w, static_cast(in_w - 1)); - F.Minimum(h1, &t_max_h, h1); - F.Minimum(w1, &t_max_w, w1); - - F.Sub(&_h0, &_h0_floor, coef_h1); - F.Sub(&_w0, &_w0_floor, coef_w1); - F.Sub(&one_t, coef_h1, coef_h0); - F.Sub(&one_t, coef_w1, coef_w0); - - if (data_layout == DataLayout::kNCHW) { - coef_h0->Resize({out_h, 1}); - coef_h1->Resize({out_h, 1}); - } else { - coef_h0->Resize({out_h, 1, 1}); - coef_h1->Resize({out_h, 1, 1}); - coef_w0->Resize({out_w, 1}); - coef_w1->Resize({out_w, 1}); - } -} - -template -void BilinearFwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* input, - phi::DenseTensor* output, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = output->dims(); - auto indim = input->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - 
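
// A scalar sketch of the source coordinates and blending weights that the
// BilinearParamTensorCompute call below materialises as whole tensors. With
// align_mode == 0 and align_corners == false the half-pixel mapping is used,
// otherwise the plain `dst * ratio` mapping. Illustrative code only.
#include <algorithm>
#include <cmath>

struct AxisCoef {
  int lo, hi;        // the two neighbouring source indices
  float w_lo, w_hi;  // their blending weights, w_lo + w_hi == 1
};

AxisCoef MapAxis(int dst, float ratio, int in_size, bool half_pixel) {
  float src = half_pixel ? (dst + 0.5f) * ratio - 0.5f : dst * ratio;
  src = std::max(src, 0.0f);  // clamp below, as the Maximum op does
  AxisCoef c;
  c.lo = static_cast<int>(std::floor(src));
  c.hi = std::min(c.lo + 1, in_size - 1);  // clamp above, as Minimum does
  c.w_hi = src - std::floor(src);
  c.w_lo = 1.0f - c.w_hi;
  return c;
}
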
BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor input_gather_h0, input_gather_h1; - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - input_gather_h0.mutable_data(dim_gather_h, place); - input_gather_h1.mutable_data(dim_gather_h, place); - - F.Gather(input, &h0, axis_h, &input_gather_h0); - F.Gather(input, &h1, axis_h, &input_gather_h1); - - F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0); - F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1); - phi::DenseTensor out_x4; - out_x4.mutable_data({4, outdim[0], outdim[1], outdim[2], outdim[3]}, - place); - phi::DenseTensor input_gather_h0_w0 = out_x4.Slice(0, 1); - phi::DenseTensor input_gather_h0_w1 = out_x4.Slice(1, 2); - phi::DenseTensor input_gather_h1_w0 = out_x4.Slice(2, 3); - phi::DenseTensor input_gather_h1_w1 = out_x4.Slice(3, 4); - F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0); - F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1); - F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0); - F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1); - F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0); - F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1); - F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0); - F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1); - F.ReduceSum(&out_x4, output, std::vector{0}, false); -} - -template -void BilinearBwdNpu(const framework::ExecutionContext& ctx, - const phi::DenseTensor* gout, - phi::DenseTensor* gin, - const float scale_h, - const float scale_w, - const bool align_corners, - const int align_mode, - const DataLayout& data_layout) { - InterpolateFunction F(ctx); - auto place = ctx.GetPlace(); - auto outdim = gout->dims(); - auto indim = gin->dims(); - - int axis_h, axis_w; - int out_h, out_w, in_h, in_w; - float ratio_h, ratio_w; - InterpolateParamCompute(scale_h, - scale_w, - align_corners, - align_mode, - data_layout, - indim, - outdim, - &axis_h, - &axis_w, - &in_h, - &in_w, - &out_h, - &out_w, - &ratio_h, - &ratio_w); - - phi::DenseTensor h0, h1, w0, w1; - h0.mutable_data({out_h}, place); - h1.mutable_data({out_h}, place); - w0.mutable_data({out_w}, place); - w1.mutable_data({out_w}, place); - phi::DenseTensor coef_h0, coef_h1, coef_w0, coef_w1; - coef_h0.mutable_data({out_h}, place); - coef_h1.mutable_data({out_h}, place); - coef_w0.mutable_data({out_w}, place); - coef_w1.mutable_data({out_w}, place); - bool align_cond = align_mode == 0 && !align_corners; - BilinearParamTensorCompute(ctx, - data_layout, - in_h, - in_w, - out_h, - out_w, - align_cond, - ratio_h, - ratio_w, - &h0, - &h1, - &w0, - &w1, - &coef_h0, - &coef_h1, - &coef_w0, - &coef_w1); - - phi::DenseTensor gy_w0, gy_w1; - gy_w0.mutable_data(outdim, place); - gy_w1.mutable_data(outdim, place); - F.Mul(gout, &coef_w0, &gy_w0); - F.Mul(gout, &coef_w1, &gy_w1); - - auto dim_gather_h = indim; - dim_gather_h[axis_h] = out_h; - phi::DenseTensor g_gather_w0, g_gather_w1; - g_gather_w0.mutable_data(dim_gather_h, place); - g_gather_w1.mutable_data(dim_gather_h, place); - w0.Resize({out_w, 1}); - w1.Resize({out_w, 1}); - F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0); - F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1); - - F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0); - F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1); - F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0); - - phi::DenseTensor gx_0, gx_1; - 
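
// A scalar sketch of the gradient rule that the Mul/GatherGrad/Add sequence
// in this function realises: every upstream gradient element is distributed
// over the four neighbouring input pixels with the forward bilinear weights
// (indices and weights as in the MapAxis sketch above). Illustrative only.
void ScatterBilinearGrad(float gy,
                         int h_lo, int h_hi, float wh_lo, float wh_hi,
                         int w_lo, int w_hi, float ww_lo, float ww_hi,
                         int in_w, float* gx /* size in_h * in_w, zeroed */) {
  gx[h_lo * in_w + w_lo] += gy * wh_lo * ww_lo;
  gx[h_lo * in_w + w_hi] += gy * wh_lo * ww_hi;
  gx[h_hi * in_w + w_lo] += gy * wh_hi * ww_lo;
  gx[h_hi * in_w + w_hi] += gy * wh_hi * ww_hi;
}
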
gx_0.mutable_data(indim, place); - gx_1.mutable_data(indim, place); - h0.Resize({out_h, 1}); - h1.Resize({out_h, 1}); - F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0); - F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1); - - F.Add(&gx_0, &gx_1, gin); -} - -template -class InterpolateV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ( - input_dims.size(), - 4UL, - platform::errors::External( - "NPU Interpolate Kernel only support 4-D phi::DenseTensor.")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_shape_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_shape_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_shape_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_h = scale_data[0]; - scale_w = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) 
{ - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - PADDLE_ENFORCE_GT(out_h, - 0, - platform::errors::InvalidArgument( - "out_h in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - PADDLE_ENFORCE_GT(out_w, - 0, - platform::errors::InvalidArgument( - "out_w in Attr(out_shape) of Op(interpolate) " - "should be greater than 0.")); - framework::DDim dim_out; - if (data_layout == DataLayout::kNCHW) { - dim_out = {n, c, out_h, out_w}; - } else { - dim_out = {n, out_h, out_w, c}; - } - output->mutable_data(dim_out, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeD - // Add bilineare by zhulei - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2") - .AddInput(*input) - .AddInput(std::vector{out_h, out_w}) - .AddOutput(*output) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearFwdNpu(ctx, - input, - output, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -template -class InterpolateV2NPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - int n, c, in_d, in_h, in_w; - phi::funcs::ExtractNCDWH( - input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); - - auto interp_method = ctx.Attr("interp_method"); - bool align_corners = ctx.Attr("align_corners"); - - // To-do(qili93): need to support align_corners = true case, try ReSizeD - PADDLE_ENFORCE_EQ( - align_corners, - false, - platform::errors::InvalidArgument( - "NPU Interpolate Kernel has diff when align_corners is true.")); - - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale_h = -1; - float scale_w = -1; - - // Priority: SizeTensor > OutSize > Scale > scale > out_h & out_w - auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); - if (list_new_size_tensor.size() > 0) { - std::vector output_h(1); - std::vector output_w(1); - auto dev_ctx = - platform::DeviceContextPool::Instance().Get(ctx.GetPlace()); - framework::TensorToVector(*list_new_size_tensor[0], *dev_ctx, &output_h); - framework::TensorToVector(*list_new_size_tensor[1], *dev_ctx, &output_w); - out_h = output_h[0]; - out_w = output_w[0]; - } else if (ctx.HasInput("OutSize")) { - auto out_size = ctx.Input("OutSize"); - auto out_size_data = phi::funcs::get_new_data_from_tensor(out_size); - out_h = out_size_data[0]; - out_w = out_size_data[1]; - } else { - auto scale_tensor = ctx.Input("Scale"); - auto scale = ctx.Attr>("scale"); - if (scale_tensor != nullptr) { - auto scale_data = - phi::funcs::get_new_data_from_tensor(scale_tensor); - if (scale_data.size() > 1) { - scale_h = scale_data[0]; - scale_w = scale_data[1]; - } else { - scale_w = scale_data[0]; - scale_h = scale_data[0]; - } - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in input 'Scale' 
phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in input 'Scale' phi::DenseTensor of " - "Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } else { - if (scale.size() > 1) { - scale_h = scale[0]; - scale_w = scale[1]; - PADDLE_ENFORCE_EQ( - scale_w > 0, - true, - platform::errors::InvalidArgument( - "The scale_w in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_w)); - PADDLE_ENFORCE_EQ( - scale_h > 0, - true, - platform::errors::InvalidArgument( - "The scale_h in Attr(scale) of Operator(interpolate) " - "should be greater than 0, but received value is %d.", - scale_h)); - } - } - if (scale_h > 0. && scale_w > 0.) { - out_h = static_cast(in_h * scale_h); - out_w = static_cast(in_w * scale_w); - } - } - - framework::DDim dim_grad; - if (data_layout == DataLayout::kNCHW) { - dim_grad = {n, c, in_h, in_w}; - } else { - dim_grad = {n, in_h, in_w, c}; - } - - input_grad->mutable_data(dim_grad, ctx.GetPlace()); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; - } - - auto stream = - ctx.template device_context() - .stream(); - - // To-do(qili93): need to support bilineare, try ResizeGradD - if ("nearest" == interp_method) { - NpuOpRunner runner; - runner.SetType("ResizeNearestNeighborV2Grad") - .AddInput(*output_grad) - .AddInput(std::vector{in_h, in_w}) - .AddOutput(*input_grad) - .AddAttr("align_corners", align_corners) - .AddAttr("half_pixel_centers", false); - runner.Run(stream); - } else if ("bilinear" == interp_method) { - int align_mode = ctx.Attr("align_mode"); - BilinearBwdNpu(ctx, - output_grad, - input_grad, - scale_h, - scale_w, - align_corners, - align_mode, - data_layout); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - nearest_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2, - ops::InterpolateV2NPUKernel, - ops::InterpolateV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - bilinear_interp_v2_grad, - ops::InterpolateV2NPUGradKernel, - ops::InterpolateV2NPUGradKernel); diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc deleted file mode 100644 index 91a0698d626f55..00000000000000 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/is_empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - is_empty, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc deleted file mode 100644 index d2b4626c58cb47..00000000000000 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ /dev/null @@ -1,170 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class KLDivLossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - loss->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - if ("none" == reduction) { - // log(label) - auto ones_tensor = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& ones_runner = - NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {}); - ones_runner.Run(stream); - - auto sub_tensor = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& sub_runner = - NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {}); - sub_runner.Run(stream); - - auto log_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& log_runner = - NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {}); - log_runner.Run(stream); - - // log(label) - input - const auto& sub_runner2 = - NpuOpRunner("Sub", {log_target, *input}, {*loss}, {}); - sub_runner2.Run(stream); - - // label * (log(label) - input) - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = - NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {}); - mul_runner.Run(stream); - } else if ("batchmean" == reduction || "sum" == reduction) { - const auto& runner = NpuOpRunner( - "KLDiv", {*input, *target}, {*loss}, {{"reduction", reduction}}); - runner.Run(stream); - } else if ("mean" == reduction) { - const auto& runner = NpuOpRunner("KLDiv", - {*input, *target}, - {*loss}, - {{"reduction", std::string("sum")}}); - runner.Run(stream); - - const int numel = input->numel(); - const auto& muls_runner = - NpuOpRunner("Muls", - {*loss}, - {*loss}, - {{"value", static_cast(1.0 / numel)}}); - muls_runner.Run(stream); - } - } -}; - -template -class KLDivLossGradNPUKernel : public 
framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* target = ctx.Input("Target"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto reduction = ctx.Attr("reduction"); - input_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); - - phi::DenseTensor loss_grad_transformed; - if ("none" == reduction) { - loss_grad_transformed.ShareDataWith(*loss_grad); - } else { - loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); - - NpuOpRunner broadcast_runner; - broadcast_runner.SetType("BroadcastTo"); - broadcast_runner.AddInput(*loss_grad); - broadcast_runner.AddInput(phi::vectorize(input_grad->dims())); - broadcast_runner.AddOutput(loss_grad_transformed); - broadcast_runner.Run(stream); - } - auto min_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - auto max_value = - ctx.AllocateTmpTensor({1}, dev_ctx); - FillNpuTensorWithConstant(&min_value, static_cast(0)); - FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); - - auto cliped_target = ctx.AllocateTmpTensor( - target->dims(), dev_ctx); - const auto& clip_runner = NpuOpRunner( - "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); - clip_runner.Run(stream); - - const auto& mul_runner = NpuOpRunner( - "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); - mul_runner.Run(stream); - - float k = -1.0f; - - if ("mean" == reduction) { - k = static_cast(-1.0 / input_grad->numel()); - } else if ("batchmean" == reduction) { - k = static_cast(-1.0 / input_grad->dims()[0]); - } - - const auto& muls_runner = - NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); - muls_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(kldiv_loss, - ops::KLDivLossNPUKernel, - ops::KLDivLossNPUKernel); - -REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, - ops::KLDivLossGradNPUKernel, - ops::KLDivLossGradNPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc deleted file mode 100644 index 5c267625f55f74..00000000000000 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
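
// A minimal scalar sketch of what the label_smooth kernels below compute.
// With a PriorDist input the smoothed label blends towards that prior;
// without one it blends towards the uniform distribution 1 / num_classes.
// The backward pass simply scales the upstream gradient by (1 - epsilon).
// Function names are illustrative only.
float SmoothLabel(float x, float epsilon, float prior) {
  return (1.0f - epsilon) * x + epsilon * prior;
}
float SmoothLabelUniform(float x, float epsilon, int num_classes) {
  return (1.0f - epsilon) * x + epsilon / num_classes;
}
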
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -void LabelSmoothMuls(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Muls", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAdds(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in, - float val, - phi::DenseTensor* out) { - out->mutable_data(in->dims(), place); - const auto& runner = NpuOpRunner("Adds", {*in}, {*out}, {{"value", val}}); - runner.Run(stream); -} - -template -void LabelSmoothAddBroadCast(const platform::Place& place, - const aclrtStream& stream, - const phi::DenseTensor* in1, - const phi::DenseTensor* in2, - phi::DenseTensor* out) { - out->mutable_data(place); - const auto& runner = NpuOpRunner("AddV2", {*in1, *in2}, {*out}, {}); - runner.Run(stream); -} - -template -class LabelSmoothNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto epsilon = ctx.Attr("epsilon"); - - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - if (dist_t) { - phi::DenseTensor tmp; - phi::DenseTensor dist; - phi::DenseTensor tmp2; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothMuls(place, stream, dist_t, epsilon, &tmp2); - tmp2.Resize({1, label_dim}); - LabelSmoothAddBroadCast(place, stream, &tmp, &tmp2, out_t); - } else { - phi::DenseTensor tmp; - LabelSmoothMuls(place, stream, in_t, (1 - epsilon), &tmp); - LabelSmoothAdds(place, stream, &tmp, (epsilon / label_dim), out_t); - } - } -}; - -template -class LabelSmoothGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - auto epsilon = ctx.Attr("epsilon"); - - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - - LabelSmoothMuls(place, stream, d_out_t, 1 - epsilon, d_in_t); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(label_smooth, - ops::LabelSmoothNPUKernel, - ops::LabelSmoothNPUKernel); -REGISTER_OP_NPU_KERNEL(label_smooth_grad, - ops::LabelSmoothGradNPUKernel, - ops::LabelSmoothGradNPUKernel); diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc deleted file mode 100644 index ca6762f2e325a5..00000000000000 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ /dev/null @@ -1,449 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -using DataLayout = phi::DataLayout; - -template -class NormDataType; - -template <> -class NormDataType { - public: - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template <> -class NormDataType { - public: - using ScalingParamType = const float; - using BatchNormParamType = float; -}; - -template -using NormDataType = NormDataType; -template -using LayerNormParamType = typename NormDataType::BatchNormParamType; - -template -class LayerNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using U = LayerNormParamType; - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - const auto epsilon = ctx.Attr("epsilon"); - const auto* x = ctx.Input("X"); - const auto* scale = ctx.Input("Scale"); - const auto* bias = ctx.Input("Bias"); - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* variance = ctx.Output("Variance"); - const auto& x_dims = x->dims(); - std::vector axes; - auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); - int right = static_cast(matrix_dim[1]); - - // The shape of scale and bias should be equal to x.shape[begin_norm_axis:], - // required by Ascend. - for (auto i = begin_norm_axis; i < x_dims.size(); ++i) { - axes.push_back(x_dims[i]); - } - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor default_scale(x->type()); - if (!scale) { - default_scale.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(1.0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); - runner.Run(stream); - scale = &default_scale; - } else { - const_cast(scale)->Resize(phi::make_ddim(axes)); - } - - phi::DenseTensor default_bias(x->type()); - if (!bias) { - default_bias.mutable_data(phi::make_ddim(axes), place); - phi::DenseTensor value(x->type()); - value.mutable_data({1}, place); - FillNpuTensorWithConstant(&value, static_cast(0)); - const auto& runner = - NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); - runner.Run(stream); - bias = &default_bias; - } else { - const_cast(bias)->Resize(phi::make_ddim(axes)); - } - - // cast scale from LayerNormParamType to T if needed - phi::DenseTensor cast_scale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32) { - cast_scale.Resize(scale->dims()); - cast_scale.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_scale = - NpuOpRunner("Cast", - {*scale}, - {cast_scale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_scale.Run(stream); - } else { - cast_scale.ShareDataWith(*scale); - } - - // cast bias from LayerNormParamType to T if needed - phi::DenseTensor cast_bias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32) { - 
cast_bias.Resize(bias->dims()); - cast_bias.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_bias = - NpuOpRunner("Cast", - {*bias}, - {cast_bias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_bias.Run(stream); - } else { - cast_bias.ShareDataWith(*bias); - } - - y->mutable_data(ctx.GetPlace()); - - // mean should be of U type - phi::DenseTensor* tmp_mean = mean; - phi::DenseTensor cast_mean(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_mean.Resize(mean->dims()); - cast_mean.mutable_data(ctx.GetPlace()); - tmp_mean = &cast_mean; - mean->mutable_data(ctx.GetPlace()); - } else { - mean->mutable_data(ctx.GetPlace()); - } - - // same for variance - phi::DenseTensor* tmp_variance = variance; - phi::DenseTensor cast_variance(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(scale->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(bias->dtype()) == - framework::proto::VarType::FP32)) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - tmp_variance = &cast_variance; - variance->mutable_data(ctx.GetPlace()); - } else { - variance->mutable_data(ctx.GetPlace()); - } - - const auto& runner = NpuOpRunner("LayerNorm", - {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(mean->type())); - const auto& runner_cast_mean = - NpuOpRunner("Cast", - {*tmp_mean}, - {*mean}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_mean.Run(stream); - } - // same for variance - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(variance->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*tmp_variance}, - {*variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } - - // revert shape of scale and bias - // TODO(zhiqiu): better implementation, use tmp tensor to avoid write input - // tensor. 
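
// A scalar sketch of what the "LayerNorm" runner above computes for one row
// of the flattened [left, right] view of X: normalise over the trailing
// `right` elements, then apply the per-element scale and bias. The biased
// variance matches the saved Variance output. Illustrative code only.
#include <cmath>

void LayerNormRow(const float* x, const float* scale, const float* bias,
                  int right, float epsilon, float* y,
                  float* mean_out, float* var_out) {
  float mean = 0.0f, var = 0.0f;
  for (int i = 0; i < right; ++i) mean += x[i];
  mean /= right;
  for (int i = 0; i < right; ++i) var += (x[i] - mean) * (x[i] - mean);
  var /= right;
  const float inv_std = 1.0f / std::sqrt(var + epsilon);
  for (int i = 0; i < right; ++i) {
    y[i] = (x[i] - mean) * inv_std * scale[i] + bias[i];
  }
  *mean_out = mean;
  *var_out = var;
}
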
-    const_cast<phi::DenseTensor*>(scale)->Resize(phi::make_ddim({right}));
-    const_cast<phi::DenseTensor*>(bias)->Resize(phi::make_ddim({right}));
-  }
-};
-
-template <typename T>
-class LayerNormGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    using U = LayerNormParamType<T>;
-    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
-    const auto* x = ctx.Input<phi::DenseTensor>("X");
-    const auto& x_dims = x->dims();
-    const auto* mean = ctx.Input<phi::DenseTensor>("Mean");
-    const auto* variance = ctx.Input<phi::DenseTensor>("Variance");
-    const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
-    const auto* dy = ctx.Input<phi::DenseTensor>(framework::GradVarName("Y"));
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-    auto* dscale =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Scale"));
-    auto* dbias = ctx.Output<phi::DenseTensor>(framework::GradVarName("Bias"));
-
-    auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
-    int right = static_cast<int>(matrix_dim[1]);
-
-    std::vector<int> axes;
-    for (auto i = begin_norm_axis; i < x_dims.size(); ++i) {
-      axes.push_back(x_dims[i]);
-    }
-
-    auto place = ctx.GetPlace();
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    // No need to compute any gradient, just return
-    if (!dx && !dscale && !dbias) {
-      return;
-    }
-
-    // The rank of mean should be equal to x, required by Ascend.
-    std::vector<int> new_shape;
-    for (auto i = 0; i < begin_norm_axis; ++i) {
-      new_shape.push_back(x_dims[i]);
-    }
-    for (auto i = begin_norm_axis; i < x_dims.size(); ++i) {
-      new_shape.push_back(1);
-    }
-
-    auto mean_dims = mean->dims();
-    const_cast<phi::DenseTensor*>(mean)->Resize(phi::make_ddim({new_shape}));
-    const_cast<phi::DenseTensor*>(variance)->Resize(
-        phi::make_ddim({new_shape}));
-
-    phi::DenseTensor default_scale(x->type());
-    if (!scale) {
-      default_scale.mutable_data<T>(phi::make_ddim(axes), place);
-      phi::DenseTensor value(x->type());
-      value.mutable_data<T>({1}, place);
-      FillNpuTensorWithConstant<T>(&value, static_cast<T>(1.0));
-      const auto& runner =
-          NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}});
-      runner.Run(stream);
-      scale = &default_scale;
-    } else {
-      const_cast<phi::DenseTensor*>(scale)->Resize(phi::make_ddim(axes));
-    }
-
-    // cast scale from LayerNormParamType to T if needed
-    phi::DenseTensor cast_scale(x->type());
-    if (framework::TransToProtoVarType(x->dtype()) ==
-            framework::proto::VarType::FP16 &&
-        framework::TransToProtoVarType(scale->dtype()) ==
-            framework::proto::VarType::FP32) {
-      cast_scale.Resize(scale->dims());
-      cast_scale.mutable_data<T>(ctx.GetPlace());
-      auto dst_dtype =
-          ConvertToNpuDtype(framework::TransToProtoVarType(x->type()));
-      const auto& runner_cast_scale =
-          NpuOpRunner("Cast",
-                      {*scale},
-                      {cast_scale},
-                      {{"dst_type", static_cast<int>(dst_dtype)}});
-      runner_cast_scale.Run(stream);
-    } else {
-      cast_scale.ShareDataWith(*scale);
-    }
-
-    // cast mean from LayerNormParamType to T if needed
-    phi::DenseTensor cast_mean(x->type());
-    if (framework::TransToProtoVarType(x->dtype()) ==
-            framework::proto::VarType::FP16 &&
-        framework::TransToProtoVarType(mean->dtype()) ==
-            framework::proto::VarType::FP32) {
-      cast_mean.Resize(mean->dims());
-      cast_mean.mutable_data<T>(ctx.GetPlace());
-      auto dst_dtype =
-          ConvertToNpuDtype(framework::TransToProtoVarType(x->type()));
-      const auto& runner_cast_mean =
-          NpuOpRunner("Cast",
-                      {*mean},
-                      {cast_mean},
-                      {{"dst_type", static_cast<int>(dst_dtype)}});
-      runner_cast_mean.Run(stream);
-    } else {
-      cast_mean.ShareDataWith(*mean);
-    }
-
-    // cast variance from LayerNormParamType to T if needed
-    phi::DenseTensor cast_variance(x->type());
-    if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::FP16 && - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32) { - cast_variance.Resize(variance->dims()); - cast_variance.mutable_data(ctx.GetPlace()); - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(x->type())); - const auto& runner_cast_variance = - NpuOpRunner("Cast", - {*variance}, - {cast_variance}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_variance.Run(stream); - } else { - cast_variance.ShareDataWith(*variance); - } - - phi::DenseTensor dx_(dy->type()), dscale_(dy->type()), dbias_(dy->type()); - dx = (dx == nullptr) ? &dx_ : dx; - dscale = (dscale == nullptr) ? &dscale_ : dscale; - dbias = (dbias == nullptr) ? &dbias_ : dbias; - - dx->Resize(x->dims()); - dx->mutable_data(ctx.GetPlace()); - - dscale->Resize(phi::make_ddim(axes)); - - dbias->Resize(phi::make_ddim(axes)); - - // dscale should be of U type - phi::DenseTensor* tmp_dscale = dscale; - phi::DenseTensor cast_dscale(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dscale.Resize(dscale->dims()); - cast_dscale.mutable_data(ctx.GetPlace()); - tmp_dscale = &cast_dscale; - dscale->mutable_data(ctx.GetPlace()); - } else { - dscale->mutable_data(ctx.GetPlace()); - } - - // same for dbias - phi::DenseTensor* tmp_dbias = dbias; - phi::DenseTensor cast_dbias(x->type()); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - (framework::TransToProtoVarType(mean->dtype()) == - framework::proto::VarType::FP32 || - framework::TransToProtoVarType(variance->dtype()) == - framework::proto::VarType::FP32)) { - cast_dbias.Resize(dbias->dims()); - cast_dbias.mutable_data(ctx.GetPlace()); - tmp_dbias = &cast_dbias; - dbias->mutable_data(ctx.GetPlace()); - } else { - dbias->mutable_data(ctx.GetPlace()); - } - - const auto& runner = - NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, - {}); - runner.Run(stream); - - // cast back from FP16 to FP32 - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dscale->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dscale->type())); - const auto& runner_cast_dscale = - NpuOpRunner("Cast", - {*tmp_dscale}, - {*dscale}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dscale.Run(stream); - } - // same for dbias - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(dbias->dtype()) == - framework::proto::VarType::FP32) { - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(dbias->type())); - const auto& runner_cast_dbias = - NpuOpRunner("Cast", - {*tmp_dbias}, - {*dbias}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_dbias.Run(stream); - } - - const_cast(mean)->Resize(mean_dims); - const_cast(variance)->Resize(mean_dims); - const_cast(scale)->Resize(phi::make_ddim({right})); - dscale->Resize(phi::make_ddim({right})); - dbias->Resize(phi::make_ddim({right})); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(layer_norm, - 
ops::LayerNormNPUKernel, - ops::LayerNormNPUKernel); -REGISTER_OP_NPU_KERNEL(layer_norm_grad, - ops::LayerNormGradNPUKernel, - ops::LayerNormGradNPUKernel); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc deleted file mode 100644 index 4b9b96c23b0b71..00000000000000 --- a/paddle/fluid/operators/load_combine_op_npu.cc +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/load_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - load_combine, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel, - ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc deleted file mode 100644 index 0e8517fd7b5296..00000000000000 --- a/paddle/fluid/operators/load_op_npu.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -namespace paddle { -namespace operators { -template -class LoadOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto place = ctx.GetPlace(); - // FIXME(yuyang18): We save variable to local file now, but we should change - // it to save an output stream. 
-    auto filename = ctx.Attr<std::string>("file_path");
-    std::ifstream fin(filename, std::ios::binary);
-    PADDLE_ENFORCE_EQ(static_cast<bool>(fin),
-                      true,
-                      platform::errors::Unavailable(
-                          "Load operator failed to open file %s, please check "
-                          "whether the model file is complete or damaged.",
-                          filename));
-
-    auto out_var_name = ctx.OutputNames("Out").data();
-    auto *out_var = ctx.OutputVar("Out");
-
-    PADDLE_ENFORCE_NOT_NULL(
-        out_var,
-        platform::errors::InvalidArgument(
-            "The variable %s to be loaded cannot be found.", out_var_name));
-
-    if (out_var->IsType<phi::DenseTensor>()) {
-      LoadLodTensor(fin, place, out_var, ctx);
-    } else if (out_var->IsType<phi::SelectedRows>()) {
-      LoadSelectedRows(fin, place, out_var);
-    } else {
-      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Load operator only supports loading phi::DenseTensor and "
-          "SelectedRows "
-          "variables, but %s has the wrong type",
-          out_var_name));
-    }
-  }
-
-  void LoadLodTensor(std::istream &fin,
-                     const platform::Place &place,
-                     framework::Variable *var,
-                     const framework::ExecutionContext &ctx) const {
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    auto *tensor = var->GetMutable<phi::DenseTensor>();
-
-    auto seek = ctx.Attr<int64_t>("seek");
-
-    if (seek != -1) {
-      PADDLE_ENFORCE_GE(seek,
-                        0,
-                        platform::errors::InvalidArgument(
-                            "seek with tensor must be greater than or equal to 0"));
-      auto shape = ctx.Attr<std::vector<int64_t>>("shape");
-      paddle::framework::DeserializeFromStream(
-          fin, tensor, dev_ctx, seek, shape);
-    } else {
-      paddle::framework::DeserializeFromStream(fin, tensor, dev_ctx);
-    }
-
-    auto load_as_fp16 = ctx.Attr<bool>("load_as_fp16");
-    auto in_dtype = tensor->dtype();
-    auto out_dtype = load_as_fp16 ? phi::DataType::FLOAT16 : in_dtype;
-
-    if (in_dtype != out_dtype) {
-      // convert to float16 tensor
-      auto in_kernel_type =
-          phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, in_dtype);
-      auto out_kernel_type =
-          phi::KernelKey(place, phi::DataLayout::ALL_LAYOUT, out_dtype);
-      phi::DenseTensor fp16_tensor;
-      // copy LoD info to the new tensor
-      fp16_tensor.set_lod(tensor->lod());
-      framework::TransDataType(
-          in_kernel_type, out_kernel_type, *tensor, &fp16_tensor);
-
-      // reset output tensor
-      var->Clear();
-      tensor = var->GetMutable<phi::DenseTensor>();
-      tensor->set_lod(fp16_tensor.lod());
-      tensor->ShareDataWith(fp16_tensor);
-    }
-  }
-
-  void LoadSelectedRows(std::istream &fin,
-                        const platform::Place &place,
-                        framework::Variable *var) const {
-    auto *selectedRows = var->GetMutable<phi::SelectedRows>();
-    // get device context from pool
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(place);
-    framework::DeserializeFromStream(fin, selectedRows, dev_ctx);
-    selectedRows->SyncIndex();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    load,
-    ops::LoadOpKernel,
-    ops::LoadOpKernel,
-    ops::LoadOpKernel,
-    ops::LoadOpKernel,
-    ops::LoadOpKernel);
diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc
deleted file mode 100644
index 0eb4ebe2442c1f..00000000000000
--- a/paddle/fluid/operators/log_loss_op_npu.cc
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <cmath>
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-void LogLossAdds(const platform::Place& place,
-                 const aclrtStream& stream,
-                 const phi::DenseTensor* x,
-                 float scale,
-                 phi::DenseTensor* y) {
-  // Calculate y = x + scale
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scale}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void LogLossMuls(const platform::Place& place,
-                 const aclrtStream& stream,
-                 const phi::DenseTensor* x,
-                 float scale,
-                 phi::DenseTensor* y) {
-  // Calculate y = x * scale
-  y->mutable_data<T>(x->dims(), place);
-  const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scale}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void LogLossBCE(const platform::Place& place,
-                const aclrtStream& stream,
-                const phi::DenseTensor* x,
-                const phi::DenseTensor* y,
-                phi::DenseTensor* z) {
-  z->mutable_data<T>(x->dims(), place);
-  const auto& runner =
-      NpuOpRunner("BinaryCrossEntropy",
-                  {*x, *y},
-                  {*z},
-                  {{"reduction", static_cast<std::string>("none")}});
-  runner.Run(stream);
-}
-
-template <typename T>
-void LogLossBCEGrad(const platform::Place& place,
-                    const aclrtStream& stream,
-                    const phi::DenseTensor* x,
-                    const phi::DenseTensor* y,
-                    const phi::DenseTensor* dout,
-                    phi::DenseTensor* dx) {
-  dx->mutable_data<T>(x->dims(), place);
-  const auto& runner =
-      NpuOpRunner("BinaryCrossEntropyGrad",
-                  {*x, *y, *dout},
-                  {*dx},
-                  {{"reduction", static_cast<std::string>("none")}});
-  runner.Run(stream);
-}
-
-template <typename T, typename AttrType = T>
-class LogLossNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* y = ctx.Output<phi::DenseTensor>("Loss");
-    auto* pred = ctx.Input<phi::DenseTensor>("Predicted");
-    auto* label = ctx.Input<phi::DenseTensor>("Labels");
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto place = ctx.GetPlace();
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    float factor = 1 / (1 + 2 * epsilon);
-    float coef = std::log(factor);
-    LogLossAdds<T>(place, stream, pred, epsilon, y);
-    LogLossMuls<T>(place, stream, y, factor, y);
-    LogLossBCE<T>(place, stream, y, label, y);
-    LogLossAdds<T>(place, stream, y, coef, y);
-  }
-};
-
-template <typename T, typename AttrType = T>
-class LogLossGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* pred = ctx.Input<phi::DenseTensor>("Predicted");
-    auto* label = ctx.Input<phi::DenseTensor>("Labels");
-    auto* dloss = ctx.Input<phi::DenseTensor>(framework::GradVarName("Loss"));
-    auto* dpred =
-        ctx.Output<phi::DenseTensor>(framework::GradVarName("Predicted"));
-    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
-
-    auto place = ctx.GetPlace();
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    if (dpred) {
-      LogLossBCEGrad<T>(place, stream, pred, label, dloss, dpred);
-      LogLossMuls<T>(place, stream, dpred, 1 / (1 + 2 * epsilon), dpred);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(log_loss, ops::LogLossNPUKernel<float>);
-
-REGISTER_OP_NPU_KERNEL(log_loss_grad, ops::LogLossGradNPUKernel<float>);
diff --git a/paddle/fluid/operators/log_softmax_op_npu.cc
b/paddle/fluid/operators/log_softmax_op_npu.cc deleted file mode 100644 index 34f9c11e066a75..00000000000000 --- a/paddle/fluid/operators/log_softmax_op_npu.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class LogSoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Out = ctx.Output("Out"); - const int rank = X->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - Out->mutable_data(ctx.GetPlace()); - - if (X->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "LogSoftmaxV2", {*X}, {*Out}, {{"axes", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -template -class LogSoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - const int rank = dOut->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - - // allocate memory on device. - dX->mutable_data(ctx.GetPlace()); - - if (dOut->numel() != 0) { - auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("LogSoftmaxGrad", - {*dOut, *Out}, - {*dX}, - {{"axis", std::vector{axis}}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(log_softmax, - ops::LogSoftmaxNPUKernel, - ops::LogSoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL(log_softmax_grad, - ops::LogSoftmaxGradNPUKernel, - ops::LogSoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc deleted file mode 100644 index 8ae050541fb230..00000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" - -namespace paddle { -namespace operators { - -constexpr int64_t kNoPadding = -1; - -template -class LookupTableV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); // int tensor - auto *output_t = ctx.Output("Out"); // float tensor - auto *table_t = ctx.Input("W"); - - auto *table_var = ctx.InputVar("W"); - PADDLE_ENFORCE_EQ( - table_var->IsType(), - true, - platform::errors::InvalidArgument("npu only accept phi::DenseTensor")); - output_t->mutable_data(ctx.GetPlace()); - - int64_t padding_idx = ctx.Attr("padding_idx"); - if (padding_idx == kNoPadding) { - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } else { - phi::DenseTensor tmp_table_t(table_t->type()); - tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); - - phi::DenseTensor index; - index.mutable_data({1, 1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&index, - static_cast(padding_idx)); - - auto updata_dim = phi::make_ddim({1, table_t->dims()[1]}); - phi::DenseTensor update; - update.mutable_data(updata_dim, ctx.GetPlace()); - FillNpuTensorWithConstant(&update, static_cast(0)); - update.Resize(updata_dim); - - NpuOpRunner update_runner; - update_runner.SetType("TensorScatterUpdate") - .AddInput(*table_t) - .AddInput(index) - .AddInput(update) - .AddOutput(tmp_table_t); - update_runner.Run(); - - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(tmp_table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) -#if (CANN_VERSION_CODE >= 503003) - .AddAttrs({{"batch_dims", 0}}) -#endif - .AddOutput(*output_t); - runner.Run(); - } - } -}; - -template -class LookupTableV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids_t = ctx.Input("Ids"); - auto *output_grad_t = - ctx.Input(framework::GradVarName("Out")); - auto *table_grad_t = - ctx.Output(framework::GradVarName("W")); - table_grad_t->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - int64_t padding_idx = ctx.Attr("padding_idx"); - - /* EmbeddingDenseGrad has bug on large shape, temporarily disable it. - - int embedding_dim = table_grad_t->dims()[1]; - if (embedding_dim % 32 == 0) { - // NOTE(pangyoki): The embedding_dim of phi::DenseTensor used in - // EmbeddingDenseGrad must be an integer multiple of 32. - int num_weights = table_grad_t->dims()[0]; - const auto &runner = - NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, - {*table_grad_t}, {{"num_weights", num_weights}, - {"padding_idx", -1}, - {"scale_grad_by_freq", false}}); - runner.Run(stream); - return; - } - */ - - const auto &runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); - runner_zeros.Run(stream); - - if (padding_idx == kNoPadding) { - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensor, but in cann 20.2+, it does inplace operation. - // Thus, the first input and output should be same tensor. 
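
(Editorial note on the gradient logic that the ScatterAdd / UnsortedSegmentSum calls below implement: the gradient of an embedding lookup is sparse, so the kernel first zero-fills the table gradient and then accumulates each row of dOut into the table row selected by the corresponding id. A standalone C++ sketch of that accumulation follows; the function and its zero-gradient handling of padding_idx are illustrative, not a line-for-line port of the NPU ops.)

#include <cstddef>
#include <cstdint>
#include <vector>

// table_grad[ids[r]][:] += out_grad[r][:] for every looked-up row r.
// Rows equal to padding_idx are skipped so the padding embedding keeps a
// zero gradient; pass a negative padding_idx (kNoPadding) to disable that.
void EmbeddingGradReference(const std::vector<int64_t>& ids,
                            const std::vector<float>& out_grad,  // [ids, dim]
                            int64_t padding_idx, int64_t num_rows, int64_t dim,
                            std::vector<float>* table_grad) {
  table_grad->assign(static_cast<std::size_t>(num_rows * dim), 0.0f);
  for (std::size_t r = 0; r < ids.size(); ++r) {
    if (ids[r] == padding_idx) continue;
    for (int64_t d = 0; d < dim; ++d) {
      (*table_grad)[ids[r] * dim + d] += out_grad[r * dim + d];
    }
  }
}

UnsortedSegmentSum computes the same per-row sums in one fused op, with the ids acting as segment indices and the table row count as the number of segments.
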
- const auto &runner_scatter = - NpuOpRunner("ScatterAdd", - {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, - {{"use_locking", true}}); - runner_scatter.Run(stream); - } else { - phi::DenseTensor casted_ids_t; - if (framework::TransToProtoVarType(ids_t->dtype()) != - framework::proto::VarType::INT32) { - casted_ids_t.mutable_data(ids_t->dims(), ctx.GetPlace()); - const auto &cast_runner = NpuOpRunner( - "Cast", {*ids_t}, {casted_ids_t}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(stream); - } else { - casted_ids_t.ShareDataWith(*ids_t); - } - auto table_grad_dims = table_grad_t->dims(); - - NpuOpRunner runner; - runner.SetType("UnsortedSegmentSum") - .AddInput(*output_grad_t) - .AddInput(casted_ids_t) - .AddInput(std::vector{table_grad_dims[0]}) - .AddOutput(*table_grad_t); - runner.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel, - ops::LookupTableV2NPUKernel); - -REGISTER_OP_NPU_KERNEL( - lookup_table_v2_grad, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel, - ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/masked_select_op_npu.cc b/paddle/fluid/operators/masked_select_op_npu.cc deleted file mode 100644 index 96fba4b968869c..00000000000000 --- a/paddle/fluid/operators/masked_select_op_npu.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MaskedSelectedNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto mask = ctx.Input("Mask"); - auto out = ctx.Output("Y"); - - auto input_dim = input->dims(); - auto mask_dim = mask->dims(); - PADDLE_ENFORCE_EQ( - input_dim, - mask_dim, - platform::errors::InvalidArgument( - "The dim size of input and mask in OP(masked_selected) " - "must be equal, but got input dim:(%ld), mask dim: " - "(%ld). 
Please check input " - "value.", - input_dim, - mask_dim)); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - out->Resize({out_size_vec[0]}); - out->mutable_data(ctx.GetPlace()); - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - // TopKV2 may be unstable - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - Tensor input_tmp; - input_tmp.ShareDataWith(*input); - input_tmp.Resize({input->numel()}); - const auto& gather_runner = NpuOpRunner( - "GatherV2D", {input_tmp, topkv2_out}, {*out}, {{"axis", 0}}); - gather_runner.Run(stream); - } - } -}; - -template -class MaskedSelectedGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto mask = ctx.Input("Mask"); - auto y_grad = ctx.Input(framework::GradVarName("Y")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - x_grad->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = - ctx.template device_context(); - auto stream = dev_ctx.stream(); - - Tensor mask_int32, out_size; - std::vector out_size_vec; - mask_int32.mutable_data(mask->dims(), ctx.GetPlace()); - out_size.mutable_data({1}, ctx.GetPlace()); - { - const auto& cast_runner = NpuOpRunner( - "Cast", - {*mask}, - {mask_int32}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::proto::VarType::INT32))}}); - cast_runner.Run(stream); - - mask_int32.Resize({mask_int32.numel()}); - NpuOpRunner sum_runner; - sum_runner.SetType("ReduceSum"); - sum_runner.AddInput(mask_int32); - sum_runner.AddInput(std::vector({0})); - sum_runner.AddOutput(out_size); - sum_runner.AddAttr("keep_dims", false); - sum_runner.Run(stream); - paddle::framework::TensorToVector(out_size, dev_ctx, &out_size_vec); - } - - Tensor topkv2_out, indices; - topkv2_out.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - indices.mutable_data({out_size_vec[0]}, ctx.GetPlace()); - { - NpuOpRunner topkv2_runner; - topkv2_runner.SetType("TopKV2") - .AddInput(mask_int32) - .AddInput(out_size) - .AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", false) - .AddAttr("dim", 0) - .AddAttr("largest", true) - .Run(stream); - - NpuOpRunner topkv2_runner2; - topkv2_runner2.SetType("TopKV2") - .AddInput(indices) - .AddInput(out_size) - 
.AddOutput(topkv2_out) - .AddOutput(indices) - .AddAttr("sorted", true) - .AddAttr("dim", 0) - .AddAttr("largest", false) - .Run(stream); - - topkv2_out.Resize({out_size_vec[0], 1}); - x_grad->Resize({x_grad->numel()}); - NpuOpRunner scatter_runner; - scatter_runner.SetType("ScatterNd"); - scatter_runner.AddInput(topkv2_out); - scatter_runner.AddInput(*y_grad); - scatter_runner.AddInput( - std::vector({static_cast(x_grad->numel())})); - scatter_runner.AddOutput(*x_grad); - scatter_runner.Run(stream); - x_grad->Resize(mask->dims()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(masked_select, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel, - ops::MaskedSelectedNPUKernel); -REGISTER_OP_NPU_KERNEL(masked_select_grad, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel, - ops::MaskedSelectedGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc deleted file mode 100644 index d49d9a319ccffa..00000000000000 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ /dev/null @@ -1,561 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void Mul(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); - runner_dx.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); - runner_dx.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void Dot(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("MatMul", - {X, Y}, - {Out_temp}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y, - const float alpha) { - Out->mutable_data(ctx.GetPlace()); - - if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { - const auto& runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - } else { - phi::DenseTensor Out_temp(Out->dtype()); - Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); - const auto& out_temp_runner = - NpuOpRunner("BatchMatMul", - {X, Y}, - {Out_temp}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - out_temp_runner.Run(stream); - - const auto& runner = - NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); - runner.Run(stream); - } -} - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const 
std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Dot(ctx, stream, *X, *Y, Out, alpha); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (transpose_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." 
- "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D( - ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, - stream, - x_temp_brd, - y_temp_brd, - Out, - transpose_x, - transpose_y, - alpha); - } -}; - -template -class MatMulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - bool transpose_x = ctx.Attr("transpose_X"); - bool transpose_y = ctx.Attr("transpose_Y"); - float alpha = static_cast(ctx.Attr("alpha")); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->dtype()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - Mul(ctx, stream, dout_temp, *Y, dX, alpha); - } - if (dY) { - Mul(ctx, stream, dout_temp, *X, dY, alpha); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); 
- if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (transpose_x) { - MatMul2D( - ctx, stream, y_temp, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (transpose_y) { - MatMul2D( - ctx, stream, dout_temp, x_temp, dY, true, transpose_x, alpha); - } else { - MatMul2D( - ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, alpha); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = transpose_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (transpose_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D( - ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, alpha); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (transpose_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->dtype()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->dtype()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (transpose_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - dX, - false, - !transpose_y, - alpha); - } - } else { - phi::DenseTensor dx_temp(X->dtype()); - 
dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (transpose_x) { - MatMulND(ctx, - stream, - y_temp_brd, - dout_temp, - &dx_temp, - transpose_y, - true, - alpha); - } else { - MatMulND(ctx, - stream, - dout_temp, - y_temp_brd, - &dx_temp, - false, - !transpose_y, - alpha); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (transpose_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - dY, - !transpose_x, - false, - alpha); - } - } else { - phi::DenseTensor dy_temp(Y->dtype()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (transpose_y) { - MatMulND(ctx, - stream, - dout_temp, - x_temp_brd, - &dy_temp, - true, - transpose_x, - alpha); - } else { - MatMulND(ctx, - stream, - x_temp_brd, - dout_temp, - &dy_temp, - !transpose_x, - false, - alpha); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - matmul, - ops::MatMulNPUKernel, - ops::MatMulNPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_grad, - ops::MatMulGradNPUKernel, - ops::MatMulGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc deleted file mode 100644 index 2a398fbb5499bf..00000000000000 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/matmul_v2_op.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -static void MatMul2D(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {X, Y}, - {*Out}, - {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); - runner.Run(stream); -} - -template -static void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("BatchMatMul", - {X, Y}, - {*Out}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); -} - -#if (CANN_VERSION_CODE < 504000) -template <> -void MatMulND(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const phi::DenseTensor& X, - const phi::DenseTensor& Y, - phi::DenseTensor* Out, - const bool trans_x, - const bool trans_y) { - Out->mutable_data(ctx.GetPlace()); - phi::DenseTensor x_fp32, y_fp32, out_fp32; - x_fp32.Resize(X.dims()); - y_fp32.Resize(Y.dims()); - out_fp32.Resize(Out->dims()); - x_fp32.mutable_data(ctx.GetPlace()); - y_fp32.mutable_data(ctx.GetPlace()); - out_fp32.mutable_data(ctx.GetPlace()); - - const auto& cast_x = - NpuOpRunner("Cast", - {X}, - {x_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(x_fp32.type())))}}); - cast_x.Run(stream); - const auto& cast_y = - NpuOpRunner("Cast", - {Y}, - {y_fp32}, - {{"dst_type", - static_cast(ConvertToNpuDtype( - framework::TransToProtoVarType(y_fp32.type())))}}); - cast_y.Run(stream); - - const auto& runner = NpuOpRunner("BatchMatMul", - {x_fp32, y_fp32}, - {out_fp32}, - {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); - runner.Run(stream); - - const auto& cast_out = NpuOpRunner( - "Cast", - {out_fp32}, - {*Out}, - {{"dst_type", - static_cast( - ConvertToNpuDtype(framework::TransToProtoVarType(Out->type())))}}); - cast_out.Run(stream); -} -#endif - -template -static void ReduceDims(const framework::ExecutionContext& ctx, - const aclrtStream& stream, - const std::vector& dims, - const std::vector& brd_dims, - const phi::DenseTensor& in, - phi::DenseTensor* out) { - std::vector axes; - int64_t size = brd_dims.size(); - int64_t diff = brd_dims.size() - dims.size(); - for (int64_t i = 0; i < size; ++i) { - if (i < diff) { - axes.push_back(i); - continue; - } - if (brd_dims[i] > dims[i - diff]) { - axes.push_back(i); - } - } - out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner( - "ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); -} - -template -class MatMulV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(Out->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = 
ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ( - X->numel(), - Y->numel(), - platform::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - X->numel(), - Y->numel())); - Out->Resize({1}); - Out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); - runner.Run(stream); - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - y_ndim = 2; - out_ndim += 1; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - platform::errors::InvalidArgument("Input(Y) has error dim." - "Y'dims[%d] must be equal to %d" - "But received Y'dims[%d] is %d", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (x_ndim == 2 && y_ndim == 2) { - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector vec_dim = {x_temp.numel() / K, K}; - x_temp.Resize(phi::make_ddim(vec_dim)); - MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); - } -}; - -template -class MatMulV2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dY = ctx.Output(framework::GradVarName("Y")); - const bool trans_x = ctx.Attr("trans_x"); - const bool trans_y = ctx.Attr("trans_y"); - - std::vector x_dims = phi::vectorize(X->dims()); - std::vector y_dims = phi::vectorize(Y->dims()); - std::vector out_dims = phi::vectorize(dOut->dims()); - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int out_ndim = out_dims.size(); - - auto stream = ctx.template device_context().stream(); - - // Case 1: [K] x [K] = [1] - if (x_ndim == 1 && y_ndim == 1) { - phi::DenseTensor dout_temp(dOut->type()); - dout_temp.Resize(X->dims()); - dout_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(*dOut) - .AddInput(std::move(x_dims)) - .AddOutput(dout_temp) - .Run(stream); - - if (dX) { - dX->mutable_data(ctx.GetPlace()); - const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); - runner_dx.Run(stream); - } - if (dY) { - dY->mutable_data(ctx.GetPlace()); - const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); - runner_dy.Run(stream); - } - return; - } - - // Resize dim 1 to 2 - phi::DenseTensor x_temp, y_temp, dout_temp; - x_temp.ShareDataWith(*X); - y_temp.ShareDataWith(*Y); - dout_temp.ShareDataWith(*dOut); - if (x_ndim == 1) { - x_dims.insert(x_dims.begin(), 1); - out_dims.insert(out_dims.end() - 1, 1); - x_temp.Resize(phi::make_ddim(x_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - x_ndim = 2; - out_ndim += 1; - } - if (y_ndim == 1) { - y_dims.push_back(1); - out_dims.push_back(1); - y_temp.Resize(phi::make_ddim(y_dims)); - dout_temp.Resize(phi::make_ddim(out_dims)); - y_ndim = 2; - out_ndim += 1; - } - - // Case 2: [M, K] x [K, N] = [M, N] - if (out_ndim == 2) { - if (dX) { - dX->Resize(phi::make_ddim(x_dims)); - if (trans_x) { - MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); - } else { - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - } - dX->Resize(X->dims()); - } - if (dY) { - dY->Resize(phi::make_ddim(y_dims)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); - } - dY->Resize(Y->dims()); - } - return; - } - - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - - // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false - // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] - if (trans_x == false && y_ndim == 2) { - std::vector x_vec_dim = {x_temp.numel() / K, K}; - dout_temp.Resize( - phi::make_ddim(std::vector{dout_temp.numel() / N, N})); - if (dX) { - dX->Resize(phi::make_ddim(x_vec_dim)); - MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); - dX->Resize(X->dims()); - } - if (dY) { - x_temp.Resize(phi::make_ddim(x_vec_dim)); - if (trans_y) { - MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); - } else { - MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); - } - } - return; - } - - // Case 4: [B, M, K] x [B, K, N] = [B, M, N] - std::vector x_broadcast_dims(out_ndim, 1); - std::vector y_broadcast_dims(out_ndim, 1); - std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); - std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); - std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); - std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - - phi::DenseTensor x_temp_brd(X->type()); - if (x_dims == x_broadcast_dims) { - x_temp_brd.ShareDataWith(*X); - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - } else { - x_temp_brd.Resize(phi::make_ddim(x_broadcast_dims)); - x_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(x_temp) - .AddInput(std::move(x_broadcast_dims)) - .AddOutput(x_temp_brd) - .Run(stream); - } - - phi::DenseTensor y_temp_brd(Y->type()); - if (y_dims == y_broadcast_dims) { - y_temp_brd.ShareDataWith(*Y); - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - } else { - y_temp_brd.Resize(phi::make_ddim(y_broadcast_dims)); - y_temp_brd.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(y_temp) - .AddInput(std::move(y_broadcast_dims)) - .AddOutput(y_temp_brd) - .Run(stream); - } - - if (dX) { - if (x_dims == x_broadcast_dims) { - if (trans_x) { - MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); - } else { - MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); - } - } else { - phi::DenseTensor dx_temp(X->type()); - dx_temp.Resize(phi::make_ddim(x_broadcast_dims)); - if (trans_x) { - MatMulND( - ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, true); - } else { - MatMulND( - ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, !trans_y); - } - ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); - } - } - if (dY) { - if (y_dims == y_broadcast_dims) { - if (trans_y) { - MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); - } else { - MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); - } - } else { - phi::DenseTensor dy_temp(Y->type()); - dy_temp.Resize(phi::make_ddim(y_broadcast_dims)); - if (trans_y) { - MatMulND( - ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, trans_x); - } else { - MatMulND( - ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, false); - } - ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL(matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mean_op_npu.cc 
b/paddle/fluid/operators/mean_op_npu.cc deleted file mode 100644 index 3df6a6a04d5413..00000000000000 --- a/paddle/fluid/operators/mean_op_npu.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class MeanNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector axes; - - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class MeanGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto stream = - context.template device_context() - .stream(); - - auto grad = context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - grad->numel(), - 1, - platform::errors::InvalidArgument( - "Mean Gradient Input phi::DenseTensor len should be 1. But " - "received Out@Grad's elements num is %d.", - grad->numel())); - - auto IG = context.Output(framework::GradVarName("X")); - IG->mutable_data(context.GetPlace()); - - // ones - phi::DenseTensor ones(grad->dtype()); - ones.mutable_data(IG->dims(), context.GetPlace()); - const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); - runner_ones.Run(stream); - - // means - phi::DenseTensor mean_tensor(grad->dtype()); - mean_tensor.Resize({1}); - mean_tensor.mutable_data(context.GetPlace()); - FillNpuTensorWithConstant( - &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); - - // means mul ones - phi::DenseTensor mean_ma(grad->dtype()); - mean_ma.Resize(IG->dims()); - mean_ma.mutable_data(context.GetPlace()); - const auto& runner_mul_1 = - NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); - runner_mul_1.Run(stream); - - // and mul grad - const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); - runner_mul_2.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - mean, - ops::MeanNPUKernel, - ops::MeanNPUKernel) - -REGISTER_OP_NPU_KERNEL( - mean_grad, - ops::MeanGradNPUKernel, - ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc deleted file mode 100644 index e60af8bd480ea8..00000000000000 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MeshgridNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto ins = context.MultiInput("X"); - auto outs = context.MultiOutput("Out"); - PADDLE_ENFORCE_EQ( - (ins.size() > 1) && (ins.size() < 7), - true, - platform::errors::InvalidArgument( - "Expected Tensor numbers between 2 and 6, but only received %d.", - ins.size())); - - int64_t size = ins.size(); - std::vector shape(size); - - for (int64_t i = 0; i < size; i++) { - switch (ins[i]->dims().size()) { - case 0: - shape[i] = 1; - break; - case 1: - shape[i] = ins[i]->dims()[0]; - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected scalar or 1D tensor in the tensor list but got tensor " - "%d: ", - i)); - } - } - - for (int64_t i = 0; i < size; i++) { - std::vector view_shape(size, 1); - view_shape[i] = shape[i]; - - framework::DDim out_dims_reshape = phi::make_ddim(view_shape); - phi::DenseTensor reshape_ins_tensor(ins[i]->dtype()); - reshape_ins_tensor.ShareDataWith(*ins[i]); - reshape_ins_tensor.Resize(out_dims_reshape); - - framework::DDim out_dims = phi::make_ddim(shape); - outs[i]->Resize(out_dims); - outs[i]->mutable_data(context.GetPlace()); - - auto stream = - context.template device_context() - .stream(); - NpuOpRunner runner; - runner.SetType("BroadcastTo") - .AddInput(reshape_ins_tensor) - .AddInput(std::move(shape)) - .AddOutput(*(outs[i])) - .Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - meshgrid, - paddle::operators::MeshgridNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::MeshgridNPUKernel, -#endif - paddle::operators::MeshgridNPUKernel, - paddle::operators::MeshgridNPUKernel); diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc deleted file mode 100644 index d8b713de96fff4..00000000000000 --- a/paddle/fluid/operators/mul_op_npu.cc +++ /dev/null @@ -1,274 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class MulNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("MatMul", - {*x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner.Run(stream); - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // reshape - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - out->mutable_data(ctx.GetPlace()); - // matmul - const auto& runner = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {*out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner.Run(stream); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("npu error: unsupported dims")); - } - // to do other - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only supports x_num_col_dims == 2, but got %d", - x_num_col_dims)); - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is inconsistent, - // (Broadcast) BatchMatMul NPU OP only supports FP16.
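// A minimal standalone sketch (plain C++, no Paddle/NPU APIs; all names
// here are illustrative, not part of this kernel) of the row-major
// flattening identity the surrounding code relies on: a [B, M, K] x [K, N]
// matmul can be computed as a single [B*M, K] x [K, N] matmul, because
// collapsing the leading dims does not change row-major storage.
#include <vector>
inline std::vector<float> matmul_flat(const std::vector<float>& x,  // B*M*K
                                      const std::vector<float>& y,  // K*N
                                      int bm, int k, int n) {
  std::vector<float> out(static_cast<size_t>(bm) * n, 0.0f);
  for (int i = 0; i < bm; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j)
        out[i * n + j] += x[i * k + p] * y[p * n + j];
  return out;  // reinterpret as [B, M, N]; the layout is unchanged
}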
- out->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*x, *y}, - {*out}, - {{"adj_x1", false}, {"adj_x2", false}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - - // matmul [6,4] , [4, 5] => [6, 5] - out->mutable_data(ctx.GetPlace()); - - Tensor tmp_out(x->type()); - tmp_out.ShareDataWith(*out); - tmp_out.Resize(phi::make_ddim({first_dim, y->dims()[1]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_x, *y}, - {tmp_out}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - runner_matmul.Run(stream); - } - } - } -}; - -template -class MulGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int x_num_col_dims = ctx.Attr("x_num_col_dims"); - int y_num_col_dims = ctx.Attr("y_num_col_dims"); - auto stream = - ctx.template device_context() - .stream(); - if (x_num_col_dims == 1 && y_num_col_dims == 1) { - if (x->dims().size() == 2 && y->dims().size() == 2) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", - {*dout, *y}, - {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); - } - - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {*x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } else if (x->dims().size() >= 3 && y->dims().size() == 2) { - // flatten => x.shape=[6, 4] - // matmul - if (dx) { - // matmul [2, 5] * [12, 5] => [2, 12] - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout->dims()[0], y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {*dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - - if (dy) { - // flatten - Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1]; - for (auto i = 2; i < x->dims().size(); i++) { - sec_dim *= x->dims()[i]; - } - int64_t first_dim = x->dims()[0]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, *dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); - } - } - } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] - PADDLE_ENFORCE_EQ(x_num_col_dims, - 2, - platform::errors::InvalidArgument( - "now only supports x_num_col_dims == 2, but got %d", - x_num_col_dims)); - // tmp_dout both used by dx and dy - Tensor tmp_dout(x->type()); - int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; - int64_t dout_sec_dim = dout->dims()[2]; - tmp_dout.ShareDataWith(*dout); - tmp_dout.Resize(phi::make_ddim({dout_first_dim, dout_sec_dim})); - - if (dx) { - // tmp_dout * y [2, 3, 5] * [4,5] => [2, 3, 4] - if
(framework::TransToProtoVarType(dout->dtype()) == - framework::proto::VarType::FP16 && - framework::TransToProtoVarType(y->dtype()) == - framework::proto::VarType::FP16) { - // NOTE: When the dim of the input and output shapes is inconsistent, - // (Broadcast) BatchMatMul NPU OP only supports FP16. - dx->mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("BatchMatMul", - {*dout, *y}, - {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } else { - dx->mutable_data(ctx.GetPlace()); - Tensor tmp_dx(x->type()); - tmp_dx.ShareDataWith(*dx); - tmp_dx.Resize(phi::make_ddim({dout_first_dim, y->dims()[0]})); - - const auto& runner_matmul = - NpuOpRunner("MatMul", - {tmp_dout, *y}, - {tmp_dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_matmul.Run(stream); - } - } - if (dy) { - // flatten x.shape [2,3,4] => [6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.ShareDataWith(*x); - tmp_x.Resize(phi::make_ddim({first_dim, sec_dim})); - // matmul [6,4] [6,5] => [4,5] - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", - {tmp_x, tmp_dout}, - {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - mul, - ops::MulNPUKernel, - ops::MulNPUKernel); -REGISTER_OP_NPU_KERNEL( - mul_grad, - ops::MulGradNPUKernel, - ops::MulGradNPUKernel); diff --git a/paddle/fluid/operators/multinomial_op_npu.cc b/paddle/fluid/operators/multinomial_op_npu.cc deleted file mode 100644 index 425b7c6738633d..00000000000000 --- a/paddle/fluid/operators/multinomial_op_npu.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in -// cmake/operators.cmake when Paddle supports -#if (CANN_VERSION_CODE >= 504000) - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class NPUMultinomialKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - out->mutable_data(place); - - const auto& runner = NpuOpRunner( - "MultinomialWithReplacementD", - {*x}, - {*out}, - {{"num_samples", num_samples}, {"replacement", replacement}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - multinomial, - ops::NPUMultinomialKernel, - ops::NPUMultinomialKernel) -#endif diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc deleted file mode 100644 index b839b3e8ec2e0f..00000000000000 --- a/paddle/fluid/operators/norm_op_npu.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -void CheckAxis(int axis, int rank) { - // check the axis is in [-rank, rank-1] - if (axis <= rank - 1 && axis >= -rank) return; - PADDLE_THROW(platform::errors::InvalidArgument( - "axis in norm operator must be between (%d) and (%d), " - "but got (%d).", - -rank, - rank - 1, - axis)); -} - -template -class NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - VLOG(4) << "Launch Norm Op Kernel on NPU."
<< std::endl; - auto *in_x = ctx.Input("X"); - auto *out_y = ctx.Output("Out"); - auto *out_norm = ctx.Output("Norm"); - out_y->mutable_data(ctx.GetPlace()); - out_norm->mutable_data(ctx.GetPlace()); - auto xdim = in_x->dims(); - float eps = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - CheckAxis(axis, xdim.size()); - if (axis < 0) axis = xdim.size() + axis; - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["axes"] = std::vector({axis}); - attr_input_norm["p"] = 2; - attr_input_norm["keepdim"] = true; - attr_input_norm["epsilon"] = eps; - const auto &runner = - NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - NpuOpRunner("Div", {*in_x, *out_norm}, {*out_y}, {}).Run(stream); - } -}; - -template -class NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - int axis = ctx.Attr("axis"); - - auto *x = ctx.Input("X"); - auto *y = ctx.Input("Out"); - auto *dy = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - auto xdim = x->dims(); - CheckAxis(axis, xdim.size()); - - auto place = ctx.GetPlace(); - - dx->mutable_data(place); - - framework::NPUAttributeMap attr_input_norm; - attr_input_norm["dim"] = std::vector({axis}); - attr_input_norm["eps"] = epsilon; - const auto &runner = - NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - norm, - ops::NormNPUKernel, - ops::NormNPUKernel) - -REGISTER_OP_NPU_KERNEL( - norm_grad, - ops::NormGradNPUKernel, - ops::NormGradNPUKernel); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc deleted file mode 100644 index e44f6286afa9ba..00000000000000 --- a/paddle/fluid/operators/one_hot_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/one_hot_op.h" - -namespace paddle { -namespace operators { - -template -class OneHotNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto in_dims = in->dims(); - framework::DDim out_dims(in_dims); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot, - ops::OneHotNPUKernel, - ops::OneHotNPUKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc deleted file mode 100644 index b213d3345d1f0c..00000000000000 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class OneHotV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int depth = ctx.Attr("depth"); - - if (ctx.HasInput("depth_tensor")) { - auto* depth_tensor = ctx.Input("depth_tensor"); - std::vector depth_data; - framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); - depth = depth_data[0]; - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = depth; - out->Resize(out_dims); - } - out->mutable_data(ctx.GetPlace()); - - float on_value = 1.0f, off_value = 0.0f; - if (framework::TransToProtoVarType(in->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(*in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } else { - phi::DenseTensor transformed_in; - transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); - const auto& cast_runner = NpuOpRunner( - "Cast", {*in}, {transformed_in}, {{"dst_type", ACL_INT32}}); - cast_runner.Run(dev_ctx.stream()); - NpuOpRunner runner; - runner.SetType("OneHot") - .AddInput(transformed_in) - .AddInput(std::vector({static_cast(depth)})) - .AddInput(std::vector({on_value})) - .AddInput(std::vector({off_value})) - .AddAttr("axis", -1) - .AddOutput(*out); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(one_hot_v2, - ops::OneHotV2NPUKernel, - ops::OneHotV2NPUKernel); diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc deleted file mode 100644 index c2d99fa42f2f8b..00000000000000 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class PnormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_x = ctx.Input("X"); - auto* out_norm = ctx.Output("Out"); - out_norm->mutable_data(ctx.GetPlace()); - - float porder = ctx.Attr("porder"); - int axis = ctx.Attr("axis"); - bool keepdim = ctx.Attr("keepdim"); - - auto xdim = in_x->dims(); - if (axis < 0) axis = xdim.size() + axis; - - auto stream = - ctx.template device_context() - .stream(); - - int p = 0; - bool combine_op = - !(porder == 0 || porder == INFINITY || porder == -INFINITY); - if (porder == INFINITY) { - p = INT_MAX; - } else if (porder == -INFINITY) { - p = INT_MIN; - } else { - p = static_cast(porder); - float t = 0; - float diff = abs(std::modf(porder, &t)); - if (diff < 1e-5) { - combine_op = false; - } - } - - if (!combine_op) { - const auto& runner = NpuOpRunner("LpNorm", - {*in_x}, - {*out_norm}, - {{"p", p}, - {"axes", std::vector({axis})}, - {"keep_dims", keepdim}}); - runner.Run(stream); - } else { - phi::DenseTensor tmp_x; - tmp_x.mutable_data(xdim, ctx.GetPlace()); - - const auto& power_runner1 = - NpuOpRunner("Power", - {*in_x}, - {tmp_x}, - {{"power", porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner1.Run(stream); - - const auto& reduce_runner = NpuOpRunner( - "ReduceSumD", - {tmp_x}, - {*out_norm}, - {{"axes", std::vector({axis})}, {"keep_dims", keepdim}}); - reduce_runner.Run(stream); - - const auto& power_runner2 = NpuOpRunner( - "Power", - {*out_norm}, - {*out_norm}, - {{"power", 1 / porder}, {"scale", 1.0f}, {"shift", 0.0f}}); - power_runner2.Run(stream); - } - } -}; - -template -class PnormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Out"); - auto* dy = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto place = ctx.GetPlace(); - dx->mutable_data(place); - - auto xdim = x->dims(); - float porder = ctx.Attr("porder"); - bool keepdim = ctx.Attr("keepdim"); - - int axis = ctx.Attr("axis"); - axis = axis < 0 ? 
xdim.size() + axis : axis; - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor y_share(y->type()); - phi::DenseTensor dy_share(dy->type()); - y_share.ShareDataWith(*y); - dy_share.ShareDataWith(*dy); - auto ydim = xdim; - if (!keepdim) { - ydim[axis] = 1; - } else { - ydim = y->dims(); - } - y_share.Resize(ydim); - dy_share.Resize(ydim); - - if (porder == 0) { - FillNpuTensorWithConstant(dx, static_cast(0)); - dx->Resize(xdim); - } else if (porder == INFINITY || porder == -INFINITY) { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor t_cond; - t_cond.mutable_data(xdim, place); - const auto& r_equal = - NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); - r_equal.Run(stream); - - phi::DenseTensor t_zero; - t_zero.mutable_data({1}, place); - FillNpuTensorWithConstant(&t_zero, static_cast(0)); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - const auto& r_mul = NpuOpRunner("Mul", {x_sign, dy_share}, {*dx}, {}); - r_mul.Run(stream); - - const auto& r_sel = - NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); - r_sel.Run(stream); - } else { - phi::DenseTensor x_abs; - x_abs.mutable_data(xdim, place); - const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); - r_abs.Run(stream); - - phi::DenseTensor x_sign; - x_sign.mutable_data(xdim, place); - const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); - r_sign.Run(stream); - - phi::DenseTensor y_pow; - y_pow.mutable_data(ydim, place); - if (porder >= 1) { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {x_abs, y_pow}, {*dx}, {}); - r_div.Run(stream); - } else { - const auto& r_pow1 = NpuOpRunner( - "Power", - {x_abs}, - {x_abs}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow1.Run(stream); - - const auto& r_pow2 = NpuOpRunner( - "Power", - {y_share}, - {y_pow}, - {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); - r_pow2.Run(stream); - - const auto& r_div = NpuOpRunner("DivNoNan", {y_pow, x_abs}, {*dx}, {}); - r_div.Run(stream); - } - - const auto& r_mul1 = NpuOpRunner("Mul", {*dx, x_sign}, {*dx}, {}); - r_mul1.Run(stream); - - const auto& r_mul2 = NpuOpRunner("Mul", {*dx, dy_share}, {*dx}, {}); - r_mul2.Run(stream); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - p_norm, - ops::PnormNPUKernel, - ops::PnormNPUKernel); - -REGISTER_OP_NPU_KERNEL( - p_norm_grad, - ops::PnormGradNPUKernel, - ops::PnormGradNPUKernel); diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc deleted file mode 100644 index 0f45d0b51c8373..00000000000000 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -static inline std::vector GetPaddings( - const framework::ExecutionContext& context) { - std::vector paddings(6); - auto* paddings_t = context.Input("Paddings"); - if (paddings_t) { - paddle::framework::TensorToVector( - *paddings_t, context.device_context(), &paddings); - } else { - auto pads = context.Attr>("paddings"); - std::copy(pads.begin(), pads.end(), paddings.data()); - } - return paddings; -} - -template -class Pad3dNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto in_dims = x->dims(); - - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - float value = context.Attr("value"); - auto data_format = context.Attr("data_format"); - - auto* out = context.Output("Out"); - - PADDLE_ENFORCE_LT(abs(value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support constant_values=0 right now," - "but received constant_value is %f .", - value)); - - PADDLE_ENFORCE_EQ(mode, - "constant", - platform::errors::Unimplemented( - "Ascend npu only support mode=constant right now," - "but received mode is %s .", - mode)); - - std::vector paddings( - {0, 0, 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1]}); - if (data_format == "NCDHW") { - out->Resize({in_dims[0], - in_dims[1], - in_dims[2] + pads[4] + pads[5], - in_dims[3] + pads[2] + pads[3], - in_dims[4] + pads[0] + pads[1]}); - } else { - out->Resize({in_dims[0], - in_dims[1] + pads[4] + pads[5], - in_dims[2] + pads[2] + pads[3], - in_dims[3] + pads[0] + pads[1], - in_dims[4]}); - paddings = { - 0, 0, pads[4], pads[5], pads[2], pads[3], pads[0], pads[1], 0, 0}; - } - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("PadV3") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddInput( - std::vector({0})) // npu only support constant_value=0 now - .AddOutput(*out) - .AddAttr("mode", mode); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class Pad3dGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector pads = GetPaddings(context); - auto mode = context.Attr("mode"); - auto data_format = context.Attr("data_format"); - - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_in = context.Output(framework::GradVarName("X")); - auto d_in_dims = d_in->dims(); - d_in->mutable_data(context.GetPlace()); - - const int pad_left = pads[0]; - const int pad_top = pads[2]; - const int pad_front = pads[4]; - - auto stream = - context.template device_context() - .stream(); - - std::vector size( - {d_in_dims[0], d_in_dims[1], d_in_dims[2], d_in_dims[3], d_in_dims[4]}); - if (mode == "constant") { // this method can be only used for constant mode - std::vector offsets({0, 0, pad_front, pad_top, pad_left}); - if (data_format == "NDHWC") { - offsets = {0, pad_front, 
pad_top, pad_left, 0}; - } - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_in}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad3d, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel, - ops::Pad3dNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad3d_grad, - ops::Pad3dNPUKernel, - ops::Pad3dGradNPUKernel); diff --git a/paddle/fluid/operators/pad_op_npu.cc b/paddle/fluid/operators/pad_op_npu.cc deleted file mode 100644 index 48c2254b1ec91e..00000000000000 --- a/paddle/fluid/operators/pad_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class PadNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - auto paddings = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - - PADDLE_ENFORCE_LT(abs(pad_value), - 1e-5, - platform::errors::Unimplemented( - "Ascend npu only support pad_value=0 right now," - "but received pad_value is %f .", - pad_value)); - - out->mutable_data(context.GetPlace()); - - NpuOpRunner runner; - runner.SetType("Pad") - .AddInput(*x) - .AddInput(std::move(paddings)) - .AddOutput(*out); - - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class PadGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* d_out = - context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - auto paddings = context.Attr>("paddings"); - - d_x->mutable_data(context.GetPlace()); - - auto d_x_dims = d_x->dims(); - auto size = phi::vectorize(d_x_dims); - std::vector offsets(0); - int i = 0; - for (auto iter = paddings.begin(); iter < paddings.end(); ++iter, ++i) { - if (i % 2 == 0) { - offsets.push_back(*iter); - } - } - - auto stream = - context.template device_context() - .stream(); - - const auto& runner = NpuOpRunner( - "SliceD", {*d_out}, {*d_x}, {{"offsets", offsets}, {"size", size}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(pad, - ops::PadNPUKernel, - ops::PadNPUKernel, - ops::PadNPUKernel); - -REGISTER_OP_NPU_KERNEL(pad_grad, - ops::PadGradNPUKernel, - ops::PadGradNPUKernel); diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc deleted file mode 100644 index e14c55a63642a1..00000000000000 --- a/paddle/fluid/operators/pool_op_npu.cc +++ /dev/null @@ -1,334 +0,0 @@ 
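// Hedged sketch of the pad/slice duality the PadGrad and Pad3dGrad kernels
// above rely on: for constant (zero) padding, the input gradient is simply
// the output gradient sliced at the left-pad offsets with the original
// extent. Plain C++, 1-D case; the function name is illustrative only.
#include <vector>
inline std::vector<float> pad_grad_1d(const std::vector<float>& dout,
                                      int pad_left, int x_size) {
  // Gradient flows only through the unpadded region; pad cells contribute 0.
  return std::vector<float>(dout.begin() + pad_left,
                            dout.begin() + pad_left + x_size);
}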
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/pooling.h" - -namespace paddle { -namespace operators { - -template -class NPUPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - Tensor *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - - bool global_pooling = ctx.Attr("global_pooling"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - - Tensor in_x_tensor, out_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims = phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive) { - std::string pooling_mode = "AdaptiveAvgPool2d"; - if (pooling_type == "max") { - pooling_mode = "AdaptiveMaxPool2d"; - } - - // AdaptiveAvgPool2d only support NCHW - Tensor transformed_input, transformed_output; - if (pooling_type == "avg" && channel_last) { - transformed_input.mutable_data( - phi::make_dim( - in_x_dims[0], in_x_dims[3], in_x_dims[1], in_x_dims[2]), - ctx.GetPlace()); 
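// Illustrative plain-C++ equivalent of the NHWC -> NCHW relayout that the
// "TransData" runner below is assumed to perform (the adaptive average pool
// path on this backend only accepts NCHW). Not an NPU call; names are
// illustrative only.
#include <vector>
inline std::vector<float> nhwc_to_nchw(const std::vector<float>& in,
                                       int n, int h, int w, int c) {
  std::vector<float> out(in.size());
  for (int b = 0; b < n; ++b)
    for (int y = 0; y < h; ++y)
      for (int x = 0; x < w; ++x)
        for (int ch = 0; ch < c; ++ch)
          out[((b * c + ch) * h + y) * w + x] =    // NCHW offset
              in[((b * h + y) * w + x) * c + ch];  // NHWC offset
  return out;
}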
- transformed_output.mutable_data( - phi::make_dim(out_dims[0], out_dims[3], out_dims[1], out_dims[2]), - ctx.GetPlace()); - - const auto &trans_runner = - NpuOpRunner("TransData", - {in_x_tensor}, - {transformed_input}, - {{"src_format", std::string("NHWC")}, - {"dst_format", std::string("NCHW")}}); - trans_runner.Run(dev_ctx.stream()); - } else { - transformed_input.ShareDataWith(in_x_tensor); - transformed_output.ShareDataWith(out_tensor); - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {transformed_input}, - {transformed_output}, - {{"output_size", phi::vectorize(out_data_dims)}}); - runner.Run(dev_ctx.stream()); - - if (pooling_type == "avg" && channel_last) { - const auto &trans_runner = - NpuOpRunner("TransData", - {transformed_output}, - {out_tensor}, - {{"src_format", std::string("NCHW")}, - {"dst_format", std::string("NHWC")}}); - trans_runner.Run(dev_ctx.stream()); - } - } else { - std::string pooling_mode = "AvgPoolV2"; - if (pooling_type == "max") { - PADDLE_ENFORCE_EQ( - exclusive, - true, - platform::errors::InvalidArgument( - "MaxPool only supports exclusive=true, but got false")); - pooling_mode = "MaxPoolV3"; - } - - const auto &runner = - NpuOpRunner(pooling_mode, - {in_x_tensor}, - {out_tensor}, - {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}); - runner.Run(dev_ctx.stream()); - } - } -}; - -template -class NPUPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto &dev_ctx = ctx.template device_context(); - const Tensor *in_x = ctx.Input("X"); - const Tensor *out = ctx.Input("Out"); - const Tensor *out_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *in_x_grad = - ctx.Output(framework::GradVarName("X")); - in_x_grad->mutable_data(ctx.GetPlace()); - - std::string pooling_type = ctx.Attr("pooling_type"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - bool ceil_mode = ctx.Attr("ceil_mode"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - const bool channel_last = data_format == "NHWC"; - - // update paddings - auto in_x_dims = in_x->dims(); - auto out_dims = out->dims(); - framework::DDim data_dims; - framework::DDim out_data_dims; - std::vector ksize_vec(4, 1); - std::vector strides_vec(4, 1); - - Tensor in_x_tensor, out_tensor, out_grad_tensor, in_x_grad_tensor; - in_x_tensor.ShareDataWith(*in_x); - out_tensor.ShareDataWith(*out); - out_grad_tensor.ShareDataWith(*out_grad); - in_x_grad_tensor.ShareDataWith(*in_x_grad); - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - out_data_dims = phi::slice_ddim(out_dims, 1, out_dims.size() - 1); - ksize_vec[1] = ksize[0]; - ksize_vec[2] = ksize[1]; - strides_vec[1] = strides[0]; - strides_vec[2] = strides[1]; - in_x_tensor.set_layout(DataLayout::kNHWC); - out_tensor.set_layout(DataLayout::kNHWC); - out_grad_tensor.set_layout(DataLayout::kNHWC); - in_x_grad_tensor.set_layout(DataLayout::kNHWC); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - out_data_dims =
phi::slice_ddim(out_dims, 2, out_dims.size()); - ksize_vec[2] = ksize[0]; - ksize_vec[3] = ksize[1]; - strides_vec[2] = strides[0]; - strides_vec[3] = strides[1]; - } - phi::funcs::UpdatePadding(&paddings, - global_pooling, - adaptive, - padding_algorithm, - data_dims, - strides, - ksize); -#if (CANN_VERSION_CODE < 512000) - PADDLE_ENFORCE_LT( - std::max(paddings[0], paddings[1]), - ksize[0], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[0], pads[1]) is %d.", - ksize[0], - std::max(paddings[0], paddings[1]))); - PADDLE_ENFORCE_LT( - std::max(paddings[2], paddings[3]), - ksize[1], - platform::errors::InvalidArgument( - "Paddings should be less than %d, but max(pads[2], pads[3]) is %d.", - ksize[1], - std::max(paddings[2], paddings[3]))); -#endif - if (adaptive || (global_pooling && pooling_type == "max")) { - PADDLE_ENFORCE_EQ(data_dims[0] % out_data_dims[0], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - PADDLE_ENFORCE_EQ(data_dims[1] % out_data_dims[1], - 0, - platform::errors::InvalidArgument( - "When adaptive = True, H and W must be divisible, " - "but input dims is %s, output dims is %s", - data_dims, - out_data_dims)); - if (channel_last) { - strides_vec[1] = data_dims[0] / out_data_dims[0]; - strides_vec[2] = data_dims[1] / out_data_dims[1]; - ksize_vec[1] = strides_vec[1]; - ksize_vec[2] = strides_vec[2]; - } else { - strides_vec[2] = data_dims[0] / out_data_dims[0]; - strides_vec[3] = data_dims[1] / out_data_dims[1]; - ksize_vec[2] = strides_vec[2]; - ksize_vec[3] = strides_vec[3]; - } - } - - NPUAttributeMap attrs = {{"ksize", ksize_vec}, - {"strides", strides_vec}, - {"padding_mode", std::string("CALCULATED")}, - {"pads", paddings}, - {"data_format", data_format}, - {"global_pooling", global_pooling}, - {"ceil_mode", ceil_mode}, - {"exclusive", exclusive}}; - - if (pooling_type == "max") { - if (global_pooling) { - for (auto &s : strides_vec) { - s = 1; - } - PADDLE_ENFORCE_LT(std::max(data_dims[0], data_dims[1]), - 255, - platform::errors::InvalidArgument( - "MaxPoolGrad H, W must be less than 255 when " - "global_pooling = True, but got %s", - data_dims)); - attrs["global_pooling"] = false; - } - - const auto &runner = - NpuOpRunner("MaxPoolV3Grad", - {in_x_tensor, out_tensor, out_grad_tensor}, - {in_x_grad_tensor}, - attrs); // 0: floor, 1: ceil - runner.Run(dev_ctx.stream()); - } else if (pooling_type == "avg") { - PADDLE_ENFORCE(strides[0] == strides[1], - platform::errors::InvalidArgument( - "AvgPoolGrad does not support asymmetric strides,
but " - "strides = (%d, %d)", - strides[0], - strides[1])); - - NpuOpRunner runner; - runner.SetType("AvgPoolV2Grad"); - runner.AddInput(phi::vectorize(in_x->dims())); - runner.AddInput(out_grad_tensor); - runner.AddOutput(in_x_grad_tensor); - runner.AddAttrs(attrs); - runner.Run(dev_ctx.stream()); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(pool2d, - ops::NPUPoolOpKernel, - ops::NPUPoolOpKernel); -REGISTER_OP_NPU_KERNEL(pool2d_grad, - ops::NPUPoolGradOpKernel, - ops::NPUPoolGradOpKernel); diff --git a/paddle/fluid/operators/randperm_op_npu.cc b/paddle/fluid/operators/randperm_op_npu.cc deleted file mode 100644 index fd03ce027bda57..00000000000000 --- a/paddle/fluid/operators/randperm_op_npu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/randperm_op.h" - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_NPU_KERNEL( - randperm, kernel, kernel, kernel, kernel); diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc deleted file mode 100644 index b2266608d7dca3..00000000000000 --- a/paddle/fluid/operators/range_op_npu.cc +++ /dev/null @@ -1,81 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/range_op.h" - -namespace paddle { -namespace operators { - -template -class RangeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* end_t = context.Input("End"); - auto* step_t = context.Input("Step"); - auto* out = context.Output("Out"); - - phi::DenseTensor n; - framework::TensorCopy( - *start_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T start = n.data()[0]; - framework::TensorCopy( - *end_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T end = n.data()[0]; - framework::TensorCopy( - *step_t, - platform::CPUPlace(), - context.template device_context(), - &n); - context.template device_context() - .Wait(); - T step = n.data()[0]; - - int64_t size = 0; - GetSize(start, end, step, &size); - - out->Resize(phi::make_ddim({size})); - out->mutable_data(context.GetPlace()); - - std::vector odata; - T value = start; - for (int64_t i = 0; i < size; ++i) { - odata.push_back(value); - value += step; - } - - framework::TensorFromVector(odata, context.device_context(), out); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(range, - paddle::operators::RangeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::RangeNPUKernel, -#endif - paddle::operators::RangeNPUKernel, - paddle::operators::RangeNPUKernel) diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc deleted file mode 100644 index 068d5d6be12cd3..00000000000000 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(range); -USE_OP_DEVICE_KERNEL(range, NPU); - -template -void Compare(f::Scope* scope, - const p::DeviceContext& ctx, - std::string op_type) { - // init - auto start = scope->Var("Start"); - auto tensor_start = start->GetMutable(); - std::vector init_start; - init_start.push_back(static_cast(1)); - paddle::framework::TensorFromVector(init_start, ctx, tensor_start); - tensor_start->Resize({1}); - - auto end = scope->Var("End"); - auto tensor_end = end->GetMutable(); - std::vector init_end; - init_end.push_back(static_cast(10)); - paddle::framework::TensorFromVector(init_end, ctx, tensor_end); - tensor_end->Resize({1}); - - auto step = scope->Var("Step"); - auto tensor_step = step->GetMutable(); - std::vector init_step; - init_step.push_back(static_cast(2)); - paddle::framework::TensorFromVector(init_step, ctx, tensor_step); - tensor_step->Resize({1}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - // run - auto op = f::OpRegistry::CreateOp( - op_type, - {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, - {{"Out", {"Out"}}}, - {}); - - op->Run(*scope, place); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - - EXPECT_EQ(static_cast(out_vec.size()), static_cast(5)); - EXPECT_EQ(static_cast(out_vec[0]), static_cast(1.0)); - EXPECT_EQ(static_cast(out_vec[1]), static_cast(3.0)); - EXPECT_EQ(static_cast(out_vec[2]), static_cast(5.0)); - EXPECT_EQ(static_cast(out_vec[3]), static_cast(7.0)); - EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); -} - -TEST(range, NPU) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx, "range"); -} diff --git a/paddle/fluid/operators/reshape_op_npu.cc b/paddle/fluid/operators/reshape_op_npu.cc deleted file mode 100644 index 2d4497a19e77bb..00000000000000 --- a/paddle/fluid/operators/reshape_op_npu.cc +++ /dev/null @@ -1,167 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/tensor_utils.h" - -namespace paddle { -namespace operators { - -template -class Reshape2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto stream = - ctx.template device_context() - .stream(); - auto place = ctx.GetPlace(); - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - std::vector target_shape_vector; - auto shape_tensor_vector = ctx.MultiInput("ShapeTensor"); - if (shape_tensor_vector.size() > 0) { - for (auto* shape_tensor : shape_tensor_vector) { - PADDLE_ENFORCE_EQ( - shape_tensor->dims().size(), - 1, - platform::errors::InvalidArgument( - "If the element type of 'shape' in Reshape Op is Tensor, " - "the element's shape must be [1]. But received the element's " - "shape is [%d]", - shape_tensor->dims().size())); - - target_shape_vector.push_back( - phi::GetVectorFromTensor(shape_tensor)[0]); - } - } else { - auto* shape_tensor = ctx.HasInput("Shape") - ? ctx.Input("Shape") - : nullptr; - if (shape_tensor) { - target_shape_vector = phi::GetVectorFromTensor(shape_tensor); - } else { - target_shape_vector = ctx.Attr>("shape"); - PADDLE_ENFORCE_GT( - target_shape_vector.size(), - 0, - platform::errors::InvalidArgument( - "The length of shape attribute should be larger than 0 when " - "input ShapeTensor and Shape are empty!")); - } - } - - int num_negative = - std::count(target_shape_vector.begin(), target_shape_vector.end(), -1); - PADDLE_ENFORCE_LE( - num_negative, - 1, - platform::errors::InvalidArgument( - "The max number of -1 in shape attribute or shape tensor is 1 " - "but received %d.", - num_negative)); - auto it_zero = - std::find(target_shape_vector.begin(), target_shape_vector.end(), 0); - if (it_zero != target_shape_vector.end()) { - int x_rank = x->dims().size(); - for (size_t i = 0; i < target_shape_vector.size(); i++) { - if (target_shape_vector[i] == 0) { - PADDLE_ENFORCE_LT( - i, - x_rank, - platform::errors::InvalidArgument( - "The index of 0 in shape attribute or shape tensor", - "should be less than input dim size, ", - "but the index is %d and input dim size is %d", - i, - x_rank)); - target_shape_vector[i] = x->dims().at(i); - } - } - } - - auto it = - std::find(target_shape_vector.begin(), target_shape_vector.end(), -1); - if (it != target_shape_vector.end()) { - auto ddim_out_vec = phi::vectorize(x->dims()); - int ddim_out_product = std::accumulate( - ddim_out_vec.begin(), ddim_out_vec.end(), 1, std::multiplies()); - int reshape_out_product = std::accumulate(target_shape_vector.begin(), - target_shape_vector.end(), - -1, - std::multiplies()); - int index = std::distance(target_shape_vector.begin(), it); - target_shape_vector[index] = ddim_out_product / reshape_out_product; - } - - auto out_dims = phi::make_ddim(target_shape_vector); - out->mutable_data(out_dims, place); - - NpuOpRunner runner; - // the shape input must be on the host side - runner.SetType("Reshape") - .AddInput(*x) - .AddInput(std::vector(target_shape_vector)) - .AddOutput(*out) - .AddAttr("axis", 0) - .AddAttr("num_axes", -1); - runner.Run(stream); - } -}; - -template -class Reshape2GradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto in_dims = d_x->dims(); - - d_x->mutable_data(ctx.GetPlace(), d_out->type()); 
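// Hedged sketch (plain C++, illustrative names only) of the target-shape
// inference done in Reshape2NPUKernel above: an entry of 0 copies the
// input's extent at that index, and a single -1 absorbs whatever element
// count remains.
#include <cstdint>
#include <vector>
inline std::vector<int64_t> infer_shape(std::vector<int64_t> target,
                                        const std::vector<int64_t>& x_dims) {
  int64_t numel = 1;
  for (int64_t d : x_dims) numel *= d;
  int64_t known = 1;
  int neg = -1;
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) target[i] = x_dims[i];  // keep the input dim
    if (target[i] == -1) {
      neg = static_cast<int>(i);  // at most one -1 is allowed
    } else {
      known *= target[i];
    }
  }
  if (neg >= 0) target[neg] = numel / known;  // absorb remaining elements
  return target;
}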
-    framework::TensorCopy(
-        *d_out,
-        ctx.GetPlace(),
-        ctx.template device_context<platform::NPUDeviceContext>(),
-        d_x);
-    d_x->Resize(in_dims);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    reshape2,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel,
-    ops::Reshape2NPUKernel);
-REGISTER_OP_NPU_KERNEL(
-    reshape2_grad,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel,
-    ops::Reshape2GradNPUKernel);
diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc
deleted file mode 100644
index 7d15dc2a46558c..00000000000000
--- a/paddle/fluid/operators/roi_align_op_npu.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ROIAlignNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* X = ctx.Input<phi::DenseTensor>("X");              // (B,C,H,W)
-    auto* ROIs = ctx.Input<phi::DenseTensor>("ROIs");        // (N,4)
-    auto* ROIsNum = ctx.Input<phi::DenseTensor>("RoisNum");  // [0 1 1 2 2 2]
-    auto* Out = ctx.Output<phi::DenseTensor>("Out");
-    Out->mutable_data<T>(ctx.GetPlace());
-
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto sample_num = ctx.Attr<int>("sampling_ratio");
-    auto aligned = ctx.Attr<bool>("aligned");
-    auto roi_end_mode = 0;
-    PADDLE_ENFORCE_EQ(
-        aligned,
-        false,
-        platform::errors::InvalidArgument(
-            "ROIAlignNPU only supports the Aligned attribute being False"));
-
-    framework::NPUAttributeMap attr_roi = {{"spatial_scale", spatial_scale},
-                                           {"pooled_height", pooled_height},
-                                           {"pooled_width", pooled_width},
-                                           {"sample_num", sample_num},
-                                           {"roi_end_mode", roi_end_mode}};
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    // Combine *ROIsNum with ROIs to get the new ROIs
-    // Change RoisNum's datatype & resize
-    int dtype =
-        static_cast<int>(ConvertToNpuDtype(framework::proto::VarType::FP32));
-    framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}};
-    phi::DenseTensor ROIsNum_fp(ROIs->dtype());
-    ROIsNum_fp.Resize(phi::make_ddim({ROIs->dims()[0], 1}));
-    ROIsNum_fp.mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner_c =
-        NpuOpRunner("Cast", {*ROIsNum}, {ROIsNum_fp}, attr_cast);
-    runner_c.Run(stream);
-
-    // concatenate to make (N, 5)
-    std::vector<phi::DenseTensor> x_list;
-    x_list.push_back(ROIsNum_fp);
-    x_list.push_back(*ROIs);
-    auto axis = 1;
-    // output of the concatenation
-    phi::DenseTensor ROIs_N5(ROIs->dtype());
-    ROIs_N5.Resize(phi::make_ddim({ROIs->dims()[0], 5}));
-    ROIs_N5.mutable_data<T>(ctx.GetPlace());
-
-    // attributes of the concatenation
-    auto EleNum = 2;
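// [Editorial sketch] The ConcatD call below packs the (N, 1) batch-index
// column and the (N, 4) box tensor into the (N, 5) layout that Ascend's
// ROIAlign expects: [batch_idx, x1, y1, x2, y2] per row. A host-side
// illustration on plain float buffers (the helper name and flat buffer
// layout are assumptions for the sketch only):
auto pack_rois_sketch = [](const std::vector<float>& batch_idx,  // N values
                           const std::vector<float>& rois_xyxy,  // N * 4
                           std::vector<float>* rois_n5) {
  const size_t n = batch_idx.size();
  rois_n5->resize(n * 5);
  for (size_t i = 0; i < n; ++i) {
    (*rois_n5)[i * 5] = batch_idx[i];  // column 0: float-cast batch index
    for (size_t j = 0; j < 4; ++j) {   // columns 1..4: x1, y1, x2, y2
      (*rois_n5)[i * 5 + 1 + j] = rois_xyxy[i * 4 + j];
    }
  }
};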
-    framework::NPUAttributeMap attr_concat = {{"N", EleNum},
-                                              {"concat_dim", axis}};
-
-    NpuOpRunner runner0;
-    runner0.SetType("ConcatD")
-        .AddInputs(x_list)
-        .AddOutput(ROIs_N5)
-        .AddInputNames({"x0", "x1"})
-        .AddAttrs(attr_concat);
-    runner0.Run(stream);
-
-    const auto& runner =
-        NpuOpRunner("ROIAlign", {*X, ROIs_N5}, {*Out}, attr_roi);
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class ROIAlignNPUGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in = ctx.Input<phi::DenseTensor>("X");
-    auto* rois = ctx.Input<phi::DenseTensor>("ROIs");
-    auto* out_grad =
-        ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-    auto* in_grad = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-
-    auto pooled_height = ctx.Attr<int>("pooled_height");
-    auto pooled_width = ctx.Attr<int>("pooled_width");
-    auto spatial_scale = ctx.Attr<float>("spatial_scale");
-    auto sample_num = ctx.Attr<int>("sampling_ratio");
-    auto in_dims = in->dims();
-    auto aligned = ctx.Attr<bool>("aligned");
-
-    int rois_num = rois->dims()[0];
-
-    auto place = ctx.GetPlace();
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    if (!in_grad) {
-      return;
-    }
-    in_grad->mutable_data<T>(place);
-
-    PADDLE_ENFORCE_EQ(
-        aligned,
-        false,
-        platform::errors::InvalidArgument(
-            "ROIAlignGradNPU only supports the Aligned attribute being "
-            "False"));
-    PADDLE_ENFORCE_EQ(
-        ctx.HasInput("RoisNum"),
-        true,
-        platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp "
-                                   "is not found while using NPU."));
-    PADDLE_ENFORCE_EQ(
-        framework::TransToProtoVarType(rois->dtype()),
-        framework::proto::VarType::FP32,
-        platform::errors::InvalidArgument(
-            "ROIAlignGradNPU only supports a ROIs type equal to FP32."));
-
-    // Cast RoisNum to an fp32 tensor
-    auto* RoisNum = ctx.Input<phi::DenseTensor>("RoisNum");
-    phi::DenseTensor ROIs_N5;
-    ROIs_N5.mutable_data<float>({rois_num, 5}, place);
-    phi::DenseTensor ROIsNum_fp;
-    ROIsNum_fp.mutable_data<float>(RoisNum->dims(),
-                                   place);  // shape = [rois_num]
-    int nputype_fp32 =
-        static_cast<int>(ConvertToNpuDtype(framework::proto::VarType::FP32));
-    const auto& runner_cast = NpuOpRunner(
-        "Cast", {*RoisNum}, {ROIsNum_fp}, {{"dst_type", nputype_fp32}});
-    runner_cast.Run(stream);
-    ROIsNum_fp.Resize({rois_num, 1});
-
-    // Combine *ROIsNum with ROIs to get the new ROIs
-    std::vector<phi::DenseTensor> x_list;
-    x_list.push_back(ROIsNum_fp);
-    x_list.push_back(*rois);
-    const auto& runner_concat = NpuOpRunner(
-        "ConcatD", {x_list}, {ROIs_N5}, {{"N", 2}, {"concat_dim", 1}});
-    runner_concat.Run(stream);
-
-    // If the CANN version is below 5.0.4, rois[:, 3:5] must subtract 1
-    // before calling the Ascend grad function, so that the result matches
-    // the CPU gradient.
-#if (CANN_VERSION_CODE < 504000)
-    std::vector<float> vec_dlt = {0, 0, 0, -1.0f, -1.0f};
-    phi::DenseTensor tsr_dlt;
-    tsr_dlt.mutable_data<float>({5}, place);
-    framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt);
-    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
-    const auto& runner_add =
-        NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {});
-    runner_add.Run(stream);
-#endif
-
-    // Call the Ascend ROIAlignGrad function
-    int roi_end_mode = 0;
-    const auto& runner_roi_align_grad =
-        NpuOpRunner("ROIAlignGrad",
-                    {*out_grad, ROIs_N5},
-                    {*in_grad},
-                    {{"xdiff_shape", phi::vectorize<int>(in_dims)},
-                     {"pooled_width", pooled_width},
-                     {"pooled_height", pooled_height},
-                     {"spatial_scale", spatial_scale},
-                     {"sample_num", sample_num},
-                     {"roi_end_mode", roi_end_mode}});
-    runner_roi_align_grad.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
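// [Editorial sketch] The grad kernel above shifts each ROI's bottom-right
// corner by -1 (via AddV2 with {0, 0, 0, -1, -1}) when CANN_VERSION_CODE <
// 504000, so that Ascend's ROIAlignGrad matches the CPU gradient. The same
// adjustment on a host-side (N, 5) ROI buffer would look like this
// (hypothetical helper, not part of the original file):
inline void AdjustRoisForOldCannSketch(float* rois_n5, int n) {
  for (int i = 0; i < n; ++i) {
    rois_n5[i * 5 + 3] -= 1.0f;  // x2
    rois_n5[i * 5 + 4] -= 1.0f;  // y2
  }
}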
-REGISTER_OP_NPU_KERNEL( - roi_align, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel, - ops::ROIAlignNPUKernel); - -REGISTER_OP_NPU_KERNEL(roi_align_grad, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel, - ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/run_program_op_npu.cc b/paddle/fluid/operators/run_program_op_npu.cc deleted file mode 100644 index e45ce0a2bef9ff..00000000000000 --- a/paddle/fluid/operators/run_program_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/sampling_id_op_npu.cc b/paddle/fluid/operators/sampling_id_op_npu.cc deleted file mode 100644 index 5657edcfa35bb3..00000000000000 --- a/paddle/fluid/operators/sampling_id_op_npu.cc +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sampling_id_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(sampling_id, - paddle::operators::SamplingIdKernel, - paddle::operators::SamplingIdKernel); diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc deleted file mode 100644 index 1fb136a5110dbd..00000000000000 --- a/paddle/fluid/operators/save_combine_op_npu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/save_combine_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save_combine, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel, - ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc deleted file mode 100644 index d6063d66f1531c..00000000000000 --- a/paddle/fluid/operators/save_op_npu.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/save_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - save, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel, - ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc deleted file mode 100644 index c25a49c4f3b600..00000000000000 --- a/paddle/fluid/operators/scale_op_npu.cc +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -static inline T GetAttrFromTensor(const phi::DenseTensor* tensor) { - const auto* tensor_data = tensor->data(); - phi::DenseTensor cpu_tensor; - if (platform::is_gpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - paddle::framework::TensorCopySync( - *tensor, platform::CPUPlace(), &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -template -class ScaleNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto scale = ctx.Attr("scale"); - auto bias = ctx.Attr("bias"); - auto bias_after_scale = ctx.Attr("bias_after_scale"); - auto stream = - ctx.template device_context() - .stream(); - float power = 1.0; - VLOG(4) << "scale:" << scale << ", bias:" << bias - << " ,bias_after_scale:" << bias_after_scale; - if (ctx.HasInput("ScaleTensor")) { - auto* scale_tensor = ctx.Input("ScaleTensor"); - scale = static_cast(GetAttrFromTensor(scale_tensor)); - } - if (isinf(scale)) { - if (signbit(scale)) { - scale = -std::numeric_limits::max(); - } else { - scale = std::numeric_limits::max(); - } - } - if (!bias_after_scale) { - bias *= scale; - } - out->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attrs = { - {"power", power}, {"scale", scale}, {"shift", bias}}; - const auto& dev_ctx = - ctx.template device_context(); - auto op_func = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& muls_runner = NpuOpRunner( - "Muls", {inputs[0]}, {outputs[0]}, {{"value", attrs.at("scale")}}); - muls_runner.Run(dev_ctx.stream()); - - const 
auto& adds_runner = NpuOpRunner( - "Adds", {outputs[0]}, {outputs[0]}, {{"value", attrs.at("shift")}}); - adds_runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT32) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::INT64) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attrs, - dev_ctx, - op_func, - {framework::proto::VarType::INT32}, - {framework::proto::VarType::INT32}); - } else { - const auto& runner = NpuOpRunner("Power", {*x}, {*out}, attrs); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - scale, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel, - paddle::operators::ScaleNPUKernel); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc deleted file mode 100644 index b2b09faaa9d445..00000000000000 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc deleted file mode 100644 index 1843e993d552a1..00000000000000 --- a/paddle/fluid/operators/seed_op_npu.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/seed_op.h" - -namespace paddle { -namespace operators { - -template -class NPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Output("Out"); - int user_seed = ctx.Attr("seed"); - std::random_device rnd; - int seed; - - if (user_seed != 0) { - seed = user_seed; - } else { - seed = rnd(); - } - - out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(out, seed); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - seed, ops::NPUSeedKernel); diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc deleted file mode 100644 index b572e98eb81e9f..00000000000000 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/set_value_op.h" -#include "paddle/phi/kernels/funcs/slice_utils.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class SetValueNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = - phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = - phi::funcs::GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - 
slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - paddle::framework::TensorCopy(*in, ctx.GetPlace(), out); - - auto starts_indices = std::vector(in_dims.size(), 0); - auto ends_indices = std::vector(in_dims.size(), 0); - auto strides_indices = std::vector(in_dims.size(), 0); - - for (int i = 0; i < in_dims.size(); ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - } - - int64_t stride_step = phi::product(in_dims); - std::vector index_indices(1, 0); - for (size_t i = 0; i < strides_indices.size(); ++i) { - auto index_size = index_indices.size(); - stride_step /= in_dims[i]; - for (size_t j = 0; j < index_size; ++j) { - auto start_index = *index_indices.begin(); - if (strides_indices[i] > 0) { - for (int64_t k = starts_indices[i]; k < ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } else { - for (int64_t k = starts_indices[i]; k > ends_indices[i]; - k += strides_indices[i]) { - index_indices.push_back(start_index + k * stride_step); - } - } - index_indices.erase(index_indices.begin()); - } - } - - PADDLE_ENFORCE_EQ( - static_cast(index_indices.size()), - phi::product(slice_dims_for_assign), - platform::errors::InvalidArgument( - "OP(set_value) error index indices and value update not match ")); - - phi::DenseTensor value_t(in->type()); - if (value_tensor != nullptr) { - value_t.ShareDataWith(*value_tensor); - } else { - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, ctx.GetPlace()); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVectorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - } - - auto stream = ctx.template device_context().stream(); - - phi::DenseTensor value_temp(in->type()); - if (slice_dims_for_assign == value_t.dims()) { - value_temp.ShareDataWith(value_t); - } else { - value_temp.Resize(slice_dims_for_assign); - value_temp.mutable_data(ctx.GetPlace()); - NpuOpRunner runner_brd; - runner_brd.SetType("BroadcastTo") - .AddInput(value_t) - .AddInput(phi::vectorize(slice_dims_for_assign)) - .AddOutput(value_temp) - .Run(stream); - } - - int64_t input_numel = phi::product(in_dims); - int64_t index_numel = index_indices.size(); - - phi::DenseTensor in_temp, out_temp, val_temp; - in_temp.ShareDataWith(*in); - out_temp.ShareDataWith(*out); - val_temp.ShareDataWith(value_temp); - in_temp.Resize(phi::make_ddim({input_numel})); - out_temp.Resize(phi::make_ddim({input_numel})); - val_temp.Resize(phi::make_ddim({index_numel})); - - NpuOpRunner runner; - runner.SetType("ScatterUpdate") - .AddInput(in_temp) - .AddInput(std::move(index_indices)) - .AddInput(val_temp) - .AddOutput(out_temp) -#if (CANN_VERSION_CODE >= 504000) - .AddAttrs({{"use_locking", false}}) -#endif - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(set_value, - ops::SetValueNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SetValueNPUKernel, -#endif - ops::SetValueNPUKernel) diff --git 
a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc deleted file mode 100644 index 76f4539e70b2f7..00000000000000 --- a/paddle/fluid/operators/shape_op_npu.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class ShapeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - out_t->Resize({x->dims().size()}); - out_t->mutable_data(ctx.GetPlace()); - - // The output data type defaults to int32. - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - runner.SetType("Shape").AddInput(*x).AddOutput(*out_t).AddAttr( - "dtype", static_cast(dst_dtype)); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - shape, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel, - ops::ShapeNPUKernel); diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc deleted file mode 100644 index 4181db1d8e04cd..00000000000000 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
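// [Editorial sketch] Ascend has no single op for shard_index, so the kernel
// below composes Mod, FloorDiv, Equal and Select. The per-element arithmetic
// those four ops implement reduces to the scalar reference below
// (hypothetical helper; it mirrors the ceil-divided shard_size computed in
// the kernel):
inline int ShardIndexRefSketch(int v, int index_num, int nshards,
                               int shard_id, int ignore_value) {
  const int shard_size = (index_num + nshards - 1) / nshards;
  return (v / shard_size == shard_id) ? v % shard_size : ignore_value;
}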
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class ShardIndexNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "start kernel";
-    auto* in = context.Input<phi::DenseTensor>("X");
-    auto* out = context.Output<phi::DenseTensor>("Out");
-    int index_num = context.Attr<int>("index_num");
-    int nshards = context.Attr<int>("nshards");
-    int shard_id = context.Attr<int>("shard_id");
-    int ignore_value = context.Attr<int>("ignore_value");
-
-    PADDLE_ENFORCE_GT(
-        index_num,
-        0,
-        platform::errors::InvalidArgument(
-            "The value 'index_num' for Op(shard_index) must be greater than "
-            "0, but the value given is %d.",
-            index_num));
-    PADDLE_ENFORCE_GT(nshards,
-                      0,
-                      platform::errors::InvalidArgument(
-                          "The value 'nshards' for Op(shard_index) must be "
-                          "greater than 0, but the value given is %d.",
-                          nshards));
-    PADDLE_ENFORCE_GE(
-        shard_id,
-        0,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be greater than "
-            "or equal to 0, but the value given is %d.",
-            shard_id));
-    PADDLE_ENFORCE_LT(
-        shard_id,
-        nshards,
-        platform::errors::InvalidArgument(
-            "The value 'shard_id' for Op(shard_index) must be less than "
-            "nshards (%d), but the value given is %d.",
-            nshards,
-            shard_id));
-
-    int shard_size = (index_num + nshards - 1) / nshards;
-
-    auto place = context.GetPlace();
-    out->Resize(in->dims());
-    out->set_lod(in->lod());
-    out->mutable_data<T>(place);
-
-    phi::DenseTensor tmp(in->type());
-    tmp.mutable_data<T>(framework::DDim({1}), place);
-    FillNpuTensorWithConstant<T>(&tmp, shard_size);
-
-    phi::DenseTensor condition(phi::DataType::BOOL);
-    condition.mutable_data<bool>(in->dims(), place);
-
-    phi::DenseTensor tmp2(in->type());
-    tmp2.mutable_data<T>(in->dims(), place);
-
-    phi::DenseTensor tmp3(in->type());
-    tmp3.mutable_data<T>(in->dims(), place);
-
-    auto stream =
-        context.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    NpuOpRunner runner;
-    runner.AddInputs({*in, tmp});
-    runner.AddOutputs({tmp2});
-    runner.SetType("Mod");
-    runner.Run(stream);
-
-    NpuOpRunner runner1;
-    runner1.AddInputs({*in, tmp});
-    runner1.AddOutputs({tmp3});
-    runner1.SetType("FloorDiv");
-    runner1.Run(stream);
-
-    FillNpuTensorWithConstant<T>(&tmp, shard_id);
-    NpuOpRunner runner2;
-    runner2.AddInputs({tmp3, tmp});
-    runner2.AddOutputs({condition});
-    runner2.SetType("Equal");
-    runner2.Run(stream);
-
-    phi::DenseTensor tmp4(in->type());
-    tmp4.mutable_data<T>(in->dims(), place);
-    FillNpuTensorWithConstant<T>(&tmp4, ignore_value);
-    tmp4.Resize(in->dims());
-
-    NpuOpRunner runner3;
-    runner3.AddInputs({condition, tmp2, tmp4});
-    runner3.AddOutputs({*out});
-    runner3.SetType("Select");
-    runner3.Run(stream);
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-REGISTER_OP_NPU_KERNEL(shard_index,
-                       ops::ShardIndexNPUKernel,
-                       ops::ShardIndexNPUKernel);
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
deleted file mode 100644
index 0d4ad6331e8070..00000000000000
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc
+++ /dev/null
@@ -1,107 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-const int kIgnoreIndex = -100;
-
-void CheckAttrs(const framework::ExecutionContext& ctx) {
-  // This check exists because the Ascend SigmoidCrossEntropyWithLogits
-  // and SigmoidCrossEntropyWithLogitsGrad operators don't support the
-  // attrs normalize and ignore_index.
-  bool normalize = ctx.Attr<bool>("normalize");
-  int ignore_index = ctx.Attr<int>("ignore_index");
-  PADDLE_ENFORCE_EQ(normalize,
-                    false,
-                    platform::errors::InvalidArgument(
-                        "attr normalize must be false, but got true"));
-  PADDLE_ENFORCE_EQ(ignore_index,
-                    kIgnoreIndex,
-                    platform::errors::InvalidArgument(
-                        "attr ignore_index must be the default %d, but got %d",
-                        kIgnoreIndex,
-                        ignore_index));
-}
-
-template <typename T>
-class SigmoidCrossEntropyWithLogitsNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    CheckAttrs(ctx);
-
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* label = ctx.Input<phi::DenseTensor>("Label");
-
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-
-    auto place = ctx.GetPlace();
-
-    out->mutable_data<T>(place);
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    const auto& runner =
-        NpuOpRunner("SigmoidCrossEntropyWithLogits", {*x, *label}, {*out}, {});
-    runner.Run(stream);
-  }
-};
-
-template <typename T>
-class SigmoidCrossEntropyWithLogitsNPUGradKernel
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    CheckAttrs(ctx);
-
-    auto* x = ctx.Input<phi::DenseTensor>("X");
-    auto* label = ctx.Input<phi::DenseTensor>("Label");
-    auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-
-    auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
-
-    auto place = ctx.GetPlace();
-
-    dx->mutable_data<T>(place);
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-
-    const auto& runner_dx = NpuOpRunner(
-        "SigmoidCrossEntropyWithLogitsGrad", {*x, *label, *dout}, {*dx}, {});
-    runner_dx.Run(stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    sigmoid_cross_entropy_with_logits,
-    ops::SigmoidCrossEntropyWithLogitsNPUKernel,
-    ops::SigmoidCrossEntropyWithLogitsNPUKernel);
-REGISTER_OP_NPU_KERNEL(
-    sigmoid_cross_entropy_with_logits_grad,
-    ops::SigmoidCrossEntropyWithLogitsNPUGradKernel,
-    ops::SigmoidCrossEntropyWithLogitsNPUGradKernel);
diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc
deleted file mode 100644
index 594b0cc18e886a..00000000000000
--- a/paddle/fluid/operators/size_op_npu.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-class SizeNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<phi::DenseTensor>("Input");
-    auto* out = ctx.Output<phi::DenseTensor>("Out");
-    out->mutable_data<int64_t>(ctx.GetPlace());
-
-    Tensor cpu_tensor;
-    auto cpu_data =
-        cpu_tensor.mutable_data<int64_t>(out->dims(), platform::CPUPlace());
-    cpu_data[0] = x->numel();
-    paddle::framework::TensorCopy(
-        cpu_tensor,
-        ctx.GetPlace(),
-        ctx.template device_context<platform::NPUDeviceContext>(),
-        out);
-    ctx.template device_context<platform::NPUDeviceContext>().Wait();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    size,
-    ops::SizeNPUKernel,
-    ops::SizeNPUKernel,
-    ops::SizeNPUKernel,
-    ops::SizeNPUKernel,
-    ops::SizeNPUKernel,
-    ops::SizeNPUKernel);
diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc
deleted file mode 100644
index a54ba630b274c0..00000000000000
--- a/paddle/fluid/operators/slice_op_npu.cc
+++ /dev/null
@@ -1,254 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/utils.h"
-#include "paddle/phi/core/tensor_utils.h"
-#include "paddle/phi/kernels/funcs/slice_utils.h"
-
-namespace paddle {
-namespace operators {
-
-using NPUDeviceContext = platform::NPUDeviceContext;
-
-void UpdateAttr(const framework::DDim& in_dims,
-                const std::vector<int> axes,
-                const std::vector<int> starts,
-                const std::vector<int> ends,
-                std::vector<int>* offsets,
-                std::vector<int>* size) {
-  int cnt = 0;
-  for (int i = 0; i < in_dims.size(); ++i) {
-    int start = 0;
-    int end = in_dims[i];
-    // NOTE(zhiqiu): Be careful that cnt may become greater than axes.size()
-    // and result in an overflow.
-    int axis = cnt < static_cast<int>(axes.size()) ?
axes[cnt] : -1; - if (axis == i) { - start = starts[cnt]; - if (start < 0) { - start = (start + in_dims[i]); - } - start = std::max(start, static_cast(0)); - end = ends[cnt]; - if (end < 0) { - end = (end + in_dims[i]); - } - end = std::min(end, static_cast(in_dims[i])); - cnt++; - } - - (*offsets)[i] = start; - (*size)[i] = end - start; - } -} - -template -class SliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto infer_flags = ctx.Attr>("infer_flags"); - - const auto& in_dims = input->dims(); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - PADDLE_ENFORCE_EQ( - starts.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), - axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - - if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || - starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { - // Infer output dims - auto out_dims = out->dims(); - auto slice_dims = out_dims; - for (size_t i = 0; i < axes.size(); ++i) { - // when start == -1 && end == start+1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = in_dims[axes[i]]; - } - } - } - - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); - slice_dims = phi::funcs::GetSliceDims( - in_dims, axes, starts, ends, nullptr, nullptr); - out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); - - out->Resize(out_dims); - } - - out->mutable_data(ctx.GetPlace()); - - std::vector offsets(in_dims.size()); - std::vector size(in_dims.size()); - - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - auto& dev_ctx = ctx.template device_context(); - auto stream = dev_ctx.stream(); -#if CANN_VERSION_CODE < 512000 - const auto& runner = - NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, { - "size", - size - }}); -#else - NpuOpRunner runner; - runner.SetType("Slice") - .AddInput(*input) - .AddInput(std::move(offsets)) - .AddInput(std::move(size)) - .AddOutput(*out); -#endif - runner.Run(stream); - } -}; - -template -class SliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dinput = - 
ctx.Output(framework::GradVarName("Input")); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - std::vector axes(axes_int.begin(), axes_int.end()); - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - - // Get the accurate attribute value of starts and ends - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = phi::GetVectorFromTensor( - ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = phi::GetVectorFromTensor( - ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - const auto& in_dims = input->dims(); - int rank = in_dims.size(); - - std::vector offsets(rank); - std::vector size(rank); - UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - - std::vector> paddings(rank, std::vector(2)); - for (int i = 0; i < rank; ++i) { - paddings[i][0] = static_cast(offsets[i]); - paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); - } - - phi::DenseTensor tmp_dout; - tmp_dout.ShareDataWith(*dout); - auto out_dims = dout->dims(); - auto decrease_axis = ctx.Attr>("decrease_axis"); - auto decrease_size = decrease_axis.size(); - if (decrease_size > 0) { - if (decrease_size == static_cast(in_dims.size())) { - out_dims = phi::make_ddim(std::vector(decrease_size, 1)); - } else { - std::vector origin_out_shape(out_dims.size() + decrease_size, -1); - for (size_t i = 0; i < decrease_size; ++i) { - origin_out_shape[decrease_axis[i]] = 1; - } - int index = 0; - for (size_t i = 0; i < origin_out_shape.size(); ++i) { - if (origin_out_shape[i] == -1) { - origin_out_shape[i] = out_dims[index]; - ++index; - } - } - out_dims = phi::make_ddim(origin_out_shape); - } - tmp_dout.Resize(out_dims); - } - - dinput->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = - NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(slice, - ops::SliceNPUKernel, - ops::SliceNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::SliceNPUKernel, -#endif - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL(slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc b/paddle/fluid/operators/smooth_l1_loss_op_npu.cc deleted file mode 100644 index abb6353ca0d1da..00000000000000 --- a/paddle/fluid/operators/smooth_l1_loss_op_npu.cc +++ /dev/null @@ -1,218 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/smooth_l1_loss_op.h" - -namespace paddle { -namespace operators { - -template -class SmoothL1LossNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in_x = context.Input("X"); - auto* in_y = context.Input("Y"); - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* out_diff = context.Output("Diff"); - auto* out_loss = context.Output("Out"); - out_diff->mutable_data(context.GetPlace()); - out_loss->mutable_data(context.GetPlace()); - - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - // out_diff = in_x - in_y - auto stream = - context.template device_context() - .stream(); - const auto& runner1 = NpuOpRunner("Sub", {*in_x, *in_y}, {*out_diff}, {}); - runner1.Run(stream); - - phi::DenseTensor no_reduce_loss(in_x->dtype()); - no_reduce_loss.Resize(in_x->dims()); - no_reduce_loss.mutable_data(context.GetPlace()); - // multiply inside weight before get the loss - if (has_weight) { - phi::DenseTensor tmp_diff(out_diff->dtype()); - tmp_diff.Resize(out_diff->dims()); - tmp_diff.mutable_data(context.GetPlace()); - const auto& runner2 = - NpuOpRunner("Mul", {*out_diff, *inside_weight}, {tmp_diff}, {}); - runner2.Run(stream); - framework::TensorCopy( - tmp_diff, - context.GetPlace(), - context.template device_context(), - out_diff); - - phi::DenseTensor tmp_x(in_x->dtype()); - tmp_x.Resize(in_x->dims()); - tmp_x.mutable_data(context.GetPlace()); - - phi::DenseTensor tmp_y(in_y->dtype()); - tmp_y.Resize(in_y->dims()); - tmp_y.mutable_data(context.GetPlace()); - - // mul input and inside_weight - const auto& runner_x = - NpuOpRunner("Mul", {*in_x, *inside_weight}, {tmp_x}, {}); - runner_x.Run(stream); - const auto& runner_y = - NpuOpRunner("Mul", {*in_y, *inside_weight}, {tmp_y}, {}); - runner_y.Run(stream); - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {tmp_x, tmp_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } else { - const auto& runner3 = NpuOpRunner("SmoothL1Loss", - {*in_x, *in_y}, - {no_reduce_loss}, - {{"sigma", sigma2}}); - runner3.Run(stream); - } - - // multiply outside weight and loss - // reduceSum because the output'shape must be [B,1] - if (has_weight) { - phi::DenseTensor tmp_loss(no_reduce_loss.dtype()); - tmp_loss.Resize(no_reduce_loss.dims()); - tmp_loss.mutable_data(context.GetPlace()); - const auto& runner4 = - NpuOpRunner("Mul", {no_reduce_loss, *outside_weight}, {tmp_loss}, {}); - runner4.Run(stream); - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {tmp_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } else { - const auto& runner5 = - NpuOpRunner("ReduceSumD", - {no_reduce_loss}, - {*out_loss}, - {{"axes", std::vector{1}}, {"keep_dims", true}}); - runner5.Run(stream); - } - } -}; - -template -class SmoothL1LossGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* inside_weight = context.Input("InsideWeight"); - auto* outside_weight = context.Input("OutsideWeight"); - auto* diff = context.Input("Diff"); - auto* og = context.Input(framework::GradVarName("Out")); - auto* outx_grad = - context.Output(framework::GradVarName("X")); - auto* outy_grad = - 
context.Output(framework::GradVarName("Y")); - auto sigma = context.Attr("sigma"); - T sigma2 = 1.0 / (sigma * sigma); - bool has_weight = (inside_weight != nullptr) && (outside_weight != nullptr); - - auto stream = - context.template device_context() - .stream(); - - // diff == in_x - in_y == diff - 0 - phi::DenseTensor tmp_zero(diff->dtype()); - tmp_zero.Resize(diff->dims()); - tmp_zero.mutable_data(context.GetPlace()); - const auto& runner_zero = NpuOpRunner("ZerosLike", {*diff}, {tmp_zero}, {}); - runner_zero.Run(stream); - - phi::DenseTensor grad(diff->dtype()); - grad.Resize(diff->dims()); - grad.mutable_data(context.GetPlace()); - // broadcast og(output_grad) to adapt to the npu interface - const auto& runner_broad = - NpuOpRunner("BroadcastToD", - {*og}, - {grad}, - {{"shape", phi::vectorize(diff->dims())}}); - runner_broad.Run(stream); - - phi::DenseTensor gradient(diff->dtype()); - gradient.Resize(diff->dims()); - gradient.mutable_data(context.GetPlace()); - // diff == diff - 0 == in_x - in_y - const auto& runner_grad = NpuOpRunner("SmoothL1LossGrad", - {*diff, tmp_zero, grad}, - {gradient}, - {{"sigma", sigma2}}); - runner_grad.Run(stream); - - // mul weight and gradient - if (has_weight) { - phi::DenseTensor weight(inside_weight->dtype()); - weight.Resize(inside_weight->dims()); - weight.mutable_data(context.GetPlace()); - const auto& runner_weight = - NpuOpRunner("Mul", {*inside_weight, *outside_weight}, {weight}, {}); - runner_weight.Run(stream); - - phi::DenseTensor tmp_grad(gradient.dtype()); - tmp_grad.Resize(gradient.dims()); - tmp_grad.mutable_data(context.GetPlace()); - const auto& runner_weight_grad = - NpuOpRunner("Mul", {gradient, weight}, {tmp_grad}, {}); - runner_weight_grad.Run(stream); - - framework::TensorCopy( - tmp_grad, - context.GetPlace(), - context.template device_context(), - &gradient); - } - // outx_grad = gradient - if (outx_grad) { - outx_grad->mutable_data(context.GetPlace()); - framework::TensorCopy( - gradient, - context.GetPlace(), - context.template device_context(), - outx_grad); - } - - // outy_grad = - gradient - if (outy_grad) { - outy_grad->mutable_data(context.GetPlace()); - phi::DenseTensor coeff(phi::DataType::FLOAT32); - coeff.mutable_data({1}, context.GetPlace()); - FillNpuTensorWithConstant(&coeff, -1); - const auto& runner_y_grad = - NpuOpRunner("Mul", {coeff, gradient}, {*outy_grad}, {}); - runner_y_grad.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss, - ops::SmoothL1LossNPUKernel); - -REGISTER_OP_NPU_KERNEL( - smooth_l1_loss_grad, - ops::SmoothL1LossGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc deleted file mode 100644 index de7df0de5b3d58..00000000000000 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto axis = ctx.Attr("axis"); - std::vector axes; - axes.push_back(axis); - framework::NPUAttributeMap attr_input = {{"axes", axes}}; - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class SoftmaxGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dOut = ctx.Input(framework::GradVarName("Out")); - - auto* dX = ctx.Output(framework::GradVarName("X")); - - auto dims = dX->dims(); - const int rank = dims.size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - int64_t first_dim = 1; - int64_t sec_dim = 1; - for (int i = 0; i < axis; i++) { - first_dim *= dims[i]; - } - for (int i = axis; i < rank; i++) { - sec_dim *= dims[i]; - } - - Tensor tmp_out; - tmp_out.ShareDataWith(*out).Resize({first_dim, sec_dim}); - - Tensor tmp_dOut; - tmp_dOut.ShareDataWith(*dOut).Resize({first_dim, sec_dim}); - - dX->Resize(phi::make_ddim({first_dim, sec_dim})); - dX->mutable_data(ctx.GetPlace()); - - framework::NPUAttributeMap attr_input = {}; - const auto& runner = NpuOpRunner( - std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, {*dX}, attr_input); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - dX->Resize(dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - softmax, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel, - ops::SoftmaxNPUKernel); - -REGISTER_OP_NPU_KERNEL( - softmax_grad, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel, - ops::SoftmaxGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc deleted file mode 100644 index dd1462b1c07cc5..00000000000000 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(softmax); -USE_OP_DEVICE_KERNEL(softmax, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - std::vector init; - for (int i = 3; i < 9; ++i) { - init.push_back(static_cast(i)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({2, 3}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({2, 3}); - tensor_out->mutable_data(place); // allocate - - // run - int axis = 1; - f::AttributeMap attrs = { - {"axis", axis}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - }; - - auto op = f::OpRegistry::CreateOp( - "softmax", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - - for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; - } - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector out_init; - - out_init.push_back(static_cast(0.6670)); - out_init.push_back(static_cast(0.5888)); - out_init.push_back(static_cast(0.4543)); - out_init.push_back(static_cast(0.3330)); - out_init.push_back(static_cast(0.4112)); - out_init.push_back(static_cast(0.5457)); - - paddle::framework::TensorFromVector(out_init, ctx, tensor_out); - tensor_out->Resize({2, 3}); - - ctx.Wait(); - - auto dout = scope->Var("DOut"); - auto tensor_dout = dout->GetMutable(); - - std::vector dout_init; - for (int i = 0; i < 6; ++i) { - dout_init.push_back(static_cast(1.0)); - } - - paddle::framework::TensorFromVector(dout_init, ctx, tensor_dout); - tensor_dout->Resize({2, 3}); - - ctx.Wait(); - - auto dx = scope->Var("DX"); - auto tensor_dx = dx->GetMutable(); - - ctx.Wait(); - - // run - f::AttributeMap attrs; - attrs = { - {"name", std::string("softmax_grad")}, - {"axis", static_cast(0)}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - {"data_format", std::string("AnyLayout")}, - }; - auto op = f::OpRegistry::CreateOp("softmax_grad", - {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, - attrs); - - auto place = ctx.GetPlace(); - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_dx->dims()[0], (uint32_t)(2)); - EXPECT_EQ((uint32_t)tensor_dx->dims()[1], (uint32_t)(3)); - - ctx.Wait(); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_dx, ctx, &out_vec); - - ctx.Wait(); - - EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); - EXPECT_NEAR((float)out_vec[0], (float)(-0.4737), 0.1); - EXPECT_NEAR((float)out_vec[1], (float)(-0.4181), 0.1); - EXPECT_NEAR((float)out_vec[2], 
(float)(-0.3226), 0.1); - EXPECT_NEAR((float)out_vec[3], (float)(-0.0965), 0.1); - EXPECT_NEAR((float)out_vec[4], (float)(-0.1192), 0.1); - EXPECT_NEAR((float)out_vec[5], (float)(-0.1582), 0.1); -} - -TEST(softmax, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(softmax_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc deleted file mode 100644 index af0e9d55445d5e..00000000000000 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/axis_utils.h" -#include "paddle/phi/kernels/funcs/cross_entropy.h" -#include "paddle/phi/kernels/funcs/softmax.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* logits = ctx.Input("Logits"); - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Output("Softmax"); - auto* loss = ctx.Output("Loss"); - auto* backprop = ctx.Output("Backprop"); - auto soft_label = ctx.Attr("soft_label"); - PADDLE_ENFORCE_EQ(soft_label, - false, - platform::errors::Unimplemented( - "soft_label=True is not supported in " - "the npu kernel of softmax_with_cross_entropy.")); - - const int rank = logits->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); - - PADDLE_ENFORCE_EQ( - labels->numel(), - n, - platform::errors::Unimplemented( - "The size of labels should be equal to phi::funcs::SizeToAxis of " - "logits," - "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", - labels->numel(), - n)); - - loss->mutable_data(ctx.GetPlace()); - backprop->mutable_data(ctx.GetPlace()); - softmax->mutable_data(ctx.GetPlace()); - - phi::DenseTensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; - logits_2d.ShareDataWith(*logits).Resize({n, d}); - labels_1d.ShareDataWith(*labels).Resize({n}); - loss_1d.ShareDataWith(*loss).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - softmax_2d.ShareDataWith(*softmax).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - for (auto i = axis; i < logits->dims().size(); ++i) { - axes.push_back(i); - } - const auto& runner_softmax = - NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); - runner_softmax.Run(stream); - - // SparseSoftmaxCrossEntropyWithLogits - const auto& runner_s = 
NpuOpRunner("SparseSoftmaxCrossEntropyWithLogits", - {logits_2d, labels_1d}, - {loss_1d, backprop_2d}, - {}); - runner_s.Run(stream); - } -}; - -template -class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* backprop = ctx.Input("Backprop"); - auto* loss_grad = - ctx.Input(framework::GradVarName("Loss")); - auto* logits_grad = - ctx.Output(framework::GradVarName("Logits")); - - PADDLE_ENFORCE_NOT_NULL(backprop, - platform::errors::PreconditionNotMet( - "backprop should not be null in NPU kernel of " - "softmax_with_cross_entropy_grad.")); - logits_grad->mutable_data(ctx.GetPlace()); - - const int rank = logits_grad->dims().size(); - const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); - const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); - const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); - - phi::DenseTensor logits_grad_2d, loss_grad_1d, backprop_2d; - - logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); - loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); - backprop_2d.ShareDataWith(*backprop).Resize({n, d}); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner_mul = - NpuOpRunner("Mul", {*loss_grad, *backprop}, {*logits_grad}, {}); - runner_mul.Run(stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyNPUKernel, - ops::SoftmaxWithCrossEntropyNPUKernel); -REGISTER_OP_NPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - float>, - ops::SoftmaxWithCrossEntropyGradNPUKernel< - paddle::platform::NPUDeviceContext, - paddle::platform::float16>); diff --git a/paddle/fluid/operators/split_op_npu.cc b/paddle/fluid/operators/split_op_npu.cc deleted file mode 100644 index 763b375d00e9b8..00000000000000 --- a/paddle/fluid/operators/split_op_npu.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/operators/split_op.h" - -namespace paddle { -namespace operators { - -template -class SplitNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int num = ctx.Attr("num"); - std::vector sections = ctx.Attr>("sections"); - int axis = ctx.Attr("axis"); - - if (ctx.HasInput("AxisTensor")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The AxisTensor is not supported on NPU now.")); - } - if (ctx.HasInput("SectionsTensorList")) { - // TODO(liupeng51): - PADDLE_THROW(platform::errors::Unimplemented( - "The SectionsTensorList is not supported on NPU now.")); - } - - std::vector outputs; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(*outs[j]); - } - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner; - if (sections.size() == 0) { - framework::NPUAttributeMap attr_input = {{"num_split", num}, - {"split_dim", axis}}; - runner.SetType("SplitD").AddInputs({*in}).AddOutputs(outputs).AddAttrs( - attr_input); - } else { - framework::NPUAttributeMap attr_input = { - {"size_splits", sections}, - {"split_dim", axis}, - {"num_split", static_cast(sections.size())}}; - runner.SetType("SplitVD").AddInput(*in).AddOutputs(outputs).AddAttrs( - attr_input); - } - - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL(split, - ops::SplitNPUKernel, - ops::SplitNPUKernel, - ops::SplitNPUKernel); diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc deleted file mode 100644 index fb7d4607fc085f..00000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
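The two branches in the split kernel above differ only in how the per-output sizes along the split axis are derived: SplitD divides the axis into `num` equal parts, SplitVD takes the explicit `sections` list. A standalone sketch of that size logic, with an illustrative helper name (Paddle's convention of letting one section be -1 and inferring it is omitted here):

// Output sizes along the split axis for the SplitD / SplitVD branches.
#include <cassert>
#include <vector>

std::vector<int> split_sizes(int axis_dim, int num,
                             const std::vector<int>& sections) {
  if (sections.empty()) {  // SplitD branch: `num` equal parts
    assert(num > 0 && axis_dim % num == 0);
    return std::vector<int>(num, axis_dim / num);
  }
  int total = 0;           // SplitVD branch: explicit sections
  for (int s : sections) total += s;
  assert(total == axis_dim);
  return sections;
}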
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2NormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *out = context.Output("Out"); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - std::vector axis; - for (int i = 0; i < x->dims().size(); ++i) { - axis.push_back(i); - } - out->mutable_data(place); - const auto &runner = NpuOpRunner( - "SquareSumV1", {*x}, {*out}, {{"axis", axis}, {"keep_dims", false}}); - runner.Run(stream); - } -}; - -template -class SquaredL2NormGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *x_grad = - context.Output(framework::GradVarName("X")); - auto *out_grad = - context.Input(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ( - out_grad->numel(), - 1, - platform::errors::InvalidArgument( - "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); - - auto place = context.GetPlace(); - auto stream = - context.template device_context() - .stream(); - - // broadcast out_grad - phi::DenseTensor broadcasted_out_grad; - broadcasted_out_grad.mutable_data(x_grad->dims(), place); - const auto &broadcast_runner = - NpuOpRunner("BroadcastToD", - {*out_grad}, - {broadcasted_out_grad}, - {{"shape", phi::vectorize(x_grad->dims())}}); - broadcast_runner.Run(stream); - // mul x - phi::DenseTensor tmp_x_grad; - tmp_x_grad.mutable_data(x_grad->dims(), place); - const auto &mul_x_runner = - NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); - mul_x_runner.Run(stream); - // mul coefficient:2 - phi::DenseTensor coefficient; - coefficient.mutable_data({1}, place); - FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); - x_grad->mutable_data(place); - const auto &mul_coefficient_runner = - NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); - mul_coefficient_runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - squared_l2_norm, - ops::SquaredL2NormNPUKernel); -REGISTER_OP_NPU_KERNEL( - squared_l2_norm_grad, - ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc deleted file mode 100644 index 308f092ad740f1..00000000000000 --- a/paddle/fluid/operators/squeeze_op_npu.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
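The squared_l2_norm kernels above implement out = sum_i x_i^2 via SquareSumV1, and since d out / d x_i = 2 * x_i, the gradient is dx_i = 2 * dout * x_i with a scalar dout; the broadcast, Mul-by-x, and Mul-by-coefficient-2 runners spell that product out step by step. A standalone sketch of the same math, with illustrative names:

// out = sum(x_i^2); dx_i = 2 * dout * x_i (dout is a scalar).
#include <cstddef>
#include <vector>

float squared_l2_norm(const std::vector<float>& x) {
  float out = 0.f;
  for (float v : x) out += v * v;
  return out;
}

std::vector<float> squared_l2_norm_grad(const std::vector<float>& x,
                                        float dout) {
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) dx[i] = 2.f * dout * x[i];
  return dx;
}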
*/
-
-#include "paddle/fluid/operators/squeeze_op.h"
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_NPU_KERNEL(
-    squeeze,
-    ops::SqueezeKernel<plat::NPUDeviceContext, float>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, double>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, plat::float16>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, bool>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, uint8_t>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int8_t>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int64_t>);
-REGISTER_OP_NPU_KERNEL(
-    squeeze2,
-    ops::SqueezeKernel<plat::NPUDeviceContext, float>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, double>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, plat::float16>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, bool>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, uint8_t>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int8_t>,
-    ops::SqueezeKernel<plat::NPUDeviceContext, int64_t>);
-REGISTER_OP_NPU_KERNEL(
-    squeeze_grad,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, float>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, double>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, plat::float16>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, bool>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, int>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, uint8_t>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, int8_t>,
-    ops::SqueezeGradKernel<plat::NPUDeviceContext, int64_t>);
-REGISTER_OP_NPU_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, float>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, double>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, plat::float16>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, bool>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, int>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, uint8_t>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<plat::NPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc
deleted file mode 100644
index f0f683e4882465..00000000000000
--- a/paddle/fluid/operators/squeeze_op_npu_test.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
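All of the kernels registered above share one shape rule: with `axes` empty, every extent-1 dimension is dropped; otherwise only the listed dimensions are dropped, and those must have extent 1. A standalone sketch of that rule, with an illustrative helper name:

// Output shape of squeeze/squeeze2 for a given dims/axes pair.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> squeeze_shape(const std::vector<int64_t>& dims,
                                   std::vector<int> axes) {
  for (int& a : axes)
    if (a < 0) a += static_cast<int>(dims.size());  // negative axes wrap
  std::vector<int64_t> out;
  for (std::size_t i = 0; i < dims.size(); ++i) {
    bool listed =
        std::find(axes.begin(), axes.end(), static_cast<int>(i)) != axes.end();
    bool drop = axes.empty() ? dims[i] == 1 : listed;
    if (drop) {
      assert(dims[i] == 1);  // only extent-1 dimensions may be squeezed
      continue;
    }
    out.push_back(dims[i]);
  }
  return out;
}

The test file that follows exercises exactly this case: shape {1, 10, 1} with axes = {2} squeezes to {1, 10}.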
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(squeeze);
-USE_OP_DEVICE_KERNEL(squeeze, NPU);
-
-template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-
-  int dim0 = 1;
-  int dim1 = 10;
-  int dim2 = 1;
-
-  std::vector<T> init;
-  for (int64_t i = 0; i < dim0 * dim1 * dim2; ++i) {
-    init.push_back(static_cast<T>(0.1));
-  }
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({dim0, dim1, dim2});
-
-  ctx.Wait();
-
-  // run
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-
-  std::vector<int> axis;
-  axis.push_back(2);
-  f::AttributeMap attrs = {{"axes", axis}};
-
-  auto op = f::OpRegistry::CreateOp(
-      "squeeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-
-  op->Run(*scope, place);
-  ctx.Wait();
-
-  EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2));
-  EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0));
-  EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1));
-
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], static_cast<T>(0.1));
-  }
-
-  ctx.Wait();
-}
-
-TEST(squeeze, NPU_fp32) {
-  f::Scope scope;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
-  Compare<float>(&scope, *ctx);
-}
diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc
deleted file mode 100644
index 8c6447971d9ad2..00000000000000
--- a/paddle/fluid/operators/stack_op_npu.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
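The deleted gtest above feeds a {1, 10, 1} tensor through the NPU squeeze kernel and checks the output shape. The same expectation can be stated against the squeeze_shape helper sketched earlier; this uses gtest, and the helper is the illustrative one above, not a Paddle API.

// Mirrors the deleted test: shape {1, 10, 1}, axes = {2} -> {1, 10}.
#include <cstdint>
#include <vector>
#include "gtest/gtest.h"

TEST(squeeze_shape, drops_only_the_listed_extent1_axis) {
  std::vector<int64_t> out = squeeze_shape({1, 10, 1}, {2});
  ASSERT_EQ(out.size(), 2u);
  EXPECT_EQ(out[0], 1);
  EXPECT_EQ(out[1], 10);
}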
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class StackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.MultiInput("X"); - auto* y = ctx.Output("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -template -class StackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy = ctx.Input(framework::GradVarName("Y")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - PADDLE_ENFORCE_GT(num, - 0, - platform::errors::InvalidArgument( - "number of input phi::DenseTensor <= 0")); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto& runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL( - stack, - paddle::operators::StackNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackNPUKernel, -#endif - paddle::operators::StackNPUKernel, - paddle::operators::StackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - stack_grad, - paddle::operators::StackGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - paddle::operators::StackGradNPUKernel, -#endif - paddle::operators::StackGradNPUKernel, - paddle::operators::StackGradNPUKernel); diff --git a/paddle/fluid/operators/strided_slice_op_npu.cc b/paddle/fluid/operators/strided_slice_op_npu.cc deleted file mode 100644 index 4c3bfed5d5d4bc..00000000000000 --- a/paddle/fluid/operators/strided_slice_op_npu.cc +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
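Pack and Unpack in the stack kernels above are shape-level inverses: stacking N same-shape tensors inserts a new dimension of size N at `axis`, and the grad kernel unpacks along that same axis. A standalone sketch specialised to axis == 0, where the layout is a plain concatenation (illustrative names; inputs are assumed to share one shape):

// stack along axis 0 and its inverse; output shape is {N, ...input shape}.
#include <cstddef>
#include <vector>

std::vector<float> stack_axis0(const std::vector<std::vector<float>>& xs) {
  std::vector<float> y;
  for (const auto& x : xs)  // output "row" i is input tensor i
    y.insert(y.end(), x.begin(), x.end());
  return y;
}

std::vector<std::vector<float>> unstack_axis0(const std::vector<float>& y,
                                              std::size_t n) {
  std::size_t per = y.size() / n;
  std::vector<std::vector<float>> xs(n);
  for (std::size_t i = 0; i < n; ++i)
    xs[i].assign(y.begin() + i * per, y.begin() + (i + 1) * per);
  return xs;
}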
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/strided_slice.h" - -namespace paddle { -namespace operators { - -using Variable = framework::Variable; -using LoDTensorArray = framework::LoDTensorArray; -using DDim = framework::DDim; - -template -class StridedSliceNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = ctx.Input("Input")->dims().size(); - switch (rank) { - case 1: - StridedSliceCompute<1>(ctx); - break; - case 2: - StridedSliceCompute<2>(ctx); - break; - case 3: - StridedSliceCompute<3>(ctx); - break; - case 4: - StridedSliceCompute<4>(ctx); - break; - case 5: - StridedSliceCompute<5>(ctx); - break; - case 6: - StridedSliceCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - auto in = ctx.Input("Input"); - auto out = ctx.Output("Out"); - auto in_dims = in->dims(); - - // list - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - // vector> - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - // phi::DenseTensor - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - // out dims calculation - std::vector out_dims_vector(in_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - in_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - // check whether need to reverse (false: stride > 0; true: stride < 0) - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - in_dims, - infer_flags, - 
decrease_axis, - starts.size()); - - // construct the starts_indices, ends_indices and strides_indices tensor for - // calling StridedSlice op - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, ctx.device_context(), &starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, ctx.device_context(), &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, ctx.device_context(), &strides_indices_tensor); - - auto out_dims_origin = out_dims; - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], - 1, - platform::errors::InvalidArgument( - "the size of decrease dimension should be 1, but received %d.", - out_dims[decrease_axis[i]])); - out_dims_origin[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims_origin.size(); ++i) { - if (out_dims_origin[i] != 0) { - new_out_shape.push_back(out_dims_origin[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - out_dims_origin = phi::make_ddim(new_out_shape); - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - out->Resize(out_dims); - out->mutable_data(place); - - const auto& runner = NpuOpRunner("StridedSlice", - {*in, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor}, - {*out}, - {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}); - runner.Run(stream); - - if (need_reverse) { - phi::DenseTensor out_tmp; - out_tmp.mutable_data(out_dims, place); - paddle::framework::TensorCopy( - *out, - place, - ctx.template device_context(), - &out_tmp); - - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, ctx.device_context(), &reverse_axis); - - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {out_tmp, reverse_axis}, {*out}); - runner_reverse.Run(stream); - } - - if (decrease_axis.size() > 0) { - out->Resize(out_dims_origin); - } - } -}; - -template -class StridedSliceGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - PADDLE_ENFORCE_EQ(is_tensor_array, - false, - platform::errors::InvalidArgument( - "phi::DenseTensor array as input is not supported.")); - int rank = 
ctx.Input("Input")->dims().size(); - - switch (rank) { - case 1: - StridedSliceGradCompute<1>(ctx); - break; - case 2: - StridedSliceGradCompute<2>(ctx); - break; - case 3: - StridedSliceGradCompute<3>(ctx); - break; - case 4: - StridedSliceGradCompute<4>(ctx); - break; - case 5: - StridedSliceGradCompute<5>(ctx); - break; - case 6: - StridedSliceGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input is supported up to 6.")); - break; - } - } - - private: - template - void StridedSliceGradCompute(const framework::ExecutionContext& ctx) const { - auto place = ctx.GetPlace(); - auto& dev_ctx = - ctx.template device_context(); - - auto* input = ctx.Input("Input"); - auto input_dims = input->dims(); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - dx->mutable_data(input_dims, place); - - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - auto strides_int = ctx.Attr>("strides"); - - std::vector starts(starts_int.begin(), starts_int.end()); - std::vector ends(ends_int.begin(), ends_int.end()); - std::vector strides(strides_int.begin(), strides_int.end()); - - auto axes = ctx.Attr>("axes"); - auto infer_flags = ctx.Attr>("infer_flags"); - auto decrease_axis = ctx.Attr>("decrease_axis"); - - auto list_new_ends_tensor = - ctx.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - ctx.MultiInput("StartsTensorList"); - auto list_new_strides_tensor = - ctx.MultiInput("StridesTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (ctx.HasInput("StartsTensor")) { - auto* starts_tensor = ctx.Input("StartsTensor"); - starts = phi::GetVectorFromTensor(starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (ctx.HasInput("EndsTensor")) { - auto* ends_tensor = ctx.Input("EndsTensor"); - ends = phi::GetVectorFromTensor(ends_tensor); - } - - if (list_new_strides_tensor.size() > 0) { - strides = GetDataFromTensorList(list_new_strides_tensor); - } else if (ctx.HasInput("StridesTensor")) { - auto* strides_tensor = ctx.Input("StridesTensor"); - strides = phi::GetVectorFromTensor(strides_tensor); - } - - std::vector out_dims_vector(input_dims.size(), -1); - phi::funcs::StridedSliceOutDims(starts, - ends, - strides, - axes, - infer_flags, - input_dims, - decrease_axis, - out_dims_vector.data(), - axes.size(), - false); - - std::vector reverse_vector(starts.size(), 0); - phi::funcs::StridedSliceFunctor(starts.data(), - ends.data(), - strides.data(), - axes.data(), - reverse_vector.data(), - input_dims, - infer_flags, - decrease_axis, - starts.size()); - - std::vector starts_indices_vector(D, 0); - std::vector ends_indices_vector(out_dims_vector.begin(), - out_dims_vector.end()); - std::vector strides_indices_vector(D, 1); - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices_vector[axis_index] = starts[axis]; - ends_indices_vector[axis_index] = ends[axis]; - strides_indices_vector[axis_index] = strides[axis]; - } - - phi::DenseTensor starts_indices_tensor; - phi::DenseTensor ends_indices_tensor; - phi::DenseTensor strides_indices_tensor; - - starts_indices_tensor.mutable_data({D}, place); - ends_indices_tensor.mutable_data({D}, place); - strides_indices_tensor.mutable_data({D}, place); - - paddle::framework::TensorFromVector( - starts_indices_vector, dev_ctx, 
&starts_indices_tensor); - paddle::framework::TensorFromVector( - ends_indices_vector, dev_ctx, &ends_indices_tensor); - paddle::framework::TensorFromVector( - strides_indices_vector, dev_ctx, &strides_indices_tensor); - - std::vector input_dims_vector; - for (int i = 0; i < input_dims.size(); i++) { - input_dims_vector.push_back(input_dims[i]); - } - phi::DenseTensor input_dims_tensor; - paddle::framework::TensorFromVector( - input_dims_vector, dev_ctx, &input_dims_tensor); - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto stream = dev_ctx.stream(); - framework::NPUAttributeMap attr_input = {{"begin_mask", 0}, - {"end_mask", 0}, - {"ellipsis_mask", 0}, - {"new_axis_mask", 0}, - {"shrink_axis_mask", 0}}; - - if (need_reverse) { - phi::DenseTensor reverse_axis; - std::vector reverse_axis_vector; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - reverse_axis_vector.push_back(axes[axis]); - } - } - reverse_axis.mutable_data( - {static_cast(reverse_axis_vector.size())}, place); - paddle::framework::TensorFromVector( - reverse_axis_vector, dev_ctx, &reverse_axis); - - phi::DenseTensor dout_tmp; - dout_tmp.mutable_data(dout->dims(), place); - const auto& runner_reverse = - NpuOpRunner("ReverseV2", {*dout, reverse_axis}, {dout_tmp}); - runner_reverse.Run(stream); - - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - dout_tmp}, - {*dx}, - attr_input); - runner.Run(stream); - } else { - const auto& runner = NpuOpRunner("StridedSliceGrad", - {input_dims_tensor, - starts_indices_tensor, - ends_indices_tensor, - strides_indices_tensor, - *dout}, - {*dx}, - attr_input); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_NPU_KERNEL( - strided_slice, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel, - ops::StridedSliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - strided_slice_grad, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel, - ops::StridedSliceGradNPUKernel); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc deleted file mode 100644 index 5d1656b79e9a85..00000000000000 --- a/paddle/fluid/operators/sum_op_npu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using SelectedRows = phi::SelectedRows; - -template -class SumNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto out_var = ctx.OutputVar("Out"); - if (out_var->IsType()) { - auto *out = out_var->GetMutable(); - auto x = ctx.MultiInput("X"); - out->mutable_data(ctx.GetPlace()); - - auto place = ctx.GetPlace(); - - int n = static_cast(x.size()); - if (n == 1) { - paddle::framework::TensorCopy(*x[0], place, out); - return; - } - - std::vector inputs; - std::vector names; - for (int i = 0; i < n; ++i) { - if (x[i] && x[i]->numel() > 0) { - inputs.push_back(*x[i]); - names.push_back("x" + std::to_string(i)); - } else { - continue; - } - } - - auto stream = - ctx.template device_context() - .stream(); - NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; - runner.AddInputNames(names); - runner.Run(stream); - } else if (out_var->IsType()) { - auto in_vars = ctx.MultiInputVar("X"); - bool in_place = out_var == in_vars[0]; - auto &out_array = *out_var->GetMutable(); - for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { - PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), - true, - platform::errors::InvalidArgument( - "Only support all inputs are TensorArray, " - "but inputs[%d] is not TensorArray.", - i)); - auto &in_array = in_vars[i]->Get(); - - for (size_t i = 0; i < in_array.size(); ++i) { - if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) { - if (i >= out_array.size()) { - out_array.resize(i + 1); - } - if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) { - framework::TensorCopy(in_array[i], - in_array[i].place(), - ctx.device_context(), - &out_array[i]); - out_array[i].set_lod(in_array[i].lod()); - } else { - PADDLE_ENFORCE_EQ( - out_array[i].lod(), - in_array[i].lod(), - platform::errors::InvalidArgument( - "The lod message between inputs[%d] and" - " outputs[%d] must be same, but now is not same.", - i, - i)); - auto stream = ctx.template device_context< - paddle::platform::NPUDeviceContext>() - .stream(); - NpuOpRunner runner{ - "Add", {out_array[i], in_array[i]}, {out_array[i]}, {}}; - runner.Run(stream); - } - } - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be phi::DenseTensor or " - "LoDTensorArray. But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - sum, - ops::SumNPUKernel, - ops::SumNPUKernel); diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc deleted file mode 100644 index 1b3ed3ccc7a737..00000000000000 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ /dev/null @@ -1,1105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
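The dense branch of the deleted sum kernel is AddN: an elementwise sum over however many inputs are non-empty, with a plain copy when only one input remains. A standalone sketch with an illustrative name, mirroring the numel() > 0 filter and the n == 1 TensorCopy shortcut above (inputs are assumed to share one shape):

// out = x0 + x1 + ..., skipping empty inputs.
#include <cstddef>
#include <vector>

std::vector<float> add_n(const std::vector<std::vector<float>>& xs) {
  std::vector<float> out;
  for (const auto& x : xs) {
    if (x.empty()) continue;                 // mirrors the numel() > 0 filter
    if (out.empty()) { out = x; continue; }  // first (or only) input: copy
    for (std::size_t i = 0; i < x.size(); ++i) out[i] += x[i];
  }
  return out;
}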
-See the License for the specific language governing permissions and -limitations under the Licnse. */ - -#include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/collective_helper.h" - -namespace paddle { -namespace operators { - -template -void training_or_inference(const framework::ExecutionContext &ctx, - const aclrtStream &stream, - const platform::Place &place, - const DataLayout &layout, - const bool &test_mode, - const int &N, - const int &C, - const int &H, - const int &W, - const float epsilon, - const float &momentum, - const phi::DenseTensor *common_mean, - const phi::DenseTensor *common_var, - const phi::DenseTensor *x, - const phi::DenseTensor *scale, - const phi::DenseTensor *bias, - const phi::DenseTensor *mean, - const phi::DenseTensor *variance, - phi::DenseTensor *mean_out, - phi::DenseTensor *variance_out, - phi::DenseTensor *saved_mean, - phi::DenseTensor *saved_variance, - phi::DenseTensor *y) { - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - phi::DenseTensor common_mean_tile_1; - { - common_mean_tile_1.Resize({C}); - common_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_mean, place, &common_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_mean_tile.Resize(x->dims()); - common_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_mean_tile_1}, {common_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_1; - { - common_var_tile_1.Resize({C}); - common_var_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*common_var, place, &common_var_tile_1); - if (layout == phi::DataLayout::kNCHW) - common_var_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - common_var_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor common_var_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - common_var_tile.Resize(x->dims()); - common_var_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {common_var_tile_1}, {common_var_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - common_var_tile_add_epsilon.Resize(x->dims()); - common_var_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {common_var_tile}, {common_var_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor common_var_tile_add_epsilon_sqrt; - { - common_var_tile_add_epsilon_sqrt.Resize(x->dims()); - common_var_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {common_var_tile_add_epsilon}, - {common_var_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor x_sub_common_mean; - { - x_sub_common_mean.Resize(x->dims()); - x_sub_common_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, common_mean_tile}, {x_sub_common_mean}, {}); - 
runner.Run(stream); - } - - phi::DenseTensor normalized; - { - normalized.Resize(x->dims()); - normalized.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", - {x_sub_common_mean, common_var_tile_add_epsilon_sqrt}, - {normalized}, - {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor normalized_mul_scale; - { - normalized_mul_scale.Resize(x->dims()); - normalized_mul_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {normalized, scale_tile}, {normalized_mul_scale}, {}); - runner.Run(stream); - } - - phi::DenseTensor bias_tile_1; - { - bias_tile_1.Resize({C}); - bias_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*bias, place, &bias_tile_1); - if (layout == phi::DataLayout::kNCHW) - bias_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - bias_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor bias_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - bias_tile.Resize(x->dims()); - bias_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {bias_tile_1}, {bias_tile}, attr_input); - runner.Run(stream); - } - - // calculate y - { - y->mutable_data(place); - const auto &runner = - NpuOpRunner("Add", {normalized_mul_scale, bias_tile}, {*y}, {}); - runner.Run(stream); - } - - if (!test_mode) { - phi::DenseTensor ones; - { - ones.Resize({C}); - ones.mutable_data(place); - FillNpuTensorWithConstant(&ones, 1); - } - - // cacl mean_out - { - phi::DenseTensor common_mean_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - common_mean_mul_1_sub_momentum.Resize({C}); - common_mean_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner("Muls", - {*common_mean}, - {common_mean_mul_1_sub_momentum}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_mul_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - mean_mul_momentum.Resize({C}); - mean_mul_momentum.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*mean}, {mean_mul_momentum}, attr_input); - runner.Run(stream); - } - - mean_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {common_mean_mul_1_sub_momentum, mean_mul_momentum}, - {*mean_out}, - {}); - runner.Run(stream); - } - - // cacl variance_out - { - phi::DenseTensor momentum_mul_var; - { - framework::NPUAttributeMap attr_input = {{"value", momentum}}; - momentum_mul_var.Resize({C}); - momentum_mul_var.mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {*variance}, {momentum_mul_var}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_mul_1_sub_momentum; - { - framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; - var_ref_mul_1_sub_momentum.Resize({C}); - var_ref_mul_1_sub_momentum.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {*common_var}, 
{var_ref_mul_1_sub_momentum}, attr_input); - runner.Run(stream); - } - - variance_out->mutable_data(place); - - const auto &runner = - NpuOpRunner("Add", - {var_ref_mul_1_sub_momentum, momentum_mul_var}, - {*variance_out}, - {}); - runner.Run(stream); - } - - // cacl saved_variance - { - phi::DenseTensor var_ref_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_add_epsilon.Resize({C}); - var_ref_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {*common_var}, {var_ref_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_add_epsilon_sqrt; - { - var_ref_add_epsilon_sqrt.Resize({C}); - var_ref_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner( - "Sqrt", {var_ref_add_epsilon}, {var_ref_add_epsilon_sqrt}, {}); - runner.Run(stream); - } - - saved_variance->mutable_data(place); - - const auto &runner = NpuOpRunner( - "Div", {ones, var_ref_add_epsilon_sqrt}, {*saved_variance}, {}); - runner.Run(stream); - } - } -} - -template -class SyncBatchNormNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - - PADDLE_ENFORCE_EQ(use_global_stats, - false, - platform::errors::InvalidArgument( - "sync_batch_norm doesn't support " - "to set use_global_stats True. Please use batch_norm " - "in this case.")); - - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Y"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *mean = ctx.Input("Mean"); - const auto *variance = ctx.Input("Variance"); - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), - 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. 
But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, - x_dims.size())); - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - bool test_mode = is_test && (!trainable_stats); - if (test_mode) { // inference - // cacl saved_mean - saved_mean->mutable_data(place); - paddle::framework::TensorCopySync(*mean, place, saved_mean); - - // cacl saved_variance - saved_variance->mutable_data(place); - paddle::framework::TensorCopySync(*variance, place, saved_variance); - - // cacl y - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - mean, - variance, - x, - scale, - bias, - mean, - variance, - NULL, - NULL, - NULL, - NULL, - y); - - } else { // training - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - phi::DenseTensor mom_cpu; - paddle::framework::TensorCopySync( - *mom_tensor, platform::CPUPlace(), &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // cacl saved_mean and var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - phi::DenseTensor x_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_sum.Resize({C}); - x_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*x}, {x_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - - float device_counts = 0.0; - if (comm) { - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(mean_out->dtype())); - - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - - // HcclAllReduce x_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_sum.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - // HcclAllReduce x_square_sum - { - void *sendbuff = reinterpret_cast( - const_cast(x_square_sum.data())); - void *recvbuff = sendbuff; - 
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - } - - // cacl saved_mean - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - saved_mean->mutable_data(place); - const auto &runner = - NpuOpRunner("Muls", {x_sum}, {*saved_mean}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - phi::DenseTensor saved_mean_square; - { - saved_mean_square.Resize({C}); - saved_mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {saved_mean_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tmp; - var_ref_tmp.Resize({C}); - var_ref_tmp.mutable_data(place); - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {x_square_sum}, {var_ref_tmp}, attr_input); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {var_ref_tmp, saved_mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - training_or_inference(ctx, - stream, - place, - layout, - test_mode, - N, - C, - H, - W, - epsilon, - momentum, - saved_mean, - &var_ref, - x, - scale, - bias, - mean, - variance, - mean_out, - variance_out, - saved_mean, - saved_variance, - y); - } - } -}; - -template -class SyncBatchNormNPUGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - float epsilon = ctx.Attr("epsilon"); - const std::string layout_str = ctx.Attr("data_layout"); - const DataLayout layout = phi::StringToDataLayout(layout_str); - - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto *saved_mean = ctx.Input("SavedMean"); - - const phi::DenseTensor *x; - if (ctx.HasInput("Y")) { - PADDLE_ENFORCE_EQ(true, - false, - platform::errors::InvalidArgument( - "sync_batch_norm_grad doesn't support input Y")); - } else { - x = ctx.Input("X"); - } - - int N, C, H, W, D; - phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); - - int x_numel = x->numel(); - auto place = ctx.GetPlace(); - auto stream = - ctx.template device_context() - .stream(); - - std::vector axes; - if (layout == phi::DataLayout::kNCHW) { - axes = {0, 2, 3}; - } else if (layout == phi::DataLayout::kNHWC) { - axes = {0, 1, 2}; - } - - std::vector multiples; - if (layout == phi::DataLayout::kNCHW) - multiples = {N, 1, H, W}; - else if (layout == phi::DataLayout::kNHWC) - multiples = {N, H, W, 1}; - - auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); - HcclDataType dtype = platform::ToHCCLDataType( - framework::TransToProtoVarType(scale->dtype())); - - float device_counts = 0.0; - if (comm) { - phi::DenseTensor device_count_tensor; - { - device_count_tensor.Resize({1}); - device_count_tensor.mutable_data(place); - FillNpuTensorWithConstant(&device_count_tensor, 1); - } - - // HcclAllReduce device_count_tensor - { - void *sendbuff = reinterpret_cast( - const_cast(device_count_tensor.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - 1, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - 
- std::vector device_count_vec(1); - paddle::framework::TensorToVector( - device_count_tensor, ctx.device_context(), &device_count_vec); - device_counts = device_count_vec[0]; - PADDLE_ENFORCE_GE( - device_counts, - 2, - platform::errors::PreconditionNotMet("device_counts should >= 2.")); - } - - // cacl var_ref - phi::DenseTensor var_ref; - var_ref.Resize({C}); - var_ref.mutable_data(place); - { - // cacl var_ref - { - phi::DenseTensor x_square; - { - x_square.Resize(x->dims()); - x_square.mutable_data(place); - const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - x_square_sum.Resize({C}); - x_square_sum.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_square_sum_mean; - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f * C / x_numel}}; - x_square_sum_mean.Resize({C}); - x_square_sum_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Muls", {x_square_sum}, {x_square_sum_mean}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor mean_square; - { - mean_square.Resize({C}); - mean_square.mutable_data(place); - const auto &runner = - NpuOpRunner("Square", {*saved_mean}, {mean_square}, {}); - runner.Run(stream); - } - - // cacl var_ref - { - const auto &runner = NpuOpRunner( - "Sub", {x_square_sum_mean, mean_square}, {var_ref}, {}); - runner.Run(stream); - } - } - } - - phi::DenseTensor saved_mean_tile_1; - { - saved_mean_tile_1.Resize({C}); - saved_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*saved_mean, place, &saved_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - saved_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - saved_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor saved_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - saved_mean_tile.Resize(x->dims()); - saved_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner( - "TileD", {saved_mean_tile_1}, {saved_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor x_sub_saved_mean; - { - x_sub_saved_mean.Resize(x->dims()); - x_sub_saved_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*x, saved_mean_tile}, {x_sub_saved_mean}, {}); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_1; - { - var_ref_tile_1.Resize({C}); - var_ref_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(var_ref, place, &var_ref_tile_1); - if (layout == phi::DataLayout::kNCHW) - var_ref_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - var_ref_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor var_ref_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - var_ref_tile.Resize(x->dims()); - var_ref_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {var_ref_tile_1}, {var_ref_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon; - { - framework::NPUAttributeMap attr_input = {{"value", epsilon}}; - var_ref_tile_add_epsilon.Resize(x->dims()); - var_ref_tile_add_epsilon.mutable_data(place); - const auto &runner = NpuOpRunner( - "Adds", {var_ref_tile}, {var_ref_tile_add_epsilon}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor var_ref_tile_add_epsilon_sqrt; - { 
- var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); - var_ref_tile_add_epsilon_sqrt.mutable_data(place); - const auto &runner = NpuOpRunner("Sqrt", - {var_ref_tile_add_epsilon}, - {var_ref_tile_add_epsilon_sqrt}, - {}); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_for_scale; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean_for_scale.Resize(x->dims()); - dy_mul_x_sub_mean_for_scale.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean_for_scale}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } else { - dy_mul_x_sub_mean.Resize(x->dims()); - dy_mul_x_sub_mean.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {*d_y, x_sub_saved_mean}, {dy_mul_x_sub_mean}, {}); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mul_x_sub_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mul_x_sub_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS( - platform::dynload::HcclAllReduce(sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = NpuOpRunner( - "Muls", {dy_mul_x_sub_mean}, {dy_mul_x_sub_mean}, attr_input); - runner.Run(stream); - } - } - - // cacl d_x - if (d_x) { - phi::DenseTensor dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mean.Resize({C}); - dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - // HcclAllReduce dy_mean - if (comm) { - { - void *sendbuff = reinterpret_cast( - const_cast(dy_mean.data())); - void *recvbuff = sendbuff; - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( - sendbuff, - recvbuff, - C, - dtype, - HCCL_REDUCE_SUM, - comm->comm(), - reinterpret_cast(stream))); - } - - { - framework::NPUAttributeMap attr_input = { - {"value", 1.0f / device_counts}}; - const auto &runner = - NpuOpRunner("Muls", {dy_mean}, {dy_mean}, attr_input); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mean_tile_1; - { - dy_mean_tile_1.Resize({C}); - dy_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(dy_mean, place, &dy_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mean_tile; - { - framework::NPUAttributeMap 
attr_input = {{"multiples", multiples}}; - dy_mean_tile.Resize(x->dims()); - dy_mean_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {dy_mean_tile_1}, {dy_mean_tile}, attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_sub_dy_mean; - { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } else { - dy_sub_dy_mean.Resize(x->dims()); - dy_sub_dy_mean.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); - runner.Run(stream); - } - } - - phi::DenseTensor dy_mul_x_sub_mean_mean; - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - dy_mul_x_sub_mean_mean.Resize({C}); - dy_mul_x_sub_mean_mean.mutable_data(place); - const auto &runner = NpuOpRunner("ReduceMeanD", - {dy_mul_x_sub_mean}, - {dy_mul_x_sub_mean_mean}, - attr_input); - runner.Run(stream); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile_1; - { - dy_mul_x_sub_mean_mean_tile_1.Resize({C}); - dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); - paddle::framework::TensorCopySync( - dy_mul_x_sub_mean_mean, place, &dy_mul_x_sub_mean_mean_tile_1); - if (layout == phi::DataLayout::kNCHW) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor dy_mul_x_sub_mean_mean_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); - dy_mul_x_sub_mean_mean_tile.mutable_data(place); - const auto &runner = NpuOpRunner("TileD", - {dy_mul_x_sub_mean_mean_tile_1}, - {dy_mul_x_sub_mean_mean_tile}, - attr_input); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) - // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile - phi::DenseTensor tmp1; - { - tmp1.Resize(x->dims()); - tmp1.mutable_data(place); - const auto &runner = NpuOpRunner( - "Mul", {x_sub_saved_mean, dy_mul_x_sub_mean_mean_tile}, {tmp1}, {}); - runner.Run(stream); - } - - // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) - // tmp1 / (var + epsilon) - // tmp1 / var_ref_tile_add_epsilon - phi::DenseTensor tmp2; - { - tmp2.Resize(x->dims()); - tmp2.mutable_data(place); - const auto &runner = - NpuOpRunner("Div", {tmp1, var_ref_tile_add_epsilon}, {tmp2}, {}); - runner.Run(stream); - } - - // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / - // (var + epsilon) - // dy_sub_dy_mean - tmp2 - phi::DenseTensor tmp3; - { - tmp3.Resize(x->dims()); - tmp3.mutable_data(place); - const auto &runner = - NpuOpRunner("Sub", {dy_sub_dy_mean, tmp2}, {tmp3}, {}); - runner.Run(stream); - } - - phi::DenseTensor scale_tile_1; - { - scale_tile_1.Resize({C}); - scale_tile_1.mutable_data(place); - paddle::framework::TensorCopySync(*scale, place, &scale_tile_1); - if (layout == phi::DataLayout::kNCHW) - scale_tile_1.Resize({1, C, 1, 1}); - else if (layout == phi::DataLayout::kNHWC) - scale_tile_1.Resize({1, 1, 1, C}); - } - - phi::DenseTensor scale_tile; - { - framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; - scale_tile.Resize(x->dims()); - scale_tile.mutable_data(place); - const auto &runner = - NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); - runner.Run(stream); - } - - // scale 
* (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), - // axis) / (var + epsilon)) - // scale * tmp3 - phi::DenseTensor dx_1; - { - dx_1.Resize(x->dims()); - dx_1.mutable_data(place); - - const auto &runner = NpuOpRunner("Mul", {scale_tile, tmp3}, {dx_1}, {}); - runner.Run(stream); - } - - // dx_1 / var_ref_tile_add_epsilon_sqrt - { - d_x->Resize(x->dims()); - d_x->mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", {dx_1, var_ref_tile_add_epsilon_sqrt}, {*d_x}, {}); - runner.Run(stream); - } - } - - // cacl d_scale - if (d_scale) { - phi::DenseTensor d_scale_2; - { - d_scale_2.Resize(x->dims()); - d_scale_2.mutable_data(place); - const auto &runner = NpuOpRunner( - "Div", - {dy_mul_x_sub_mean_for_scale, var_ref_tile_add_epsilon_sqrt}, - {d_scale_2}, - {}); - runner.Run(stream); - } - - { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_scale->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {d_scale_2}, {*d_scale}, attr_input); - runner.Run(stream); - } - } - - // cacl d_bias - if (d_bias) { - if (framework::TransToProtoVarType(d_y->dtype()) == - framework::proto::VarType::FP16) { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } else { - framework::NPUAttributeMap attr_input = {{"keep_dims", false}, - {"axes", axes}}; - d_bias->mutable_data(place); - const auto &runner = - NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); - runner.Run(stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - sync_batch_norm, - ops::SyncBatchNormNPUKernel); -REGISTER_OP_NPU_KERNEL( - sync_batch_norm_grad, - ops::SyncBatchNormNPUGradKernel); diff --git a/paddle/fluid/operators/take_along_axis_op_npu.cc b/paddle/fluid/operators/take_along_axis_op_npu.cc deleted file mode 100644 index ce10caf1b2e19b..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
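For reference, the chain of primitive NPU ops above assembles the standard batch-norm backward spelled out in the code comments, with the per-channel reductions additionally averaged across devices by each HcclAllReduce followed by a Muls with 1.0/device_counts. In LaTeX, with per-channel means taken over N*H*W:

\[
\frac{\partial L}{\partial x}
  = \frac{\gamma}{\sqrt{\sigma^2 + \epsilon}}
    \left( \frac{\partial L}{\partial y}
         - \operatorname{mean}\!\left(\frac{\partial L}{\partial y}\right)
         - \frac{(x - \mu)\,\operatorname{mean}\!\left(\frac{\partial L}{\partial y} \odot (x - \mu)\right)}{\sigma^2 + \epsilon} \right),
\quad
\frac{\partial L}{\partial \gamma} = \sum \frac{\partial L}{\partial y} \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}},
\quad
\frac{\partial L}{\partial \beta} = \sum \frac{\partial L}{\partial y}.
\]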
*/ - -// TODO(Aganlengzi): delete this macro control and remove REMOVE_ITEM in -// cmake/operators.cmake when Paddle supports -#if (CANN_VERSION_CODE >= 504000) - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class NPUTakeAlongAxisKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner( - "GatherElements", {*input, *index}, {*result}, {{"dim", axis}}); - runner.Run(stream); - } -}; - -template -class NPUTakeAlongAxisGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result_grad = - ctx.Input(framework::GradVarName("Result")); - - auto input_grad = - ctx.Output(framework::GradVarName("Input")); - input_grad->mutable_data(ctx.GetPlace()); - - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner("ScatterAddWithAxis", - {*input_grad, *index, *result_grad}, - {*input_grad}, - {{"axis", axis}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - take_along_axis, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel, - ops::NPUTakeAlongAxisKernel) -REGISTER_OP_NPU_KERNEL( - take_along_axis_grad, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel, - ops::NPUTakeAlongAxisGradKernel) - -#endif diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc deleted file mode 100644 index 2e3ab9dac04613..00000000000000 --- a/paddle/fluid/operators/tile_op_npu.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
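The deleted take_along_axis kernels are thin wrappers over Ascend's GatherElements (forward) and ScatterAddWithAxis (backward). A minimal NumPy sketch of the same semantics, assuming a 2-D input and axis 0 in the backward part for brevity:

import numpy as np

def take_along_axis_ref(x, index, axis):
    # forward: for axis == 0, out[i, j] = x[index[i, j], j]
    return np.take_along_axis(x, index, axis)

def take_along_axis_grad_ref(index, out_grad, x_shape):
    # backward of a gather is a scatter-add: several output positions may
    # read the same input element, so their gradients must accumulate
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    cols = np.arange(index.shape[1])[None, :]
    np.add.at(x_grad, (index, cols), out_grad)  # axis == 0 case only
    return x_grad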
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/tile_op_functor.h" - -namespace paddle { -namespace operators { - -using NPUDeviceContext = platform::NPUDeviceContext; - -template -class TileNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, - 1, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, - 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, - MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, - repeat_times_size)); - rank = std::max(rank, repeat_times_size); - Tile(context); - } - - protected: - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], - 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), - vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), - repeat_times.size())); - auto* out0 = context.Output("Out"); - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - out0->mutable_data(context.GetPlace()); - - std::vector temp(repeat_times.size(), 1); - if (repeat_times == temp) { - framework::TensorCopy(*in0, - context.GetPlace(), - context.template device_context(), - out0); - return; - } - - // const auto& runner = - // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = context.template device_context().stream(); - NpuOpRunner runner; - runner.SetType("Tile") - .AddInput(*in0) - .AddInput(std::move(repeat_times)) - .AddOutput(*out0) - .Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(tile, - ops::TileNPUKernel, - ops::TileNPUKernel, -#ifdef 
PADDLE_WITH_ASCEND_INT64 - ops::TileNPUKernel, -#endif - ops::TileNPUKernel, - ops::TileNPUKernel); diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc deleted file mode 100644 index 478523721458dd..00000000000000 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/operators/top_k_op.h" - -namespace paddle { -namespace operators { - -void gen_assist_seq(phi::DenseTensor* assit_tensor, - int64_t dim, - const framework::ExecutionContext& ctx) { - const int64_t dimx2 = dim; - std::vector assit; - assit.resize(2 * dimx2); - for (int64_t i = 0; i < dimx2; i++) { - // for i in range [0, dim] - assit[i] = static_cast(i); - - // for i in range [dim, dimx2] - int64_t idx = - static_cast(static_cast(i)); - int64_t gap = i - idx; - assit[i + dim] = static_cast(gap); - } - framework::TensorFromVector(assit, ctx.device_context(), assit_tensor); -} - -template -class TopkNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // read input - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - size_t k = static_cast(ctx.Attr("k")); - - output->mutable_data(ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); - - // prepare assit - auto size = input->dims().size(); - // dim is the last dimension of input - auto dim = input->dims()[size - 1]; - phi::DenseTensor assist_seq_tensor; - assist_seq_tensor.Resize({2 * dim}); - assist_seq_tensor.mutable_data(ctx.GetPlace()); - gen_assist_seq(&assist_seq_tensor, dim, ctx); - - framework::NPUAttributeMap attr_input = {{"sorted", "true"}, - {"k", static_cast(k)}, - {"dim", -1}, - {"largest", true}}; - - phi::DenseTensor tmp_indices(phi::DataType::INT32); - tmp_indices.Resize(indices->dims()); - tmp_indices.mutable_data(ctx.GetPlace()); - - // run ascend - const auto& runner = NpuOpRunner("TopKD", - {*input, assist_seq_tensor}, - {*output, tmp_indices}, - attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - - // cast indices from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->dtype())); - const auto& runner_cast_indices = - NpuOpRunner("Cast", - {tmp_indices}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_indices.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -// Ascend Op TopKD only support input float 16 dtype -REGISTER_OP_NPU_KERNEL(top_k, - ops::TopkNPUKernel); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc deleted file mode 100644 index 4e0b0650b9af68..00000000000000 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { -// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel -// may lead to large accuracy error for float32 data -template -class TopkV2NPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* k_tensor = context.Input("K"); - auto* out = context.Output("Out"); - auto* indices = context.Output("Indices"); // type: INT64 - - int32_t k = static_cast(context.Attr("k")); - int axis = static_cast(context.Attr("axis")); - const bool sorted = static_cast(context.Attr("sorted")); - const bool largest = static_cast(context.Attr("largest")); - - if (axis < 0) { - axis += input->dims().size(); - } - - if (k_tensor != nullptr) { - std::vector v_tmp(1); - paddle::framework::TensorToVector( - *k_tensor, - context.template device_context(), - &v_tmp); - k = static_cast(v_tmp[0]); - } - - framework::DDim output_dims = input->dims(); - output_dims[axis] = k; - - out->Resize(output_dims); - indices->Resize(output_dims); - - out->mutable_data(context.GetPlace()); - indices->mutable_data(context.GetPlace()); - - phi::DenseTensor indices_int32(phi::DataType::INT32); - indices_int32.Resize(output_dims); - indices_int32.mutable_data(context.GetPlace()); - - auto npu_stream = - context.template device_context() - .stream(); - - NpuOpRunner npu_op_runner_topkv2; - npu_op_runner_topkv2.SetType("TopKV2") - .AddInput(*input) - .AddInput(std::vector{k}) - .AddOutput(*out) - .AddOutput(indices_int32) - .AddAttr("sorted", sorted) - .AddAttr("dim", axis) - .AddAttr("largest", largest) - .Run(npu_stream); - - // Cast 'indices_int32' to 'indices', from INT32 to INT64 - auto dst_dtype = - ConvertToNpuDtype(framework::TransToProtoVarType(indices->type())); - const auto& npu_op_runner_cast = - NpuOpRunner("Cast", - {indices_int32}, - {*indices}, - {{"dst_type", static_cast(dst_dtype)}}); - npu_op_runner_cast.Run(npu_stream); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(top_k_v2, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel, - ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc deleted file mode 100644 index 5af2edd60ce8f2..00000000000000 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
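Both deleted top-k kernels compute their indices in INT32 on device and cast to INT64 afterwards, since the framework-side Indices tensor is 64-bit. A NumPy reference for the sorted, largest-first result that the TopKV2 call above requests (tie order may differ from the hardware op):

import numpy as np

def topk_ref(x, k, axis=-1, largest=True):
    # sort ascending, flip for largest-first, then keep the first k
    order = np.argsort(x, axis=axis)
    if largest:
        order = np.flip(order, axis=axis)
    indices = np.take(order, np.arange(k), axis=axis)
    values = np.take_along_axis(x, indices, axis=axis)
    return values, indices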
-You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/expand_op.h" - -namespace paddle { -namespace operators { - -template -class TransposeNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - std::vector axis = ctx.Attr>("axis"); - out->mutable_data(ctx.device_context().GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*x) - .AddInput(std::move(axis)) - .AddOutput(*out); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class TransposeGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - std::vector axis = ctx.Attr>("axis"); - std::vector reversed_axis(axis); - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - x_grad->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("Transpose") - .AddInput(*out_grad) - .AddInput(std::move(reversed_axis)) - .AddOutput(*x_grad); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - transpose2, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, - ops::TransposeNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeNPUKernel, -#endif - ops::TransposeNPUKernel, - ops::TransposeNPUKernel); - -REGISTER_OP_NPU_KERNEL(transpose2_grad, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel, -#ifdef PADDLE_WITH_ASCEND_INT64 - ops::TransposeGradNPUKernel, -#endif - ops::TransposeGradNPUKernel, - ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc deleted file mode 100644 index 0ef5af349decfa..00000000000000 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
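The grad kernel above uses the fact that the backward of a transpose is a transpose by the inverse permutation, which is what the reversed_axis[axis[i]] = i loop builds. A quick NumPy check:

import numpy as np

axis = [0, 2, 3, 1]
reversed_axis = [0] * len(axis)
for i, a in enumerate(axis):
    reversed_axis[a] = i                     # yields [0, 3, 1, 2]

x = np.random.rand(2, 3, 4, 5)
assert np.allclose(x.transpose(axis).transpose(reversed_axis), x)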
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP_ITSELF(transpose2); -USE_OP_DEVICE_KERNEL(transpose2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto out = scope->Var("Out"); - auto xshape = scope->Var("XShape"); - auto* x_t = x->GetMutable(); - auto* out_t = out->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto place = ctx.GetPlace(); - - int dim0 = 2; - int dim1 = 3; - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, x_t); - ctx.Wait(); - x_t->Resize({dim0, dim1}); - out_t->Resize({dim0, dim1}); - ctx.Wait(); - out_t->mutable_data(place); - ctx.Wait(); - xshape_t->Resize({dim0, dim1}); - xshape_t->mutable_data(place); - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - auto op = f::OpRegistry::CreateOp("transpose2", - {{"X", {"X"}}}, - {{"Out", {"Out"}}, {"XShape", {"XShape"}}}, - attrs); - ctx.Wait(); - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(out_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto xshape = scope->Var("XShape"); - auto x_grad = scope->Var("X@GRAD"); - auto out_grad = scope->Var("Out@GRAD"); - - auto* x_grad_t = x_grad->GetMutable(); - auto* xshape_t = xshape->GetMutable(); - auto* out_grad_t = out_grad->GetMutable(); - - int dim0 = 2; - int dim1 = 3; - auto place = ctx.GetPlace(); - - paddle::framework::TensorFromVector( - std::vector({0, 1, 2, 3, 4, 5}), ctx, out_grad_t); - ctx.Wait(); - - x_grad_t->Resize({dim0, dim1}); - xshape_t->Resize( - {0, - dim0, - dim1}); // NOTE(zhiqiu): 0 is needed, see its infershape function - out_grad_t->Resize({dim0, dim1}); - - f::AttributeMap attrs = {{"axis", std::vector({1, 0})}, - {"data_format", std::string("AnyLayout")}}; - - auto op = f::OpRegistry::CreateOp( - "transpose2_grad", - {{"Out@GRAD", {"Out@GRAD"}}, {"XShape", {"XShape"}}}, - {{"X@GRAD", {"X@GRAD"}}}, - attrs); - - op->Run(*scope, place); - ctx.Wait(); - std::vector out_v; - paddle::framework::TensorToVector(*x_grad_t, ctx, &out_v); - ctx.Wait(); - - EXPECT_EQ(x_grad_t->numel(), dim0 * dim1); - EXPECT_EQ(out_v[0], 0); - EXPECT_EQ(out_v[1], 3); - EXPECT_EQ(out_v[2], 1); - EXPECT_EQ(out_v[3], 4); - EXPECT_EQ(out_v[4], 2); - EXPECT_EQ(out_v[5], 5); -} - -TEST(transpose2, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} - -TEST(transpose2_grad, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - CompareGrad(&scope, *ctx); -} diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc deleted file mode 100644 index b47797a5bb131d..00000000000000 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ /dev/null @@ -1,90 
+0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class TrilTriuNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - int diagonal = ctx.Attr("diagonal"); - bool lower = ctx.Attr("lower"); - - out->mutable_data(ctx.GetPlace()); - - std::string op_type = lower ? "Tril" : "Triu"; - - framework::NPUAttributeMap attr_input = {{"diagonal", diagonal}}; - - const auto& dev_ctx = - ctx.template device_context(); - - auto op_func_tril = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Tril", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - auto op_func_triu = [](const std::vector& inputs, - const std::vector& outputs, - const NPUAttributeMap& attrs, - const platform::NPUDeviceContext& dev_ctx) { - const auto& runner = NpuOpRunner("Triu", inputs, outputs, attrs); - runner.Run(dev_ctx.stream()); - }; - - if (framework::TransToProtoVarType(x->dtype()) == - framework::proto::VarType::BOOL) { - if (lower) { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_tril, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } else { - NpuOpRunner::TypeAdapter({*x}, - {*out}, - attr_input, - dev_ctx, - op_func_triu, - {framework::proto::VarType::UINT8}, - {framework::proto::VarType::UINT8}); - } - } else { - const auto& runner = NpuOpRunner(op_type, {*x}, {*out}, attr_input); - runner.Run(dev_ctx.stream()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - tril_triu, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel, - ops::TrilTriuNPUKernel); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc deleted file mode 100644 index da9fa93130bd19..00000000000000 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
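The deleted tril_triu kernel dispatches to Ascend's Tril or Triu based on the lower attribute, routing bool inputs through a UINT8 TypeAdapter, apparently because the Ascend ops do not take bool directly. The diagonal attribute follows the same offset convention as NumPy's k:

import numpy as np

x = np.arange(1, 13).reshape(3, 4)
np.tril(x, k=0)   # keep the main diagonal and everything below it
np.triu(x, k=1)   # keep everything strictly above the main diagonal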
*/ - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/truncated_gaussian_random_op.h" - -namespace paddle { -namespace operators { - -template -class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // TODO(zhiqiu): support dynamic shape and call ParameterizedTruncatedNormal - std::vector shape = ctx.Attr>("shape"); - phi::DenseTensor shape_tensor(phi::DataType::INT32); - shape_tensor.mutable_data({static_cast(shape.size())}, - ctx.GetPlace()); - paddle::framework::TensorFromVector( - shape, ctx.device_context(), &shape_tensor); - float mean = ctx.Attr("mean"); - phi::DenseTensor mean_tensor(phi::DataType::FLOAT32); - mean_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&mean_tensor, mean); - - float std = ctx.Attr("std"); - phi::DenseTensor std_tensor(phi::DataType::FLOAT32); - std_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&std_tensor, std); - - int32_t seed_var = ctx.Attr("seed"); - - phi::DenseTensor min_tensor(phi::DataType::FLOAT32); - min_tensor.mutable_data({1}, ctx.GetPlace()); - float min_value = mean - std * 2.0; - FillNpuTensorWithConstant(&min_tensor, min_value); - - phi::DenseTensor max_tensor(phi::DataType::FLOAT32); - max_tensor.mutable_data({1}, ctx.GetPlace()); - float max_value = mean + std * 2.0; - FillNpuTensorWithConstant(&max_tensor, max_value); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); - const auto& runner = NpuOpRunner( - "ParameterizedTruncatedNormal", - {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, - {*out}, - {{"seed", seed_var}}); - runner.Run(stream); - } -}; - -// NOTE(zhiqiu): actually, this is cpu version kernel, and we need to make the -// above -// npu version work in the future. -template -class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - tensor->mutable_data(context.GetPlace()); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); - TruncatedNormal truncated_normal(mean, std); - int64_t size = tensor->numel(); - - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - cpu_data[i] = truncated_normal(dist(*engine)); - } - framework::TensorCopy( - cpu_tensor, - context.GetPlace(), - context.template device_context(), - tensor); - context.template device_context() - .Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL(truncated_gaussian_random, - ops::NPUTruncatedGaussianRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc deleted file mode 100644 index 5958a7751b8beb..00000000000000 --- a/paddle/fluid/operators/uniform_random_op_npu.cc +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
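Both variants above sample a normal truncated to [mean - 2*std, mean + 2*std]; the registered CPU-fallback kernel draws uniforms and pushes them through the TruncatedNormal functor, effectively an inverse-CDF transform. A SciPy reference for the same distribution (truncnorm takes its bounds in units of the standard deviation):

from scipy.stats import truncnorm

mean, std = 0.0, 1.0
samples = truncnorm.rvs(-2.0, 2.0, loc=mean, scale=std, size=1024)
assert samples.min() >= mean - 2 * std
assert samples.max() <= mean + 2 * std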
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/uniform_random_op.h" -#include "paddle/phi/core/generator.h" - -namespace paddle { -namespace operators { - -template -class NPUUniformRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - phi::DenseTensor *tensor = nullptr; - auto out_var = ctx.OutputVar("Out"); - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - if (out_var->IsType()) { - auto *selected_rows = out_var->GetMutable(); - tensor = selected_rows->mutable_value(); - auto shape = ctx.Attr>("shape"); - if (!new_shape.empty()) shape = new_shape; - tensor->Resize(phi::make_ddim(shape)); - selected_rows->mutable_rows()->reserve(shape[0]); - } else if (out_var->IsType()) { - tensor = out_var->GetMutable(); - if (!new_shape.empty()) tensor->Resize(phi::make_ddim(new_shape)); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) in uniform_random_op must be " - "phi::DenseTensor, " - "SelectedRows. 
But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - - phi::DenseTensor cpu_tensor(tensor->dtype()); - cpu_tensor.Resize(tensor->dims()); - T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); - - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = phi::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data_cpu[i] = dist(*engine); - } - - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - auto diag_val = static_cast(ctx.Attr("diag_val")); - if (diag_num > 0) { - PADDLE_ENFORCE_GT( - size, - (diag_num - 1) * (diag_step + 1), - platform::errors::InvalidArgument( - "ShapeInvalid: the diagonal's elements is equal (num-1) " - "* (step-1) with num %d, step %d," - "It should be smaller than %d, but received %d", - diag_num, - diag_step, - (diag_num - 1) * (diag_step + 1), - size)); - for (int64_t i = 0; i < diag_num; ++i) { - int64_t pos = i * diag_step + i; - data_cpu[pos] = diag_val; - } - } - - // copy to NPU - framework::TensorCopy( - cpu_tensor, - ctx.GetPlace(), - ctx.template device_context(), - tensor); - ctx.template device_context().Wait(); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_NPU_KERNEL(uniform_random, - paddle::operators::NPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unsqueeze_op_npu.cc b/paddle/fluid/operators/unsqueeze_op_npu.cc deleted file mode 100644 index b2b09faaa9d445..00000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu.cc +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc deleted file mode 100644 index bf66941f902788..00000000000000 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
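The diag_num/diag_step/diag_val attributes above overwrite evenly spaced positions of the flattened output after sampling, at pos = i * diag_step + i, which is why the enforce requires size > (diag_num - 1) * (diag_step + 1). A small sketch with made-up attribute values:

import numpy as np

rng = np.random.default_rng(0)
low, high, size = -1.0, 1.0, 16
diag_num, diag_step, diag_val = 4, 3, 1.0

assert size > (diag_num - 1) * (diag_step + 1)
data = rng.uniform(low, high, size)
for i in range(diag_num):
    data[i * diag_step + i] = diag_val       # positions 0, 4, 8, 12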
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; - -USE_OP(unsqueeze); -USE_OP_DEVICE_KERNEL(unsqueeze, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto x = scope->Var("X"); - auto tensor_x = x->GetMutable(); - - int dim0 = 5; - int dim1 = 10; - - std::vector init; - for (int64_t i = 0; i < dim0 * dim1; ++i) { - init.push_back(static_cast(0.1)); - } - - paddle::framework::TensorFromVector(init, ctx, tensor_x); - tensor_x->Resize({dim0, dim1}); - - ctx.Wait(); - - // run - auto place = ctx.GetPlace(); - auto out = scope->Var("Out"); - auto tensor_out = out->GetMutable(); - - std::vector axis; - axis.push_back(1); - f::AttributeMap attrs = {{"axes", axis}}; - - auto op = f::OpRegistry::CreateOp( - "unsqueeze", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); - - op->Run(*scope, place); - ctx.Wait(); - - EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(3)); - EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(5)); - EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(1)); - EXPECT_EQ((uint32_t)tensor_out->dims()[2], uint32_t(10)); - - std::vector out_vec; - paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], static_cast(0.1)); - } - - ctx.Wait(); -} - -TEST(unsqueeze, NPU_fp32) { - f::Scope scope; - auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); - Compare(&scope, *ctx); -} diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc deleted file mode 100644 index 4c1aa39168b69c..00000000000000 --- a/paddle/fluid/operators/unstack_op_npu.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class UnStackNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *dy = ctx.Input("X"); - auto dx = ctx.MultiOutput("Y"); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += dy->dims().size(); - int num = dy->dims()[axis]; - - auto stream = - ctx.template device_context() - .stream(); - - std::vector dx_list; - for (int i = 0; i < num; i++) { - dx[i]->mutable_data(ctx.GetPlace()); - dx_list.push_back(*dx[i]); - } - - const auto &runner = - NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); - runner.Run(stream); - } -}; - -template -class UnStackGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto x = ctx.MultiInput(framework::GradVarName("Y")); - auto *y = ctx.Output(framework::GradVarName("X")); - int axis = ctx.Attr("axis"); - if (axis < 0) axis += (x[0]->dims().size() + 1); - int num = static_cast(x.size()); - - auto stream = - ctx.template device_context() - .stream(); - - std::vector x_list; - for (int i = 0; i < num; i++) { - x_list.push_back(*x[i]); - } - y->mutable_data(ctx.GetPlace()); - - const auto &runner = - NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - unstack, - ops::UnStackNPUKernel, - ops::UnStackNPUKernel); - -REGISTER_OP_NPU_KERNEL( - unstack_grad, - ops::UnStackGradNPUKernel, - ops::UnStackGradNPUKernel); diff --git a/paddle/fluid/operators/where_index_op_npu.cc b/paddle/fluid/operators/where_index_op_npu.cc deleted file mode 100644 index b5c61e6b988aac..00000000000000 --- a/paddle/fluid/operators/where_index_op_npu.cc +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
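The unstack pair above is self-inverse: Unpack splits the input into num slices along axis, and the grad kernel Packs the incoming slice gradients back along the same axis. In NumPy terms:

import numpy as np

x = np.arange(24).reshape(2, 3, 4)
axis = 1
parts = [np.squeeze(p, axis) for p in np.split(x, x.shape[axis], axis)]  # Unpack
assert np.array_equal(np.stack(parts, axis), x)                          # Pack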
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class NPUWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - auto dims = condition->dims(); - const int rank = dims.size(); - - auto place = context.GetPlace(); - const aclrtStream& stream = dev_ctx.stream(); - - // Run Cast and ReduceSum to get 0 dim of Out - phi::DenseTensor booled_cond; - if (framework::TransToProtoVarType(condition->dtype()) != - framework::proto::VarType::BOOL) { - auto bool_type = ConvertToNpuDtype(framework::proto::VarType::BOOL); - booled_cond.mutable_data(dims, place); - const auto& booled_runner = - NpuOpRunner("Cast", - {*condition}, - {booled_cond}, - {{"dst_type", static_cast(bool_type)}}); - booled_runner.Run(stream); - } else { - booled_cond.ShareDataWith(*condition); - } - phi::DenseTensor casted_cond; - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT64); - casted_cond.mutable_data(dims, place); - const auto& cast_runner = - NpuOpRunner("Cast", - {booled_cond}, - {casted_cond}, - {{"dst_type", static_cast(dst_dtype)}}); - cast_runner.Run(stream); - - phi::DenseTensor sumed_true_num; - sumed_true_num.mutable_data({1}, place); - phi::DenseTensor cond_axes; - cond_axes.mutable_data({dims.size()}, place); - std::vector axes_vec; - for (int i = 0; i < dims.size(); ++i) { - axes_vec.push_back(i); - } - framework::TensorFromVector(axes_vec, dev_ctx, &cond_axes); - const auto& sum_runner = NpuOpRunner("ReduceSum", - {casted_cond, cond_axes}, - {sumed_true_num}, - {{"keep_dims", false}}); - sum_runner.Run(stream); - - phi::DenseTensor local_true_num; - paddle::framework::TensorCopySync( - sumed_true_num, platform::CPUPlace(), &local_true_num); - auto true_num = *local_true_num.data(); - - out->Resize(phi::make_ddim({true_num, rank})); - out->mutable_data(place); - - if (true_num == 0) { - return; - } - - out->set_layout(DataLayout::kAnyLayout); - NpuOpRunner runner{"Where", {*condition}, {*out}}; - runner.Run(stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL(where_index, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel, - ops::NPUWhereIndexKernel); diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc deleted file mode 100644 index e1af771f947bb5..00000000000000 --- a/paddle/fluid/operators/where_op_npu.cc +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
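The where_index kernel above has to know the number of true elements before it can size Out, so it casts the condition to int64, reduces it on device, and syncs the scalar count back to the host; the result then has shape [true_num, rank], matching NumPy's argwhere:

import numpy as np

cond = np.array([[True, False, True],
                 [False, True, False]])
coords = np.argwhere(cond)   # shape (true_num, rank) == (3, 2)
# coords == [[0, 0], [0, 2], [1, 1]]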
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class WhereNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - const auto& runner = - NpuOpRunner("Select", {*condition, *X, *Y}, {*out}, {}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); - } -}; - -template -class WhereGradNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* condition = ctx.Input("Condition"); - auto* dout_t = ctx.Input(framework::GradVarName("Out")); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - auto* dy_t = ctx.Output(framework::GradVarName("Y")); - - if (dx_t != nullptr) { - dx_t->mutable_data(ctx.GetPlace()); - } - if (dy_t != nullptr) { - dy_t->mutable_data(ctx.GetPlace()); - } - - auto stream = - ctx.template device_context() - .stream(); - - phi::DenseTensor tensor_zeros(dout_t->dtype()); - tensor_zeros.mutable_data(dout_t->dims(), ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ZerosLike", {*dout_t}, {tensor_zeros}, {}); - runner.Run(stream); - - if (dx_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, *dout_t, tensor_zeros}, {*dx_t}, {}); - runner.Run(stream); - } - if (dy_t != nullptr) { - const auto& runner = NpuOpRunner( - "Select", {*condition, tensor_zeros, *dout_t}, {*dy_t}, {}); - runner.Run(stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_NPU_KERNEL( - where, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel, - ops::WhereNPUKernel); - -REGISTER_OP_NPU_KERNEL( - where_grad, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel, - ops::WhereGradNPUKernel); From 1ad943dd348c54d0abd4357b743a60caf01d140d Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Mon, 10 Apr 2023 18:59:16 +0800 Subject: [PATCH 029/156] fix cuda compule error (#52654) --- cmake/cuda.cmake | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 710931d937e427..9c1d71914bc21c 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -58,7 +58,8 @@ function(detect_installed_gpus out_variable) "}\n") execute_process( - COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${cufile}" + COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CMAKE_C_COMPILER}" "--run" + "${cufile}" WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out From bc9956ccfe19d20512bbfd6bcb03017696454e7f Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Mon, 10 Apr 2023 19:36:00 +0800 Subject: [PATCH 030/156] [CustomOP unittest] Polish unit test, phi->custom (#52670) * [CustomOP unittest] Polish unit test, phi->custom * Change phi->custom in custom_linear_op.cc --- test/custom_op/custom_linear_op.cc | 23 +-- test/custom_op/test_custom_inplace.py | 238 +++++++++++++------------- test/custom_op/test_custom_linear.py | 36 ++-- test/custom_op/test_multi_out_jit.py | 32 ++-- 4 files changed, 171 insertions(+), 158 deletions(-) diff --git a/test/custom_op/custom_linear_op.cc b/test/custom_op/custom_linear_op.cc index ebfaaecd490934..fb7aac40b4f410 100644 --- 
a/test/custom_op/custom_linear_op.cc +++ b/test/custom_op/custom_linear_op.cc @@ -17,16 +17,17 @@ limitations under the License. */ #include "paddle/extension.h" // The linear implemented here must be passed in bias -std::vector PhiLinearForward(const paddle::Tensor& x, - const paddle::Tensor& weight, - const paddle::Tensor& bias) { +std::vector CustomLinearForward(const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias) { return {paddle::add(paddle::matmul(x, weight), bias)}; } -std::vector PhiLinearBackward(const paddle::Tensor& x, - const paddle::Tensor& weight, - const paddle::Tensor& bias, - const paddle::Tensor& out_grad) { +std::vector CustomLinearBackward( + const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias, + const paddle::Tensor& out_grad) { auto x_grad = paddle::matmul(out_grad, weight, false, true); auto weight_grad = paddle::matmul(x, out_grad, true, false); auto bias_grad = paddle::experimental::sum(out_grad, {0}); @@ -96,14 +97,14 @@ std::vector LinearInferDtype( return {x_dtype}; } -PD_BUILD_OP(phi_linear) +PD_BUILD_OP(custom_linear) .Inputs({"X", "Weight", "Bias"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(PhiLinearForward)) + .SetKernelFn(PD_KERNEL(CustomLinearForward)) .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype)); -PD_BUILD_GRAD_OP(phi_linear) +PD_BUILD_GRAD_OP(custom_linear) .Inputs({"X", "Weight", "Bias", paddle::Grad("Out")}) .Outputs({paddle::Grad("X"), paddle::Grad("Weight"), paddle::Grad("Bias")}) - .SetKernelFn(PD_KERNEL(PhiLinearBackward)); + .SetKernelFn(PD_KERNEL(CustomLinearBackward)); diff --git a/test/custom_op/test_custom_inplace.py b/test/custom_op/test_custom_inplace.py index bdfe018c40f672..2c0a5d4c513c18 100644 --- a/test/custom_op/test_custom_inplace.py +++ b/test/custom_op/test_custom_inplace.py @@ -41,11 +41,11 @@ ) -def inplace_dynamic_add(phi_func, device, dtype, np_x, np_y): +def inplace_dynamic_add(custom_func, device, dtype, np_x, np_y): paddle.set_device(device) x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=True) y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False) - if phi_func: + if custom_func: out = custom_inplace.custom_add(x, y) else: out = x.add_(y) @@ -88,14 +88,14 @@ def inplace_static_add(func, device, dtype, np_x, np_y): return x_v, out_v, x_grad_v, y_grad_v, out_grad_v -def inplace_dynamic_add_vector(phi_func, device, dtype, np_inputs, np_y): +def inplace_dynamic_add_vector(custom_func, device, dtype, np_inputs, np_y): paddle.set_device(device) inputs = [ paddle.to_tensor(np_input, dtype=dtype, stop_gradient=True) for np_input in np_inputs ] y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False) - if phi_func: + if custom_func: out = custom_inplace.custom_add_vec(inputs, y) else: out = [x.add_(y) for x in inputs] @@ -111,7 +111,7 @@ def inplace_dynamic_add_vector(phi_func, device, dtype, np_inputs, np_y): ) -def inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y): +def inplace_static_add_vector(custom_func, device, dtype, np_inputs, np_y): paddle.enable_static() paddle.set_device(device) with static.scope_guard(static.Scope()): @@ -126,7 +126,7 @@ def inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y): x1.stop_gradient = False x2.stop_gradient = False y.stop_gradient = False - if phi_func: + if custom_func: out = custom_inplace.custom_add_vec([x1, x2], y) else: out = [paddle.add(x1, y), paddle.add(x2, y)] @@ -170,13 +170,13 @@ def 
inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y): ) -def inplace_dynamic_relu_net(phi_func, device, dtype, np_x, np_y, np_z): +def inplace_dynamic_relu_net(custom_func, device, dtype, np_x, np_y, np_z): paddle.set_device(device) x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False) z = paddle.to_tensor(np_z, dtype=dtype, stop_gradient=False) out_xy = x + y - if phi_func: + if custom_func: out_xy = custom_inplace.custom_relu_inplace(out_xy) out_xyz = out_xy + z out = custom_inplace.custom_relu_inplace(out_xyz) @@ -229,13 +229,13 @@ def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z): return x_v, y_v, out_v, x_grad_v, y_grad_v -def dynamic_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b): +def dynamic_multi_inplace(custom_func, device, dtype, np_x, np_y, np_a, np_b): paddle.set_device(device) x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=True) y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False) a = paddle.to_tensor(np_a, dtype=dtype, stop_gradient=True) b = paddle.to_tensor(np_b, dtype=dtype, stop_gradient=False) - if phi_func: + if custom_func: out_xy, out_ab = custom_inplace.custom_multi_inplace(x, y, a, b) else: out_xy = x.add_(y) @@ -257,7 +257,7 @@ def dynamic_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b): ) -def static_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b): +def static_multi_inplace(custom_func, device, dtype, np_x, np_y, np_a, np_b): paddle.enable_static() paddle.set_device(device) with static.scope_guard(static.Scope()): @@ -270,7 +270,7 @@ def static_multi_inplace(phi_func, device, dtype, np_x, np_y, np_a, np_b): y.stop_gradient = False a.stop_gradient = False b.stop_gradient = False - if phi_func: + if custom_func: out_xy, out_ab = custom_inplace.custom_multi_inplace(x, y, a, b) else: out_xy = paddle.add(x, y) @@ -379,11 +379,11 @@ def test_static_add(self): self.np_y, ) ( - phi_x, - phi_out, - phi_x_grad, - phi_y_grad, - phi_out_grad, + custom_x, + custom_out, + custom_x_grad, + custom_y_grad, + custom_out_grad, ) = inplace_static_add( custom_inplace.custom_add, device, @@ -391,15 +391,15 @@ def test_static_add(self): self.np_x, self.np_y, ) - self.check_output(phi_x, phi_out, "inplace_phi_x") + self.check_output(custom_x, custom_out, "inplace_custom_x") self.check_output( - phi_x_grad, phi_out_grad, "inplace_phi_x_grad" + custom_x_grad, custom_out_grad, "inplace_custom_x_grad" ) - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") - self.check_output(phi_out_grad, pd_out_grad, "out_grad") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add(self): for device in self.devices: @@ -418,11 +418,11 @@ def test_dynamic_add(self): self.np_y, ) ( - phi_x, - phi_y, - phi_out, - phi_x_grad, - phi_y_grad, + custom_x, + custom_y, + custom_out, + custom_x_grad, + custom_y_grad, ) = inplace_dynamic_add( True, device, @@ -431,14 +431,14 @@ def test_dynamic_add(self): self.np_y, ) - self.check_output(phi_x, phi_out, "inplace_phi_x") + self.check_output(custom_x, custom_out, "inplace_custom_x") self.check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(phi_x, pd_x, "x") - self.check_output(phi_y, pd_y, "y") - self.check_output(phi_out, 
pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_x, pd_x, "x") + self.check_output(custom_y, pd_y, "y") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_add_vector(self): for device in self.devices: @@ -456,10 +456,10 @@ def test_static_add_vector(self): self.np_y, ) ( - phi_out, - phi_x_grad, - phi_y_grad, - phi_out_grad, + custom_out, + custom_x_grad, + custom_y_grad, + custom_out_grad, ) = inplace_static_add_vector( False, device, @@ -468,10 +468,10 @@ def test_static_add_vector(self): self.np_y, ) - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") - self.check_output(phi_out_grad, pd_out_grad, "out_grad") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add_vector(self): for device in self.devices: @@ -490,11 +490,11 @@ def test_dynamic_add_vector(self): self.np_y, ) ( - phi_x, - phi_y, - phi_out, - phi_x_grad, - phi_y_grad, + custom_x, + custom_y, + custom_out, + custom_x_grad, + custom_y_grad, ) = inplace_dynamic_add_vector( False, device, @@ -503,14 +503,14 @@ def test_dynamic_add_vector(self): self.np_y, ) - self.check_output(phi_x, phi_out, "inplace_phi_x") + self.check_output(custom_x, custom_out, "inplace_custom_x") self.check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(phi_x, pd_x, "x") - self.check_output(phi_y, pd_y, "y") - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_x, pd_x, "x") + self.check_output(custom_y, pd_y, "y") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_relu_net(self): for device in self.devices: @@ -530,11 +530,11 @@ def test_static_relu_net(self): self.np_z, ) ( - phi_x, - phi_y, - phi_out, - phi_x_grad, - phi_y_grad, + custom_x, + custom_y, + custom_out, + custom_x_grad, + custom_y_grad, ) = inplace_static_relu_net( custom_inplace.custom_relu_inplace, device, @@ -543,11 +543,11 @@ def test_static_relu_net(self): self.np_y, self.np_z, ) - self.check_output_allclose(phi_x, pd_x, "x") - self.check_output_allclose(phi_y, pd_y, "y") - self.check_output_allclose(phi_out, pd_out, "out") - self.check_output_allclose(phi_x_grad, pd_x_grad, "x_grad") - self.check_output_allclose(phi_y_grad, pd_y_grad, "y_grad") + self.check_output_allclose(custom_x, pd_x, "x") + self.check_output_allclose(custom_y, pd_y, "y") + self.check_output_allclose(custom_out, pd_out, "out") + self.check_output_allclose(custom_x_grad, pd_x_grad, "x_grad") + self.check_output_allclose(custom_y_grad, pd_y_grad, "y_grad") def test_dynamic_relu_net(self): for device in self.devices: @@ -567,11 +567,11 @@ def test_dynamic_relu_net(self): self.np_z, ) ( - phi_x, - phi_y, - phi_out, - phi_x_grad, - phi_y_grad, + custom_x, + custom_y, + custom_out, + custom_x_grad, + custom_y_grad, ) = inplace_dynamic_relu_net( True, device, @@ -581,11 +581,11 @@ def test_dynamic_relu_net(self): self.np_z, ) - self.check_output(phi_x, pd_x, "x") - 
self.check_output(phi_y, pd_y, "y") - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_x, pd_x, "x") + self.check_output(custom_y, pd_y, "y") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_multi_inplace(self): for device in self.devices: @@ -611,16 +611,16 @@ def test_static_multi_inplace(self): self.np_b, ) ( - phi_x, - phi_out_xy, - phi_x_grad, - phi_y_grad, - phi_out_xy_grad, - phi_a, - phi_out_ab, - phi_a_grad, - phi_b_grad, - phi_out_ab_grad, + custom_x, + custom_out_xy, + custom_x_grad, + custom_y_grad, + custom_out_xy_grad, + custom_a, + custom_out_ab, + custom_a_grad, + custom_b_grad, + custom_out_ab_grad, ) = static_multi_inplace( True, device, @@ -630,23 +630,27 @@ def test_static_multi_inplace(self): self.np_a, self.np_b, ) - self.check_output(phi_x, pd_out_xy, "inplace_phi_x") + self.check_output(custom_x, pd_out_xy, "inplace_custom_x") self.check_output( - phi_x_grad, phi_out_xy_grad, "inplace_phi_x_grad" + custom_x_grad, custom_out_xy_grad, "inplace_custom_x_grad" ) - self.check_output(phi_a, pd_out_ab, "inplace_phi_a") + self.check_output(custom_a, pd_out_ab, "inplace_custom_a") self.check_output( - phi_a_grad, phi_out_ab_grad, "inplace_phi_a_grad" + custom_a_grad, custom_out_ab_grad, "inplace_custom_a_grad" ) - self.check_output(phi_out_xy, pd_out_xy, "outxy") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") - self.check_output(phi_out_xy_grad, pd_out_xy_grad, "outxy_grad") - self.check_output(phi_out_ab, pd_out_ab, "outab") - self.check_output(phi_a_grad, pd_a_grad, "a_grad") - self.check_output(phi_b_grad, pd_b_grad, "b_grad") - self.check_output(phi_out_ab_grad, pd_out_ab_grad, "outab_grad") + self.check_output(custom_out_xy, pd_out_xy, "outxy") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") + self.check_output( + custom_out_xy_grad, pd_out_xy_grad, "outxy_grad" + ) + self.check_output(custom_out_ab, pd_out_ab, "outab") + self.check_output(custom_a_grad, pd_a_grad, "a_grad") + self.check_output(custom_b_grad, pd_b_grad, "b_grad") + self.check_output( + custom_out_ab_grad, pd_out_ab_grad, "outab_grad" + ) def test_dynamic_multi_inplace(self): for device in self.devices: @@ -672,16 +676,16 @@ def test_dynamic_multi_inplace(self): self.np_b, ) ( - phi_x, - phi_y, - phi_out_xy, - phi_x_grad, - phi_y_grad, - phi_a, - phi_b, - phi_out_ab, - phi_a_grad, - phi_b_grad, + custom_x, + custom_y, + custom_out_xy, + custom_x_grad, + custom_y_grad, + custom_a, + custom_b, + custom_out_ab, + custom_a_grad, + custom_b_grad, ) = dynamic_multi_inplace( True, device, @@ -692,21 +696,21 @@ def test_dynamic_multi_inplace(self): self.np_b, ) - self.check_output(phi_x, phi_out_xy, "inplace_phi_x") + self.check_output(custom_x, custom_out_xy, "inplace_custom_x") self.check_output(pd_x, pd_out_xy, "inplace_pd_x") - self.check_output(phi_a, phi_out_ab, "inplace_phi_a") + self.check_output(custom_a, custom_out_ab, "inplace_custom_a") self.check_output(pd_a, pd_out_ab, "inplace_pd_a") - self.check_output(phi_x, pd_x, "x") - self.check_output(phi_y, pd_y, "y") - self.check_output(phi_out_xy, pd_out_xy, "outxy") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") - 
self.check_output(phi_a, pd_a, "a") - self.check_output(phi_b, pd_b, "b") - self.check_output(phi_out_ab, pd_out_ab, "outab") - self.check_output(phi_a_grad, pd_a_grad, "a_grad") - self.check_output(phi_b_grad, pd_b_grad, "b_grad") + self.check_output(custom_x, pd_x, "x") + self.check_output(custom_y, pd_y, "y") + self.check_output(custom_out_xy, pd_out_xy, "outxy") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_a, pd_a, "a") + self.check_output(custom_b, pd_b, "b") + self.check_output(custom_out_ab, pd_out_ab, "outab") + self.check_output(custom_a_grad, pd_a_grad, "a_grad") + self.check_output(custom_b_grad, pd_b_grad, "b_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_linear.py b/test/custom_op/test_custom_linear.py index 5d2a55456d7d23..5cd4b5e14f7dd5 100644 --- a/test/custom_op/test_custom_linear.py +++ b/test/custom_op/test_custom_linear.py @@ -112,12 +112,12 @@ def test_static(self): for device in self.devices: for dtype in self.dtypes: ( - phi_out, - phi_x_grad, - phi_weight_grad, - phi_bias_grad, + custom_out, + custom_x_grad, + custom_weight_grad, + custom_bias_grad, ) = linear_static( - custom_ops.phi_linear, + custom_ops.custom_linear, device, dtype, self.np_x, @@ -132,23 +132,23 @@ def test_static(self): self.np_weight, self.np_bias, ) - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") self.check_output( - phi_weight_grad, pd_weight_grad, "weight_grad" + custom_weight_grad, pd_weight_grad, "weight_grad" ) - self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") + self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") def test_dynamic(self): for device in self.devices: for dtype in self.dtypes: ( - phi_out, - phi_x_grad, - phi_weight_grad, - phi_bias_grad, + custom_out, + custom_x_grad, + custom_weight_grad, + custom_bias_grad, ) = linear_dynamic( - custom_ops.phi_linear, + custom_ops.custom_linear, device, dtype, self.np_x, @@ -168,12 +168,12 @@ def test_dynamic(self): self.np_weight, self.np_bias, ) - self.check_output(phi_out, pd_out, "phi_out") - self.check_output(phi_x_grad, pd_x_grad, "x_grad") + self.check_output(custom_out, pd_out, "custom_out") + self.check_output(custom_x_grad, pd_x_grad, "x_grad") self.check_output( - phi_weight_grad, pd_weight_grad, "weight_grad" + custom_weight_grad, pd_weight_grad, "weight_grad" ) - self.check_output(phi_bias_grad, pd_bias_grad, "bias_grad") + self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_multi_out_jit.py b/test/custom_op/test_multi_out_jit.py index 9b652a0efccae1..f3e3a6ec8abc13 100644 --- a/test/custom_op/test_multi_out_jit.py +++ b/test/custom_op/test_multi_out_jit.py @@ -40,13 +40,13 @@ ) -def discrete_out_dynamic(use_phi, device, dtype, np_w, np_x, np_y, np_z): +def discrete_out_dynamic(use_custom, device, dtype, np_w, np_x, np_y, np_z): paddle.set_device(device) w = paddle.to_tensor(np_w, dtype=dtype, stop_gradient=False) x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False) z = paddle.to_tensor(np_z, dtype=dtype, stop_gradient=False) - if use_phi: + if use_custom: out = multi_out_module.discrete_out(w, x, y, z) else: out = w * 1 + x * 2 + y * 3 + z * 4 @@ -55,7 +55,7 @@ def discrete_out_dynamic(use_phi, 
device, dtype, np_w, np_x, np_y, np_z): return out.numpy(), w.grad.numpy(), y.grad.numpy() -def discrete_out_static(use_phi, device, dtype, np_w, np_x, np_y, np_z): +def discrete_out_static(use_custom, device, dtype, np_w, np_x, np_y, np_z): paddle.enable_static() paddle.set_device(device) with static.scope_guard(static.Scope()): @@ -68,7 +68,7 @@ def discrete_out_static(use_phi, device, dtype, np_w, np_x, np_y, np_z): x.stop_gradient = False y.stop_gradient = False z.stop_gradient = False - if use_phi: + if use_custom: out = multi_out_module.discrete_out(w, x, y, z) else: out = w * 1 + x * 2 + y * 3 + z * 4 @@ -180,7 +180,11 @@ def test_discrete_out_static(self): self.np_y, self.np_z, ) - (phi_out, phi_w_grad, phi_y_grad,) = discrete_out_static( + ( + custom_out, + custom_w_grad, + custom_y_grad, + ) = discrete_out_static( True, device, dtype, @@ -189,10 +193,10 @@ def test_discrete_out_static(self): self.np_y, self.np_z, ) - self.check_output(phi_out, pd_out, "out") + self.check_output(custom_out, pd_out, "out") # NOTE: In static mode, the output gradient of custom operator has been optimized to shape=[1]. However, native paddle op's output shape = [4, 8], hence we need to fetch pd_w_grad[0][0] (By the way, something wrong with native paddle's gradient, the outputs with other indexes instead of pd_w_grad[0][0] is undefined in this unittest.) - self.check_output(phi_w_grad, pd_w_grad[0][0], "w_grad") - self.check_output(phi_y_grad, pd_y_grad[0][0], "y_grad") + self.check_output(custom_w_grad, pd_w_grad[0][0], "w_grad") + self.check_output(custom_y_grad, pd_y_grad[0][0], "y_grad") def test_discrete_out_dynamic(self): for device in self.devices: @@ -206,7 +210,11 @@ def test_discrete_out_dynamic(self): self.np_y, self.np_z, ) - (phi_out, phi_w_grad, phi_y_grad,) = discrete_out_dynamic( + ( + custom_out, + custom_w_grad, + custom_y_grad, + ) = discrete_out_dynamic( True, device, dtype, @@ -215,9 +223,9 @@ def test_discrete_out_dynamic(self): self.np_y, self.np_z, ) - self.check_output(phi_out, pd_out, "out") - self.check_output(phi_w_grad, pd_w_grad, "w_grad") - self.check_output(phi_y_grad, pd_y_grad, "y_grad") + self.check_output(custom_out, pd_out, "out") + self.check_output(custom_w_grad, pd_w_grad, "w_grad") + self.check_output(custom_y_grad, pd_y_grad, "y_grad") if __name__ == '__main__': From 0b89cb1d006435fa32f46d49df440f0e86e7c19e Mon Sep 17 00:00:00 2001 From: JYChen Date: Mon, 10 Apr 2023 20:01:05 +0800 Subject: [PATCH 031/156] remove legacy profiler (#52624) * remove legacy profiler * rm test_parallel_executor_profiler --- python/paddle/cost_model/cost_model.py | 3 +- python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/profiler.py | 425 ------------------ .../fluid/tests/unittests/CMakeLists.txt | 12 - .../fluid/tests/unittests/test_newprofiler.py | 11 - .../test_parallel_executor_profiler.py | 51 --- .../fluid/tests/unittests/test_profiler.py | 207 --------- python/paddle/utils/__init__.py | 3 - python/paddle/utils/profiler.py | 172 ------- tools/print_signatures.py | 1 - 10 files changed, 3 insertions(+), 885 deletions(-) delete mode 100644 python/paddle/fluid/profiler.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py delete mode 100644 python/paddle/utils/profiler.py diff --git a/python/paddle/cost_model/cost_model.py b/python/paddle/cost_model/cost_model.py index ad8be331cb2296..04a8760d66dda2 100644 --- a/python/paddle/cost_model/cost_model.py +++ b/python/paddle/cost_model/cost_model.py @@ -58,7 +58,8 @@ def profile_measure( 
exe = paddle.static.Executor(place) exe.run(startup_program) - paddle.fluid.profiler.start_profiler("All") + p = paddle.profiler.Profiler() + p.start() exe.run(main_program, feed={"X": x}, fetch_list=[]) cost_model = core.CostModel() diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 0a3feff50ec7ac..15ea4367e5e42c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -75,7 +75,7 @@ CustomPlace, ) from .lod_tensor import create_lod_tensor, create_random_int_lodtensor -from . import profiler + from . import unique_name from . import compiler from .compiler import * @@ -130,7 +130,6 @@ 'ParamAttr', 'WeightNormParamAttr', 'DataFeeder', - 'profiler', 'unique_name', 'Scope', '_cuda_synchronize', diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py deleted file mode 100644 index 750ea5d8e13f14..00000000000000 --- a/python/paddle/fluid/profiler.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import core -from .wrapped_decorator import signature_safe_contextmanager -import os -import sys - -from paddle.utils.deprecated import deprecated - -__all__ = [ - 'cuda_profiler', - 'reset_profiler', - 'profiler', - 'start_profiler', - 'stop_profiler', -] - -NVPROF_CONFIG = [ - "gpustarttimestamp", - "gpuendtimestamp", - "gridsize3d", - "threadblocksize", - "streamid", - "enableonstart 0", - "conckerneltrace", -] - - -@deprecated( - since="2.3.0", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -@signature_safe_contextmanager -def cuda_profiler(output_file, output_mode=None, config=None): - """ - API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. - The relevant reference documents are as follows: - - - - """ - raise RuntimeError( - "API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`.\nThe relevant reference documents are as follows:\n\n\n" - ) - - -@signature_safe_contextmanager -def npu_profiler(output_file, config=None): - """ - The NPU profiler. - - This fuctions is used to profile NPU program by NPU runtime application - programming interface. The profiling result will be written into - `output_file`. The users can set set the NPU profiling config by `config` argument. - - After getting the profiling result file, users can use - `tools provided by Ascend `_ - to load this output file to visualize results. - - Args: - output_file (str) : The output file name, the result will be - written into this file. It should be absolute path. - config (list, optional) : NPU profile config. For more details, please - refer to `User Guide `_ . - - Examples: - - .. 
code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.profiler as profiler - import numpy as np - import paddle - - epoc = 8 - dshape = [4, 3, 28, 28] - data = paddle.static.data(name='data', shape=[None, 3, 28, 28], dtype='float32') - conv = paddle.static.nn.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - - place = fluid.NPUPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - output_file = 'npu.txt' - with profiler.npu_profiler(output_file) as npu_prof: - for i in range(epoc): - input = np.random.random(dshape).astype('float32') - exe.run(fluid.default_main_program(), feed={'data': input}) - # then use NPU profiler tools to load this output file - # to visualize results. - """ - # TODO: support config in python. - if not config: - config = core.npu_prof_create_config() - - core.npu_prof_init(output_file) - # Enables profiler collection by the active NPU profiling tool. - core.npu_prof_start(config) - try: - yield - # Disables profiler collection. - finally: - core.npu_prof_stop(config) - core.npu_prof_finalize() - - -@deprecated( - since="2.3.0", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -def reset_profiler(): - """ - Clear the previous time record. It works for - `fluid.profiler.start_profiler`, `fluid.profiler.stop_profiler`, - and `fluid.profiler.profiler`. - - Examples: - - .. code-block:: python - - # required: gpu - import paddle.fluid as fluid - import paddle.fluid.profiler as profiler - with profiler.profiler('CPU', 'total', '/tmp/profile'): - for iter in range(10): - if iter == 2: - profiler.reset_profiler() - # ... - """ - core.reset_profiler() - - -@deprecated( - since="2.3.0", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -def start_profiler(state, tracer_option='Default'): - """ - Enable the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to profile, which is equal to the usage - of `fluid.profiler.profiler` interface. - - Args: - state (str) : The profiling state, which should be one of 'CPU', 'GPU' - or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling - both CPU and GPU; 'All' means profiling both CPU and GPU, and - generates timeline as well. - tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it - can control the profile level and print the different level profile result. `Default` option print - the different Op type profiling result and the `OpDetail` option print the detail profiling - result of different op types such as compute and data transform, `AllOpDetail` option - print the detail profiling result of different op name same as `OpDetail`. - - Raises: - ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option` - is not in ['Default', 'OpDetail', 'AllOpDetail']. - - Examples: - - .. 
code-block:: python - - # required: gpu - import paddle.fluid as fluid - import paddle.fluid.profiler as profiler - - profiler.start_profiler('GPU') - for iter in range(10): - if iter == 2: - profiler.reset_profiler() - # except each iteration - profiler.stop_profiler('total', '/tmp/profile') - - profiler.start_profiler('GPU', "OpDetail") - for iter in range(10): - if iter == 2: - profiler.reset_profiler() - # except each iteration - profiler.stop_profiler('total', '/tmp/profile') - """ - if core.is_profiler_enabled(): - return - if state not in ['CPU', 'GPU', "All"]: - raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") - if state == "GPU": - prof_state = core.ProfilerState.kCUDA - elif state == "CPU": - prof_state = core.ProfilerState.kCPU - else: - prof_state = core.ProfilerState.kAll - - if tracer_option not in ['Default', 'OpDetail', 'AllOpDetail']: - raise ValueError( - "tracer option must be 'Default', 'OpDetail', 'AllOpDetail'." - ) - if tracer_option == "Default": - prof_tracer_option = core.TracerOption.kDefault - elif tracer_option == "OpDetail": - prof_tracer_option = core.TracerOption.kOpDetail - else: - prof_tracer_option = core.TracerOption.kAllOpDetail - - core.set_tracer_option(prof_tracer_option) - core.enable_profiler(prof_state) - - -@deprecated( - since="2.3.0", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): - """ - Stop the profiler. Uers can use `fluid.profiler.start_profiler` and - `fluid.profiler.stop_profiler` to profile, which is equal to the usage - of `fluid.profiler.profiler` interface. - - Args: - sorted_key (str, optional) : The order of profiling results, which - should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. - Default is None, means the profiling results will be printed - in the order of first end time of events. - The `calls` means sorting by the number of calls. - The `total` means sorting by the total execution time. - The `max` means sorting by the maximum execution time. - The `min` means sorting by the minimum execution time. - The `ave` means sorting by the average execution time. - and write it into `profile_path`. The default profile_path is `/tmp/profile`. - profile_path (str, optional) : If state == 'All', it will generate timeline, - - Raises: - ValueError: If `sorted_key` is not in - ['calls', 'total', 'max', 'min', 'ave']. - - Examples: - - .. code-block:: python - - # required: gpu - import paddle.fluid as fluid - import paddle.fluid.profiler as profiler - - profiler.start_profiler('GPU') - for iter in range(10): - if iter == 2: - profiler.reset_profiler() - # except each iteration - profiler.stop_profiler('total', '/tmp/profile') - """ - if not core.is_profiler_enabled(): - return - sorted_key = 'default' if sorted_key is None else sorted_key - if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: - raise ValueError( - "The sorted_key must be None or in 'calls', 'total', " - "'max', 'min' and 'ave'" - ) - key_map = { - 'default': core.EventSortingKey.kDefault, - 'calls': core.EventSortingKey.kCalls, - 'total': core.EventSortingKey.kTotal, - 'max': core.EventSortingKey.kMax, - 'min': core.EventSortingKey.kMin, - 'ave': core.EventSortingKey.kAve, - } - # TODO(qingqing) : redirect C++ ostream to Python stream. 
- # with core.ostream_redirect(stdout=True, stderr=True): - core.disable_profiler(key_map[sorted_key], profile_path) - - -@deprecated( - since="2.3.0", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -@signature_safe_contextmanager -def profiler( - state, sorted_key=None, profile_path='/tmp/profile', tracer_option='Default' -): - """ - The profiler interface. This profiler can be used to profile both CPU and GPU program. - - Args: - state (str) : The profiling state, which should be one of 'CPU', 'GPU' - or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling - both CPU and GPU; 'All' means profiling both CPU and GPU, and - generates timeline as well. - sorted_key (str, optional) : The order of profiling results, which - should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. - Default is None, means the profiling results will be printed - in the order of first end time of events. - The `calls` means sorting by the number of calls. - The `total` means sorting by the total execution time. - The `max` means sorting by the maximum execution time. - The `min` means sorting by the minimum execution time. - The `ave` means sorting by the average execution time. - profile_path (str, optional) : If state == 'All', it will generate timeline, - and write it into `profile_path`. The default profile_path is `/tmp/profile`. - tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it - can control the profile level and print the different level profile result. `Default` option print - the different Op type profiling result and the `OpDetail` option print the detail profiling - result of different op types such as compute and data transform, `AllOpDetail` option - print the detail profiling result of different op name same as `OpDetail`. - - Raises: - ValueError: If `state` is not in ['CPU', 'GPU', 'All']. If `sorted_key` is - not in ['calls', 'total', 'max', 'min', 'ave']. - - Examples: - - .. code-block:: python - - # required: gpu - import paddle.fluid as fluid - import paddle.fluid.profiler as profiler - import numpy as np - import paddle - paddle.enable_static() - - epoc = 8 - dshape = [4, 3, 28, 28] - data = paddle.static.data(name='data', shape=[None, 3, 28, 28], dtype='float32') - conv = paddle.static.nn.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - with profiler.profiler('CPU', 'total', '/tmp/profile', 'Default') as prof: - for i in range(epoc): - input = np.random.random(dshape).astype('float32') - exe.run(fluid.default_main_program(), feed={'data': input}) - - Examples Results: - - .. code-block:: text - - #### Examples Results #### - #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### - # The only difference in 5 sorted_key results is the following sentence: - # "Sorted by number of xxx in descending order in the same thread." - # The reason is that in this example, above 5 columns are already sorted. 
- -------------------------> Profiling Report <------------------------- - - Place: CPU - Time unit: ms - Sorted by total time in descending order in the same thread - #Sorted by number of calls in descending order in the same thread - #Sorted by number of max in descending order in the same thread - #Sorted by number of min in descending order in the same thread - #Sorted by number of avg in descending order in the same thread - - Event Calls Total Min. Max. Ave. Ratio. - thread0::conv2d 8 129.406 0.304303 127.076 16.1758 0.983319 - thread0::elementwise_add 8 2.11865 0.193486 0.525592 0.264832 0.016099 - thread0::feed 8 0.076649 0.006834 0.024616 0.00958112 0.000582432 - - #### 2) sorted_key = None #### - # Since the profiling results are printed in the order of first end time of Ops, - # the printed order is feed->conv2d->elementwise_add - -------------------------> Profiling Report <------------------------- - - Place: CPU - Time unit: ms - Sorted by event first end time in descending order in the same thread - - Event Calls Total Min. Max. Ave. Ratio. - thread0::feed 8 0.077419 0.006608 0.023349 0.00967738 0.00775934 - thread0::conv2d 8 7.93456 0.291385 5.63342 0.99182 0.795243 - thread0::elementwise_add 8 1.96555 0.191884 0.518004 0.245693 0.196998 - """ - start_profiler(state, tracer_option) - try: - yield - finally: - stop_profiler(sorted_key, profile_path) - - -@signature_safe_contextmanager -def _nvprof_range(iter_id, start, end, exit_after_prof=True): - ''' - A range profiler interface (not public yet). - - Examples: - - .. code-block:: python - - model = Model() - for i in range(max_iter): - paddle.fluid.profiler._nvprof_range(i, 10, 20): - out = model(in) - ''' - try: - if iter_id == start: - core.nvprof_start() - core.nvprof_enable_record_event() - if iter_id >= start: - core.nvprof_nvtx_push(str(iter_id)) - yield - finally: - if iter_id < end: - core.nvprof_nvtx_pop() - if iter_id == end: - core.nvprof_stop() - if exit_after_prof: - sys.exit() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a75c04cd76bc15..999006a004431d 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -400,7 +400,6 @@ endfunction() list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type) list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array) list(REMOVE_ITEM TEST_OPS test_warpctc_op) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler) list(REMOVE_ITEM TEST_OPS test_data_norm_op) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer_auto_growth) @@ -694,17 +693,6 @@ if(WITH_DISTRIBUTE) endif() endif() -# profiler will random hang in linux cuda 10.1 or 10.2 -# see https://github.com/PaddlePaddle/Paddle/issues/29082 for details. -# We guess there are some bugs in linux cuda 10.1 or 10.2, -# since this unittest is stable in cuda 11.2 and 10.2 (windows-ci pipeline) now. 
-if(NOT (LINUX AND CUDA_VERSION LESS 11.0)) - py_test_modules(test_parallel_executor_profiler MODULES - test_parallel_executor_profiler) - set_tests_properties(test_parallel_executor_profiler - PROPERTIES LABELS "RUN_TYPE=DIST") - set_tests_properties(test_parallel_executor_profiler PROPERTIES TIMEOUT 120) -endif() py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer) if(WIN32) diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py index 19256e7cf6b5b9..7874012e6864eb 100755 --- a/python/paddle/fluid/tests/unittests/test_newprofiler.py +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -197,17 +197,6 @@ def my_scheduler1(num_step): prof.stop() -class TestNvprof(unittest.TestCase): - def test_nvprof(self): - for i in range(10): - paddle.fluid.profiler._nvprof_range(i, 10, 20) - x_value = np.random.randn(2, 3, 3) - x = paddle.to_tensor( - x_value, stop_gradient=False, place=paddle.CPUPlace() - ) - y = x / 2.0 - - class TestGetProfiler(unittest.TestCase): def test_getprofiler(self): config_content = ''' diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py deleted file mode 100644 index 684efbb4a0e13f..00000000000000 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest - -from paddle import fluid -from paddle.fluid import core -from paddle.fluid.tests.unittests.test_profiler import TestProfiler - -# NCCL 2.7 decides to use shared memory while NCCL 2.6 didn't, hence causing the error. -# include/shm.h:28 NCCL WARN Call to posix_fallocate failed: No space left on device -# -# Set environment variables NCCL_SHM_DISABLE=1 to disables the Shared Memory (SHM) transports -# and force to use P2P which is the default transports way of NCCL2.6. 
-os.environ['NCCL_SHM_DISABLE'] = str(1) - - -class TestPEProfiler(TestProfiler): - def test_cpu_profiler(self): - exe = fluid.Executor(fluid.CPUPlace()) - self.net_profiler(exe, 'CPU', "Default", use_parallel_executor=True) - - @unittest.skipIf( - not core.is_compiled_with_cuda(), "profiler is enabled only with GPU" - ) - def test_cuda_profiler(self): - exe = fluid.Executor(fluid.CUDAPlace(0)) - self.net_profiler(exe, 'GPU', "OpDetail", use_parallel_executor=True) - - @unittest.skipIf( - not core.is_compiled_with_cuda(), "profiler is enabled only with GPU" - ) - def test_all_profiler(self): - exe = fluid.Executor(fluid.CUDAPlace(0)) - self.net_profiler(exe, 'All', "AllOpDetail", use_parallel_executor=True) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index c5d654fe3b458c..c155f66266e232 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -12,219 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -import tempfile import unittest -import numpy as np - import paddle -from paddle import fluid, utils -from paddle.fluid import core, profiler -from paddle.fluid.proto.profiler import profiler_pb2 from paddle.utils.flops import flops -class TestProfiler(unittest.TestCase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - - def build_program(self, compile_program=True): - startup_program = fluid.Program() - main_program = fluid.Program() - with fluid.program_guard(main_program, startup_program): - image = paddle.static.data( - name='x', shape=[-1, 784], dtype='float32' - ) - hidden1 = paddle.static.nn.fc(x=image, size=64, activation='relu') - i = paddle.zeros(shape=[1], dtype='int64') - counter = paddle.tensor.fill_constant( - shape=[1], dtype='int64', value=0, force_cpu=True - ) - until = paddle.tensor.fill_constant([1], dtype='int64', value=10) - data_arr = paddle.tensor.array_write(hidden1, i) - cond = paddle.less_than(x=counter, y=until) - while_op = paddle.static.nn.control_flow.While(cond=cond) - with while_op.block(): - hidden_n = paddle.static.nn.fc( - x=hidden1, size=64, activation='relu' - ) - paddle.tensor.array_write(hidden_n, i, data_arr) - paddle.increment(x=counter, value=1) - paddle.assign(paddle.less_than(x=counter, y=until), cond) - - hidden_n = paddle.tensor.array_read(data_arr, i) - hidden2 = paddle.static.nn.fc( - x=hidden_n, size=64, activation='relu' - ) - predict = paddle.static.nn.fc( - x=hidden2, size=10, activation='softmax' - ) - label = paddle.static.data(name='y', shape=[-1, 1], dtype='int64') - cost = paddle.nn.functional.cross_entropy( - input=predict, label=label, reduction='none', use_softmax=False - ) - avg_cost = paddle.mean(cost) - batch_size = paddle.tensor.create_tensor(dtype='int64') - batch_acc = paddle.static.accuracy( - input=predict, label=label, total=batch_size - ) - - optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) - opts = optimizer.minimize(avg_cost, startup_program=startup_program) - - if compile_program: - # TODO(luotao): profiler tool may have bug with multi-thread parallel executor. 
- # https://github.com/PaddlePaddle/Paddle/pull/25200#issuecomment-650483092 - train_program = fluid.compiler.CompiledProgram(main_program) - else: - train_program = main_program - return train_program, startup_program, avg_cost, batch_size, batch_acc - - def get_profile_path(self): - profile_path = os.path.join(tempfile.gettempdir(), "profile") - open(profile_path, "w").write("") - return profile_path - - def check_profile_result(self, profile_path): - data = open(profile_path, 'rb').read() - if len(data) > 0: - profile_pb = profiler_pb2.Profile() - profile_pb.ParseFromString(data) - self.assertGreater(len(profile_pb.events), 0) - for event in profile_pb.events: - if event.type == profiler_pb2.Event.GPUKernel: - if not event.detail_info and not event.name.startswith( - "MEM" - ): - raise Exception( - "Kernel %s missing event. Has this kernel been recorded by RecordEvent?" - % event.name - ) - elif event.type == profiler_pb2.Event.CPU and ( - event.name.startswith("Driver API") - or event.name.startswith("Runtime API") - ): - print("Warning: unregister", event.name) - - def run_iter(self, exe, main_program, fetch_list): - x = np.random.random((32, 784)).astype("float32") - y = np.random.randint(0, 10, (32, 1)).astype("int64") - outs = exe.run( - main_program, feed={'x': x, 'y': y}, fetch_list=fetch_list - ) - - def net_profiler( - self, - exe, - state, - tracer_option, - batch_range=None, - use_parallel_executor=False, - use_new_api=False, - ): - ( - main_program, - startup_program, - avg_cost, - batch_size, - batch_acc, - ) = self.build_program(compile_program=use_parallel_executor) - exe.run(startup_program) - - profile_path = self.get_profile_path() - if not use_new_api: - with profiler.profiler(state, 'total', profile_path, tracer_option): - for iter in range(10): - if iter == 2: - profiler.reset_profiler() - self.run_iter( - exe, main_program, [avg_cost, batch_acc, batch_size] - ) - else: - options = utils.ProfilerOptions( - options={ - 'state': state, - 'sorted_key': 'total', - 'tracer_level': tracer_option, - 'batch_range': [0, 10] - if batch_range is None - else batch_range, - 'profile_path': profile_path, - } - ) - with utils.Profiler(enabled=True, options=options) as prof: - for iter in range(10): - self.run_iter( - exe, main_program, [avg_cost, batch_acc, batch_size] - ) - utils.get_profiler().record_step() - if batch_range is None and iter == 2: - utils.get_profiler().reset() - # TODO(luotao): check why nccl kernel in profile result. 
- # https://github.com/PaddlePaddle/Paddle/pull/25200#issuecomment-650483092 - # self.check_profile_result(profile_path) - - def test_cpu_profiler(self): - exe = fluid.Executor(fluid.CPUPlace()) - for use_new_api in [False, True]: - self.net_profiler( - exe, - 'CPU', - "Default", - batch_range=[5, 10], - use_new_api=use_new_api, - ) - - @unittest.skipIf( - not core.is_compiled_with_cuda(), "profiler is enabled only with GPU" - ) - def test_cuda_profiler(self): - exe = fluid.Executor(fluid.CUDAPlace(0)) - for use_new_api in [False, True]: - self.net_profiler( - exe, - 'GPU', - "OpDetail", - batch_range=[0, 10], - use_new_api=use_new_api, - ) - - @unittest.skipIf( - not core.is_compiled_with_cuda(), "profiler is enabled only with GPU" - ) - def test_all_profiler(self): - exe = fluid.Executor(fluid.CUDAPlace(0)) - for use_new_api in [False, True]: - self.net_profiler( - exe, - 'All', - "AllOpDetail", - batch_range=None, - use_new_api=use_new_api, - ) - - -class TestProfilerAPIError(unittest.TestCase): - def test_errors(self): - options = utils.ProfilerOptions() - self.assertIsNone(options['profile_path']) - self.assertIsNone(options['timeline_path']) - - options = options.with_state('All') - self.assertTrue(options['state'] == 'All') - try: - print(options['test']) - except ValueError: - pass - - global_profiler = utils.get_profiler() - with utils.Profiler(enabled=True) as prof: - self.assertTrue(utils.get_profiler() == prof) - self.assertTrue(global_profiler != prof) - - class TestFLOPSAPI(unittest.TestCase): def test_flops(self): self.assertTrue(flops('relu', {'X': [[12, 12]]}, {'output': 4}) == 144) diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index a6fd7bcaf749b0..df62d9982f6f4f 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -13,9 +13,6 @@ # limitations under the License. from . import gast -from .profiler import ProfilerOptions # noqa: F401 -from .profiler import Profiler # noqa: F401 -from .profiler import get_profiler # noqa: F401 from .deprecated import deprecated # noqa: F401 from .lazy_import import try_import # noqa: F401 from .op_version import OpLastCheckpointChecker # noqa: F401 diff --git a/python/paddle/utils/profiler.py b/python/paddle/utils/profiler.py deleted file mode 100644 index 6381ddc1456d13..00000000000000 --- a/python/paddle/utils/profiler.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
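The `paddle.utils.profiler` module deleted below only wrapped the legacy `fluid.profiler` entry points, and all of its deprecation notices point at `paddle.profiler.Profiler`. For readers migrating old scripts, a rough equivalent of the removed `batch_range`-style workflow under the new API looks like this (a minimal sketch; the scheduler window and sort key are illustrative, not taken from this patch):

    import paddle.profiler as profiler

    prof = profiler.Profiler(
        targets=[profiler.ProfilerTarget.CPU],  # add ProfilerTarget.GPU for the old 'GPU'/'All' states
        scheduler=(2, 10),  # profile steps [2, 10), similar to the old batch_range option
    )
    prof.start()
    for step in range(10):
        # run one training iteration here
        prof.step()
    prof.stop()
    prof.summary(sorted_by=profiler.SortedKeys.CPUTotal)  # roughly the old sorted_key='total'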
- -import sys -import warnings - -from ..fluid import core -from ..fluid.profiler import cuda_profiler # noqa: F401 -from ..fluid.profiler import profiler # noqa: F401 -from ..fluid.profiler import reset_profiler, start_profiler, stop_profiler -from .deprecated import deprecated - -__all__ = [ # noqa - 'Profiler', - 'get_profiler', - 'ProfilerOptions', - 'cuda_profiler', - 'start_profiler', - 'profiler', - 'stop_profiler', - 'reset_profiler', -] - - -@deprecated( - since="2.4.2", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -class ProfilerOptions: - def __init__(self, options=None): - self.options = { - 'state': 'All', - 'sorted_key': 'default', - 'tracer_level': 'Default', - 'batch_range': [0, sys.maxsize], - 'output_thread_detail': False, - 'profile_path': 'none', - 'timeline_path': 'none', - 'op_summary_path': 'none', - } - if options is not None: - for key in self.options.keys(): - if options.get(key, None) is not None: - self.options[key] = options[key] - - # function to set one specified option - def with_state(self, state): - self.options['state'] = state - return self - - def __getitem__(self, name): - if self.options.get(name, None) is None: - raise ValueError( - "ProfilerOptions does not have an option named %s." % name - ) - else: - if ( - isinstance(self.options[name], str) - and self.options[name] == 'none' - ): - return None - else: - return self.options[name] - - -_current_profiler = None - - -@deprecated( - since="2.4.2", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer maintained.", -) -class Profiler: - def __init__(self, enabled=True, options=None): - if options is not None: - self.profiler_options = options - else: - self.profiler_options = ProfilerOptions() - self.batch_id = 0 - self.enabled = enabled - - def __enter__(self): - # record current profiler - global _current_profiler - self.previous_profiler = _current_profiler - _current_profiler = self - - if self.enabled: - if self.profiler_options['batch_range'][0] == 0: - self.start() - return self - - def __exit__(self, exception_type, exception_value, traceback): - global _current_profiler - _current_profiler = self.previous_profiler - - if self.enabled: - self.stop() - - def start(self): - if self.enabled: - try: - start_profiler( - state=self.profiler_options['state'], - tracer_option=self.profiler_options['tracer_level'], - ) - except Exception as e: - warnings.warn( - "Profiler is not enabled becuase following exception:\n{}".format( - e - ) - ) - - def stop(self): - if self.enabled: - try: - stop_profiler( - sorted_key=self.profiler_options['sorted_key'], - profile_path=self.profiler_options['profile_path'], - ) - except Exception as e: - warnings.warn( - "Profiler is not disabled becuase following exception:\n{}".format( - e - ) - ) - - def reset(self): - if self.enabled and core.is_profiler_enabled(): - reset_profiler() - - def record_step(self, change_profiler_status=True): - if not self.enabled: - return - self.batch_id = self.batch_id + 1 - if change_profiler_status: - if self.batch_id == self.profiler_options['batch_range'][0]: - if core.is_profiler_enabled(): - self.reset() - else: - self.start() - - if self.batch_id == self.profiler_options['batch_range'][1]: - self.stop() - - -@deprecated( - since="2.4.2", - update_to="paddle.profiler.Profiler", - level=1, - reason="Please use new profiler tool, this profiler tool is no longer 
maintained.", -) -def get_profiler(): - global _current_profiler - if _current_profiler is None: - _current_profiler = Profiler() - return _current_profiler diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 6f4673c2a6ed90..3af095fff2e103 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -263,7 +263,6 @@ def check_public_api(): paddle.text, paddle.utils, paddle.utils.download, - paddle.utils.profiler, paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision, From 1bc0095519912ce1382f88c68bd67b9b701770fa Mon Sep 17 00:00:00 2001 From: lzydev Date: Mon, 10 Apr 2023 20:06:54 +0800 Subject: [PATCH 032/156] Autogen segment_pool (#52538) * autogen segment_pool * delete legacy_dygraph about segment_pool --- paddle/fluid/operators/segment_pool_op.cc | 158 ---------------------- paddle/phi/api/yaml/backward.yaml | 12 ++ paddle/phi/api/yaml/legacy_backward.yaml | 12 -- paddle/phi/api/yaml/legacy_ops.yaml | 10 -- paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/ops.yaml | 11 ++ paddle/phi/ops/compat/segment_pool_sig.cc | 36 ----- python/paddle/geometric/math.py | 8 +- python/paddle/incubate/tensor/math.py | 29 +--- 9 files changed, 40 insertions(+), 243 deletions(-) delete mode 100644 paddle/fluid/operators/segment_pool_op.cc delete mode 100644 paddle/phi/ops/compat/segment_pool_sig.cc diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc deleted file mode 100644 index c2199b70365b1e..00000000000000 --- a/paddle/fluid/operators/segment_pool_op.cc +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class SegmentPoolOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class SegmentPoolOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The input data of SegmentPoolOp"); - AddInput("SegmentIds", - "(Tensor) 1-D tensor which have the same size with the fist " - "dimension of input X."); - AddOutput("Out", "(Tensor) The output of SegmentPoolOp."); - AddOutput("SummedIds", - "(Tensor) This tensor is used to counts of segment ids for the " - "backward of the mean pool.") - .AsIntermediate(); - AddAttr( - "pooltype", - "(string, default 'SUM') the pooling type of SegmentPoolOp.") - .SetDefault("SUM") - .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - AddComment(R"DOC( -Segment Pool Operator. 
-
-This operator will pool the elements of input `X` which with the same index
-in `SegmentIds`.
-
-For SUM operation, it computes a tensor such that $Out_i = \sum_{j} X_{j}$
-where sum is over j such that `SegmentIds[j] == i`.
-
-For MEAN operation, it computes a tensor such that
-$Out_i = \frac{1}{n_i} \sum_{j} X_{j}$ where sum is over j such that
-`SegmentIds[j] == i` and $n_i$ is the number of all index `SegmentIds[j] == i`.
-
-For MIN operation, it computes a tensor such that $Out_i = \min_{j} X_{j}$
-where min is over j such that `SegmentIds[j] == i`.
-
-For MAX operation, it computes a tensor such that $Out_i = \max_{j} X_{j}$
-where max is over j such that `SegmentIds[j] == i`.
-    )DOC");
-  }
-};
-
-class SegmentPoolGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input",
-                   framework::GradVarName("Out"),
-                   "SegmentPoolGrad");
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPoolGrad");
-    auto og_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_EQ(og_dims.size(),
-                      x_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "The rank of output grad must equal to Input(X). But "
-                          "received: input rank %u, input shape [%s].",
-                          og_dims.size(),
-                          og_dims));
-    for (int64_t i = 1; i < og_dims.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          og_dims[i],
-          x_dims[i],
-          platform::errors::InvalidArgument(
-              "The dimension mismatch between Input(OUT@GRAD) and "
-              "Input(X). Received Input(OUT@GRAD): input rank %u, "
-              "input shape [%s]; received Input(X): input rank %u, "
-              "input shape [%s].",
-              og_dims.size(),
-              og_dims,
-              x_dims.size(),
-              x_dims));
-    }
-
-    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
-  }
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op_desc_ptr) const override {
-    op_desc_ptr->SetType("segment_pool_grad");
-    op_desc_ptr->SetInput("X", this->Input("X"));
-    op_desc_ptr->SetInput("SegmentIds", this->Input("SegmentIds"));
-    op_desc_ptr->SetInput("Out", this->Output("Out"));
-    if (PADDLE_GET_CONST(std::string, this->GetAttr("pooltype")) == "MEAN") {
-      op_desc_ptr->SetInput("SummedIds", this->Output("SummedIds"));
-    }
-    op_desc_ptr->SetInput(framework::GradVarName("Out"),
-                          this->OutputGrad("Out"));
-    op_desc_ptr->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op_desc_ptr->SetAttrMap(this->Attrs());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(segment_pool,
-                            SegmentPoolInferShapeFunctor,
-                            PD_INFER_META(phi::SegmentPoolInferMeta));
-
-REGISTER_OPERATOR(segment_pool,
-                  ops::SegmentPoolOp,
-                  ops::SegmentPoolOpMaker,
-                  ops::SegmentPoolGradOpMaker<paddle::framework::OpDesc>,
-                  ops::SegmentPoolGradOpMaker<paddle::imperative::OpBase>,
-                  SegmentPoolInferShapeFunctor);
-REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp);
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index 7116b2be70dbf0..1c3599d07a595d 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml @@ -1404,6 +1404,18 @@ func : scatter_nd_add_grad no_need_buffer : updates +- backward_op : segment_pool_grad + forward : segment_pool (Tensor x, Tensor segment_ids, str pooltype="SUM") -> Tensor(out), Tensor(summed_ids) + args : (Tensor x, Tensor segment_ids, Tensor out, Tensor summed_ids, Tensor out_grad, str pooltype) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : segment_pool_grad + data_type : out_grad + optional : summed_ids + - backward_op : selu_grad forward : selu (Tensor x, float scale=1.0507009873554804934193349852946, float alpha=1.6732632423543772848170429916717) -> Tensor(out) args : (Tensor out, Tensor out_grad, float scale, float alpha) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 6ba507312b303e..3cf1904b1ed724 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -938,18 +938,6 @@ func : rrelu_grad data_type : x -- backward_op : segment_pool_grad - forward : segment_pool (Tensor x, Tensor segment_ids, str pooltype) -> Tensor(out), Tensor(summed_ids) - args : (Tensor x, Tensor segment_ids, Tensor out, Tensor summed_ids, Tensor out_grad, str pooltype) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : segment_pool_grad - data_type : x - optional : summed_ids - - backward_op : slice_double_grad forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 32966d54e09594..60ab3606853061 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1226,16 +1226,6 @@ intermediate : noise backward : rrelu_grad -- op : segment_pool - args : (Tensor x, Tensor segment_ids, str pooltype) - output : Tensor(out), Tensor(summed_ids) - infer_meta : - func : SegmentPoolInferMeta - kernel : - func : segment_pool - data_type : x - backward : segment_pool_grad - - op : shape args : (Tensor input) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index f905b04c92df7b..aff1486f2ae696 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1805,6 +1805,13 @@ extra : attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] +- op : segment_pool + backward : segment_pool_grad + inputs : + {x : X, segment_ids : SegmentIds} + outputs : + {out : Out, summed_ids : SummedIds} + - op : selu backward : selu_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e0598f15b58105..91f7ba04d061a1 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1485,6 +1485,17 @@ func : searchsorted data_type : sorted_sequence +- op : segment_pool + args : (Tensor x, Tensor segment_ids, str pooltype="SUM") + output : Tensor(out), Tensor(summed_ids) + infer_meta : + func : SegmentPoolInferMeta + kernel : + func : segment_pool + data_type : x + intermediate : summed_ids + backward : segment_pool_grad + - op : selu args : (Tensor x, float scale=1.0507009873554804934193349852946, float alpha=1.6732632423543772848170429916717) output : 
Tensor diff --git a/paddle/phi/ops/compat/segment_pool_sig.cc b/paddle/phi/ops/compat/segment_pool_sig.cc deleted file mode 100644 index 62b2b08f4c1864..00000000000000 --- a/paddle/phi/ops/compat/segment_pool_sig.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SegmentPoolGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("segment_pool_grad", - { - "X", - "SegmentIds", - "Out", - "SummedIds", - "Out@GRAD", - }, - {"pooltype"}, - {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(segment_pool_grad, - phi::SegmentPoolGradOpArgumentMapping); diff --git a/python/paddle/geometric/math.py b/python/paddle/geometric/math.py index fabaab5efc5176..fdec045ec1317b 100644 --- a/python/paddle/geometric/math.py +++ b/python/paddle/geometric/math.py @@ -51,7 +51,7 @@ def segment_sum(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "SUM")[0] + return _C_ops.segment_pool(data, segment_ids, "SUM") else: check_variable_and_dtype( data, @@ -108,7 +108,7 @@ def segment_mean(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "MEAN")[0] + return _C_ops.segment_pool(data, segment_ids, "MEAN") else: check_variable_and_dtype( @@ -165,7 +165,7 @@ def segment_min(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "MIN")[0] + return _C_ops.segment_pool(data, segment_ids, "MIN") else: check_variable_and_dtype( data, @@ -221,7 +221,7 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "MAX")[0] + return _C_ops.segment_pool(data, segment_ids, "MAX") else: check_variable_and_dtype( data, diff --git a/python/paddle/incubate/tensor/math.py b/python/paddle/incubate/tensor/math.py index d24dc46fef363a..e7f8246ba90ea5 100644 --- a/python/paddle/incubate/tensor/math.py +++ b/python/paddle/incubate/tensor/math.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
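The Python wrapper changes in this patch all drop the trailing `[0]` because, with `summed_ids` marked `intermediate` in ops.yaml, the generated `_C_ops.segment_pool` now returns the pooled tensor directly; user-visible behaviour is unchanged. Following the formulas in the deleted operator comment, a quick check of the SUM variant (a small illustrative example, not part of the patch):

    import paddle

    data = paddle.to_tensor([[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [4.0, 5.0, 6.0]])
    segment_ids = paddle.to_tensor([0, 0, 1], dtype='int32')
    # Rows 0 and 1 share segment 0, so Out_0 = [4., 4., 4.]; row 2 alone gives Out_1 = [4., 5., 6.]
    print(paddle.geometric.segment_sum(data, segment_ids))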
-from paddle import _C_ops, _legacy_C_ops +from paddle import _C_ops from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.layer_helper import LayerHelper, _non_static_mode +from paddle.fluid.layer_helper import LayerHelper from paddle.utils import deprecated __all__ = [] @@ -64,7 +64,7 @@ def segment_sum(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "SUM")[0] + return _C_ops.segment_pool(data, segment_ids, "SUM") else: check_variable_and_dtype( data, "X", ("float32", "float64", "int32", "int64"), "segment_pool" @@ -130,12 +130,7 @@ def segment_mean(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "MEAN")[0] - if _non_static_mode(): - out, tmp = _legacy_C_ops.segment_pool( - data, segment_ids, 'pooltype', "MEAN" - ) - return out + return _C_ops.segment_pool(data, segment_ids, "MEAN") check_variable_and_dtype( data, "X", ("float32", "float64", "int32", "int64"), "segment_pool" @@ -200,13 +195,7 @@ def segment_min(data, segment_ids, name=None): """ if in_dygraph_mode(): - return _C_ops.segment_pool(data, segment_ids, "MIN")[0] - - if _non_static_mode(): - out, tmp = _legacy_C_ops.segment_pool( - data, segment_ids, 'pooltype', "MIN" - ) - return out + return _C_ops.segment_pool(data, segment_ids, "MIN") check_variable_and_dtype( data, "X", ("float32", "float64", "int32", "int64"), "segment_pool" @@ -271,13 +260,7 @@ def segment_max(data, segment_ids, name=None): """ if in_dygraph_mode(): - out, tmp = _C_ops.segment_pool(data, segment_ids, "MAX") - return out - - if _non_static_mode(): - out, tmp = _legacy_C_ops.segment_pool( - data, segment_ids, 'pooltype', "MAX" - ) + out = _C_ops.segment_pool(data, segment_ids, "MAX") return out check_variable_and_dtype( From 94a8177f04b2f545139a01188185ec7a46325bad Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 11 Apr 2023 09:36:53 +0800 Subject: [PATCH 033/156] [Dy2St]Ignore os as not to_static module (#52715) --- python/paddle/jit/dy2static/convert_call_func.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/jit/dy2static/convert_call_func.py b/python/paddle/jit/dy2static/convert_call_func.py index 2f0494551c88a0..d964bd633c1165 100644 --- a/python/paddle/jit/dy2static/convert_call_func.py +++ b/python/paddle/jit/dy2static/convert_call_func.py @@ -17,6 +17,7 @@ import functools import inspect import logging +import os import pdb import re from typing import Any, List @@ -79,13 +80,14 @@ def builtin_modules(): Return builtin modules. """ modules = [ - collections, - pdb, copy, + collections, inspect, - re, - numpy, logging, + numpy, + os, + pdb, + re, ] try: import six From 4a790cba53c037e6ffa6617fe86f68a79325f164 Mon Sep 17 00:00:00 2001 From: Chitsing KUI Date: Tue, 11 Apr 2023 10:01:52 +0800 Subject: [PATCH 034/156] fix c_embedding bug (#52742) --- paddle/fluid/operators/collective/c_embedding_op.cu | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index b44aaf74e49e8e..8b521580c5cd51 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/float16.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +DECLARE_bool(cudnn_deterministic); + namespace paddle { namespace operators { @@ -164,6 +166,10 @@ class CEmbeddingGradCUDAKernel : public framework::OpKernel { t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); const auto &index_type = framework::TransToProtoVarType(ids_t->dtype()); + if (FLAGS_cudnn_deterministic) { + VLOG(2) << "Run grad kernel of embedding with single thread."; + blocks = 1; + } if (index_type == framework::proto::VarType::INT32) { CEmbeddingGrad <<>>(d_table, From 0cb0f70afd721321791752766b1d2e074b82edff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Tue, 11 Apr 2023 10:04:39 +0800 Subject: [PATCH 035/156] check the precision of cast operator test (#52317) --- python/paddle/fluid/tests/unittests/test_cast_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 816c507cea71a9..baa5bc3d90dd34 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -68,7 +68,7 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output(atol=1e-3) + self.check_output() def test_grad(self): self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) @@ -89,7 +89,7 @@ def setUp(self): self.public_python_api = cast_wrapper def test_check_output(self): - self.check_output(atol=1e-3) + self.check_output() def test_grad(self): self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True) From e041ffca8df68afe2cb914416989079cfaf7faf9 Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Tue, 11 Apr 2023 10:24:30 +0800 Subject: [PATCH 036/156] remove paddle/infrt/ (#52719) * remove paddle/infrt/ * delete .lit_test_times.txt --- .gitignore | 1 - .../tests/models/efficientnet-b4/net/utils.py | 424 ------------------ 2 files changed, 425 deletions(-) delete mode 100644 paddle/infrt/tests/models/efficientnet-b4/net/utils.py diff --git a/.gitignore b/.gitignore index 047d9684b4cd0f..a4690ba4425206 100644 --- a/.gitignore +++ b/.gitignore @@ -73,7 +73,6 @@ tools/nvcc_lazy # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. -.lit_test_times.txt paddle/fluid/pybind/eager_op_function.cc tools/nvcc_lazy diff --git a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py b/paddle/infrt/tests/models/efficientnet-b4/net/utils.py deleted file mode 100644 index 29c02c05842190..00000000000000 --- a/paddle/infrt/tests/models/efficientnet-b4/net/utils.py +++ /dev/null @@ -1,424 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import collections -import math -import re -from functools import partial - -import paddle -import paddle.nn.functional as F -from paddle import nn - -# Parameters for the entire model (stem, all blocks, and head) -GlobalParams = collections.namedtuple( - 'GlobalParams', - [ - 'batch_norm_momentum', - 'batch_norm_epsilon', - 'dropout_rate', - 'num_classes', - 'width_coefficient', - 'depth_coefficient', - 'depth_divisor', - 'min_depth', - 'drop_connect_rate', - 'image_size', - ], -) - -# Parameters for an individual model block -BlockArgs = collections.namedtuple( - 'BlockArgs', - [ - 'kernel_size', - 'num_repeat', - 'input_filters', - 'output_filters', - 'expand_ratio', - 'id_skip', - 'stride', - 'se_ratio', - ], -) - -# Change namedtuple defaults -GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) -BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) - - -def round_filters(filters, global_params): - """Calculate and round number of filters based on depth multiplier.""" - multiplier = global_params.width_coefficient - if not multiplier: - return filters - divisor = global_params.depth_divisor - min_depth = global_params.min_depth - filters *= multiplier - min_depth = min_depth or divisor - new_filters = max( - min_depth, int(filters + divisor / 2) // divisor * divisor - ) - if new_filters < 0.9 * filters: # prevent rounding by more than 10% - new_filters += divisor - return int(new_filters) - - -def round_repeats(repeats, global_params): - """Round number of filters based on depth multiplier.""" - multiplier = global_params.depth_coefficient - if not multiplier: - return repeats - return int(math.ceil(multiplier * repeats)) - - -def drop_connect(inputs, prob, training): - """Drop input connection""" - if not training: - return inputs - keep_prob = 1.0 - prob - inputs_shape = paddle.shape(inputs) - random_tensor = keep_prob + paddle.rand(shape=[inputs_shape[0], 1, 1, 1]) - binary_tensor = paddle.floor(random_tensor) - output = inputs / keep_prob * binary_tensor - return output - - -def get_same_padding_conv2d(image_size=None): - """Chooses static padding if you have specified an image size, and dynamic padding otherwise. 
- Static padding is necessary for ONNX exporting of models.""" - if image_size is None: - return Conv2dDynamicSamePadding - else: - return partial(Conv2dStaticSamePadding, image_size=image_size) - - -class Conv2dDynamicSamePadding(nn.Conv2D): - """2D Convolutions like TensorFlow, for a dynamic image size""" - - def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride=1, - dilation=1, - groups=1, - bias_attr=None, - ): - super().__init__( - in_channels, - out_channels, - kernel_size, - stride, - 0, - dilation, - groups, - bias_attr=bias_attr, - ) - self.stride = ( - self._stride if len(self._stride) == 2 else [self._stride[0]] * 2 - ) - - def forward(self, x): - ih, iw = x.shape[-2:] - kh, kw = self.weight.shape[-2:] - sh, sw = self.stride - oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) - pad_h = max( - (oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih, 0 - ) - pad_w = max( - (ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw, 0 - ) - if pad_h > 0 or pad_w > 0: - x = F.pad( - x, - [ - pad_w // 2, - pad_w - pad_w // 2, - pad_h // 2, - pad_h - pad_h // 2, - ], - ) - return F.conv2d( - x, - self.weight, - self.bias, - self.stride, - self._padding, - self._dilation, - self._groups, - ) - - -class Conv2dStaticSamePadding(nn.Conv2D): - """2D Convolutions like TensorFlow, for a fixed image size""" - - def __init__( - self, in_channels, out_channels, kernel_size, image_size=None, **kwargs - ): - if 'stride' in kwargs and isinstance(kwargs['stride'], list): - kwargs['stride'] = kwargs['stride'][0] - super().__init__(in_channels, out_channels, kernel_size, **kwargs) - self.stride = ( - self._stride if len(self._stride) == 2 else [self._stride[0]] * 2 - ) - - # Calculate padding based on image size and save it - assert image_size is not None - ih, iw = ( - image_size if type(image_size) == list else [image_size, image_size] - ) - kh, kw = self.weight.shape[-2:] - sh, sw = self.stride - oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) - pad_h = max( - (oh - 1) * self.stride[0] + (kh - 1) * self._dilation[0] + 1 - ih, 0 - ) - pad_w = max( - (ow - 1) * self.stride[1] + (kw - 1) * self._dilation[1] + 1 - iw, 0 - ) - if pad_h > 0 or pad_w > 0: - self.static_padding = nn.Pad2D( - [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2] - ) - else: - self.static_padding = Identity() - - def forward(self, x): - x = self.static_padding(x) - x = F.conv2d( - x, - self.weight, - self.bias, - self.stride, - self._padding, - self._dilation, - self._groups, - ) - return x - - -class Identity(nn.Layer): - def __init__( - self, - ): - super().__init__() - - def forward(self, x): - return x - - -def efficientnet_params(model_name): - """Map EfficientNet model name to parameter coefficients.""" - params_dict = { - # Coefficients: width,depth,resolution,dropout - 'efficientnet-b0': (1.0, 1.0, 224, 0.2), - 'efficientnet-b1': (1.0, 1.1, 240, 0.2), - 'efficientnet-b2': (1.1, 1.2, 260, 0.3), - 'efficientnet-b3': (1.2, 1.4, 300, 0.3), - 'efficientnet-b4': (1.4, 1.8, 380, 0.4), - 'efficientnet-b5': (1.6, 2.2, 456, 0.4), - 'efficientnet-b6': (1.8, 2.6, 528, 0.5), - 'efficientnet-b7': (2.0, 3.1, 600, 0.5), - 'efficientnet-b8': (2.2, 3.6, 672, 0.5), - 'efficientnet-l2': (4.3, 5.3, 800, 0.5), - } - return params_dict[model_name] - - -class BlockDecoder: - """Block Decoder for readability, straight from the official TensorFlow repository""" - - @staticmethod - def _decode_block_string(block_string): - """Gets a block through a string notation of arguments.""" - assert 
isinstance(block_string, str) - - ops = block_string.split('_') - options = {} - for op in ops: - splits = re.split(r'(\d.*)', op) - if len(splits) >= 2: - key, value = splits[:2] - options[key] = value - - # Check stride - assert ('s' in options and len(options['s']) == 1) or ( - len(options['s']) == 2 and options['s'][0] == options['s'][1] - ) - - return BlockArgs( - kernel_size=int(options['k']), - num_repeat=int(options['r']), - input_filters=int(options['i']), - output_filters=int(options['o']), - expand_ratio=int(options['e']), - id_skip=('noskip' not in block_string), - se_ratio=float(options['se']) if 'se' in options else None, - stride=[int(options['s'][0])], - ) - - @staticmethod - def _encode_block_string(block): - """Encodes a block to a string.""" - args = [ - 'r%d' % block.num_repeat, - 'k%d' % block.kernel_size, - 's%d%d' % (block.strides[0], block.strides[1]), - 'e%s' % block.expand_ratio, - 'i%d' % block.input_filters, - 'o%d' % block.output_filters, - ] - if 0 < block.se_ratio <= 1: - args.append('se%s' % block.se_ratio) - if block.id_skip is False: - args.append('noskip') - return '_'.join(args) - - @staticmethod - def decode(string_list): - """ - Decodes a list of string notations to specify blocks inside the network. - - :param string_list: a list of strings, each string is a notation of block - :return: a list of BlockArgs namedtuples of block args - """ - assert isinstance(string_list, list) - blocks_args = [] - for block_string in string_list: - blocks_args.append(BlockDecoder._decode_block_string(block_string)) - return blocks_args - - @staticmethod - def encode(blocks_args): - """ - Encodes a list of BlockArgs to a list of strings. - - :param blocks_args: a list of BlockArgs namedtuples of block args - :return: a list of strings, each string is a notation of block - """ - block_strings = [] - for block in blocks_args: - block_strings.append(BlockDecoder._encode_block_string(block)) - return block_strings - - -def efficientnet( - width_coefficient=None, - depth_coefficient=None, - dropout_rate=0.2, - drop_connect_rate=0.2, - image_size=None, - num_classes=1000, -): - """Get block arguments according to parameter and coefficients.""" - blocks_args = [ - 'r1_k3_s11_e1_i32_o16_se0.25', - 'r2_k3_s22_e6_i16_o24_se0.25', - 'r2_k5_s22_e6_i24_o40_se0.25', - 'r3_k3_s22_e6_i40_o80_se0.25', - 'r3_k5_s11_e6_i80_o112_se0.25', - 'r4_k5_s22_e6_i112_o192_se0.25', - 'r1_k3_s11_e6_i192_o320_se0.25', - ] - blocks_args = BlockDecoder.decode(blocks_args) - - global_params = GlobalParams( - batch_norm_momentum=0.99, - batch_norm_epsilon=1e-3, - dropout_rate=dropout_rate, - drop_connect_rate=drop_connect_rate, - num_classes=num_classes, - width_coefficient=width_coefficient, - depth_coefficient=depth_coefficient, - depth_divisor=8, - min_depth=None, - image_size=image_size, - ) - - return blocks_args, global_params - - -def get_model_params(model_name, override_params): - """Get the block args and global params for a given model""" - if model_name.startswith('efficientnet'): - w, d, s, p = efficientnet_params(model_name) - blocks_args, global_params = efficientnet( - width_coefficient=w, - depth_coefficient=d, - dropout_rate=p, - image_size=s, - ) - else: - raise NotImplementedError( - 'model name is not pre-defined: %s' % model_name - ) - if override_params: - global_params = global_params._replace(**override_params) - return blocks_args, global_params - - -url_map = { - 'efficientnet-b0': '/home/aistudio/data/weights/efficientnet-b0-355c32eb.pdparams', - 'efficientnet-b1': 
'/home/aistudio/data/weights/efficientnet-b1-f1951068.pdparams', - 'efficientnet-b2': '/home/aistudio/data/weights/efficientnet-b2-8bb594d6.pdparams', - 'efficientnet-b3': '/home/aistudio/data/weights/efficientnet-b3-5fb5a3c3.pdparams', - 'efficientnet-b4': '/home/aistudio/data/weights/efficientnet-b4-6ed6700e.pdparams', - 'efficientnet-b5': '/home/aistudio/data/weights/efficientnet-b5-b6417697.pdparams', - 'efficientnet-b6': '/home/aistudio/data/weights/efficientnet-b6-c76e70fd.pdparams', - 'efficientnet-b7': '/home/aistudio/data/weights/efficientnet-b7-dcc49843.pdparams', -} - -url_map_advprop = { - 'efficientnet-b0': '/home/aistudio/data/weights/adv-efficientnet-b0-b64d5a18.pdparams', - 'efficientnet-b1': '/home/aistudio/data/weights/adv-efficientnet-b1-0f3ce85a.pdparams', - 'efficientnet-b2': '/home/aistudio/data/weights/adv-efficientnet-b2-6e9d97e5.pdparams', - 'efficientnet-b3': '/home/aistudio/data/weights/adv-efficientnet-b3-cdd7c0f4.pdparams', - 'efficientnet-b4': '/home/aistudio/data/weights/adv-efficientnet-b4-44fb3a87.pdparams', - 'efficientnet-b5': '/home/aistudio/data/weights/adv-efficientnet-b5-86493f6b.pdparams', - 'efficientnet-b6': '/home/aistudio/data/weights/adv-efficientnet-b6-ac80338e.pdparams', - 'efficientnet-b7': '/home/aistudio/data/weights/adv-efficientnet-b7-4652b6dd.pdparams', - 'efficientnet-b8': '/home/aistudio/data/weights/adv-efficientnet-b8-22a8fe65.pdparams', -} - - -def load_pretrained_weights( - model, model_name, weights_path=None, load_fc=True, advprop=False -): - """Loads pretrained weights from weights path or download using url. - Args: - model (Module): The whole model of efficientnet. - model_name (str): Model name of efficientnet. - weights_path (None or str): - str: path to pretrained weights file on the local disk. - None: use pretrained weights downloaded from the Internet. - load_fc (bool): Whether to load pretrained weights for fc layer at the end of the model. - advprop (bool): Whether to load pretrained weights - trained with advprop (valid when weights_path is None). 
- """ - - # AutoAugment or Advprop (different preprocessing) - url_map_ = url_map_advprop if advprop else url_map - state_dict = paddle.load(url_map_[model_name]) - - if load_fc: - model.set_state_dict(state_dict) - else: - state_dict.pop('_fc.weight') - state_dict.pop('_fc.bias') - model.set_state_dict(state_dict) - - print(f'Loaded pretrained weights for {model_name}') From 757aa4702248a7d24f5daaf8f24ba3dd98e747f4 Mon Sep 17 00:00:00 2001 From: ykkk2333 <77383312+ykkk2333@users.noreply.github.com> Date: Tue, 11 Apr 2023 10:40:18 +0800 Subject: [PATCH 037/156] update xpu.cmake to 20230408 (#52409) --- cmake/external/xpu.cmake | 2 +- paddle/phi/kernels/xpu/pool_grad_kernel.cc | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 138f06c4ae8fc5..65c02e1e520a8e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -8,7 +8,7 @@ set(XPU_API_LIB_NAME "libxpuapi.so") set(XPU_RT_LIB_NAME "libxpurt.so") set(XPU_XFT_LIB_NAME "libxft.so") -set(XPU_BASE_DATE "20230323") +set(XPU_BASE_DATE "20230408") set(XPU_XCCL_BASE_VERSION "1.0.13") set(XPU_XFT_BASE_VERSION "latest") diff --git a/paddle/phi/kernels/xpu/pool_grad_kernel.cc b/paddle/phi/kernels/xpu/pool_grad_kernel.cc index afc0bb8fbe12b3..1e412e66e68f96 100644 --- a/paddle/phi/kernels/xpu/pool_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/pool_grad_kernel.cc @@ -114,8 +114,6 @@ void Pool2dGradKernel(const Context& ctx, } else if (pooling_type == "avg") { // When output dim is 1 * 1 (1 * 1 * 1 in pool_3d), use scale // and broadcast kernels to get same output, but better performance. - // Since the dim is special in particular models, - // use 'export XPU_POOLING_GRAD_SPECIAL=1' to open this path if (out_h == 1 && out_w == 1 && std::is_same::value) { xpu::ctx_guard RAII_GUARD(ctx.x_context()); float scale = 1.0 / (in_h * in_w); From a6ae1e356b0f8962d39d08e13af3bb9220ec9edd Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 10:41:18 +0800 Subject: [PATCH 038/156] support auto generate for op momentum optimizer (#52611) * support auto generate for op momentum optimizer * remove momentum_op.* and update signature * fix dgc momentum op maker error --- .../operators/optimizers/dgc_momentum_op.cc | 138 +++++++++++++++++- .../operators/optimizers/dgc_momentum_op.h | 3 +- .../fluid/operators/optimizers/momentum_op.cc | 135 ----------------- .../fluid/operators/optimizers/momentum_op.h | 126 ---------------- .../optimizers/unity_build_rule.cmake | 1 - paddle/phi/api/yaml/legacy_ops.yaml | 11 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/op_version.yaml | 21 +++ paddle/phi/api/yaml/ops.yaml | 12 ++ paddle/phi/ops/compat/momentum_sig.cc | 49 ------- 10 files changed, 174 insertions(+), 328 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/momentum_op.cc delete mode 100644 paddle/fluid/operators/optimizers/momentum_op.h delete mode 100644 paddle/phi/ops/compat/momentum_sig.cc diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc index 37196c02e0629c..89ea35ca9e7f9a 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.cc @@ -19,9 +19,9 @@ namespace paddle { namespace operators { -class DGCMomentumOp : public MomentumOp { +class DGCMomentumOp : public framework::OperatorWithKernel { public: - using MomentumOp::MomentumOp; + using framework::OperatorWithKernel::OperatorWithKernel; protected: 
void InferShape(framework::InferShapeContext* ctx) const override { @@ -32,7 +32,82 @@ class DGCMomentumOp : public MomentumOp { OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCMomentumOp"); OP_INOUT_CHECK( ctx->HasOutput("Grad_out"), "Output", "Grad_out", "DGCMomentumOp"); - return MomentumOp::InferShape(ctx); + + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), + true, + platform::errors::NotFound( + "Input(param) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), + true, + platform::errors::NotFound( + "Input(grad) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), + true, + platform::errors::NotFound( + "Input(velocity) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), + true, + platform::errors::NotFound( + "Input(LearningRate) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be phi::DenseTensor, " + "but the received is %s", + ctx->GetInputsVarType("Param").front())); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), + true, + platform::errors::NotFound( + "Output(ParamOut) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("VelocityOut"), + true, + platform::errors::NotFound( + "Output(VelocityOut) of Momentum should not be null.")); + + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(phi::product(lr_dims), + 0, + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + PADDLE_ENFORCE_EQ(phi::product(lr_dims), + 1, + platform::errors::InvalidArgument( + "Learning_rate should be a scalar. But Received " + "LearningRate's dim [%s]", + phi::product(lr_dims))); + + auto param_dim = ctx->GetInputDim("Param"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dim, + ctx->GetInputDim("Grad"), + platform::errors::InvalidArgument( + "Param and Grad input of MomentumOp should have the same " + "dimension. But received Param's dim [%s] and Grad's dim [%s].", + param_dim, + ctx->GetInputDim("Grad"))); + PADDLE_ENFORCE_EQ( + param_dim, + ctx->GetInputDim("Velocity"), + platform::errors::InvalidArgument( + "Param and Velocity of MomentumOp should have the same " + "dimension. 
But received Param's dim [%s] and Velocity [%s].", + param_dim, + ctx->GetInputDim("Velocity"))); + } + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + if (ctx->HasOutput("MasterParamOut")) { + ctx->SetOutputDim("MasterParamOut", param_dim); + } } phi::KernelKey GetKernelTypeForVar( @@ -49,22 +124,75 @@ class DGCMomentumOp : public MomentumOp { return framework::OperatorWithKernel::GetKernelTypeForVar( var_name, tensor, expected_kernel_type); } + + phi::KernelKey GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return phi::KernelKey(input_data_type, ctx.GetPlace()); + } }; -class DGCMomentumOpMaker : public MomentumOpMaker { +class DGCMomentumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { + AddInput("Param", + "(phi::DenseTensor, default phi::DenseTensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(phi::DenseTensor, default phi::DenseTensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(phi::DenseTensor, default phi::DenseTensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(phi::DenseTensor, default phi::DenseTensor) " + "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddInput("current_step", "(Tensor) Current step."); AddInput("nranks", "(Tensor) The number of trainers."); + AddOutput("ParamOut", + "(phi::DenseTensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(phi::DenseTensor) This output is updated velocity. " + "It shared memory with Input(Velocity)."); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable(); AddOutput("Grad_out", "(Tensor) Output grad gradient"); + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("use_nesterov", + "(bool, default false) " + "Use Nesterov Momentum") + .SetDefault(false); + AddAttr("regularization_method", + "(string) regularization_method, right now only " + "support l2decay or none") + .SetDefault(""); + AddAttr("regularization_coeff", "(float) regularization_coeff") + .SetDefault(0.0f); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient with `rescale_grad`" + "before updating. Often choose to be `1.0/batch_size`.") + .SetDefault(1.0f); AddAttr("rampup_begin_step", "(float, -1.0)" "The period when begin DGC.") .SetDefault(-1.0); - return MomentumOpMaker::Make(); + AddComment(R"DOC( +DGC Momentum Operator. 
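For reference, the underlying Momentum update (quoted from the
momentum_op.cc removed later in this patch) is:

$$
velocity = mu * velocity + gradient \\
if (use\_nesterov): \\
  param = param - (gradient + mu * velocity) * learning\_rate \\
else: \\
  param = param - learning\_rate * velocity. \\
$$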
+)DOC"); } }; diff --git a/paddle/fluid/operators/optimizers/dgc_momentum_op.h b/paddle/fluid/operators/optimizers/dgc_momentum_op.h index d41f2116405a12..18d0a78e4bd109 100644 --- a/paddle/fluid/operators/optimizers/dgc_momentum_op.h +++ b/paddle/fluid/operators/optimizers/dgc_momentum_op.h @@ -16,7 +16,8 @@ #include -#include "paddle/fluid/operators/optimizers/momentum_op.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/momentum_kernel.h" #include "paddle/phi/kernels/sgd_kernel.h" diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc deleted file mode 100644 index 538028139b8c46..00000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ /dev/null @@ -1,135 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/optimizers/momentum_op.h" - -#include "paddle/fluid/framework/op_version_registry.h" - -namespace paddle { -namespace operators { - -class MomentumOpInferVarType : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - auto in_var_type = ctx->GetInputType("Param"); - PADDLE_ENFORCE_EQ( - in_var_type == framework::proto::VarType::SELECTED_ROWS || - in_var_type == framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "Only support LodTensor and SelectedRows, Unexpected Input Type.")); - - ctx->SetOutputType("ParamOut", in_var_type, framework::ALL_ELEMENTS); - } -}; - -void MomentumOpMaker::Make() { - AddInput("Param", - "(phi::DenseTensor, default phi::DenseTensor) " - "Input parameter that has to be updated"); - AddInput("Grad", - "(phi::DenseTensor, default phi::DenseTensor) " - "Input gradient of the parameter"); - AddInput("Velocity", - "(phi::DenseTensor, default phi::DenseTensor) " - "Input velocity (corresponding to the parameter) " - "that has to be updated"); - AddInput("LearningRate", - "(phi::DenseTensor, default phi::DenseTensor) " - "Input learning rate"); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - AddOutput("ParamOut", - "(phi::DenseTensor) This output is updated parameter. " - "It shared memory with Input(Param)."); - AddOutput("VelocityOut", - "(phi::DenseTensor) This output is updated velocity. " - "It shared memory with Input(Velocity)."); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. 
" - "It shared memory with Input(MasterParam).") - .AsDispensable(); - - AddAttr("mu", "(float) Momentum coefficient"); - AddAttr("use_nesterov", - "(bool, default false) " - "Use Nesterov Momentum") - .SetDefault(false); - AddAttr( - "regularization_method", - "(string) regularization_method, right now only support l2decay or none") - .SetDefault(""); - AddAttr("regularization_coeff", "(float) regularization_coeff") - .SetDefault(0.0f); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - AddAttr( - "rescale_grad", - "(float, default 1.0) Multiply the gradient with `rescale_grad`" - "before updating. Often choose to be `1.0/batch_size`.") - .SetDefault(1.0f); - - AddComment(R"DOC( -Momentum Optimizer. - -This optimizer has a flag for Nestrov Momentum. -The update equations are as follows: - -$$ -velocity = mu * velocity + gradient \\ -if (use\_nesterov): \\ - param = param - (gradient + mu * velocity) * learning\_rate \\ -else: \\ - param = param - learning\_rate * velocity. \\ -$$ - -)DOC"); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - momentum, - ops::MomentumOp, - ops::MomentumOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::MomentumOpInferVarType); - -REGISTER_OP_VERSION(momentum).AddCheckpoint( - R"ROC( - Upgrade momentum add 4 attributes [regularization_method, regularization_coeff, - multi_precision, rescale_grad]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewInput("MasterParam", "FP32 master weight for AMP.") - .NewOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .NewAttr("regularization_method", - "(string) regularization_method, right now only support " - "l2decay or none", - std::string("")) - .NewAttr("regularization_coeff", "(float) regularization_coeff", 0.0f) - .NewAttr( - "multi_precision", - "(bool) Whether to use multi-precision during weight updating.", - false) - .NewAttr("rescale_grad", - "(float) Multiply the gradient with `rescale_grad`" - "before updating. Often choose to be `1.0/batch_size`.", - 1.0f)); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h deleted file mode 100644 index 316f742a2fd360..00000000000000 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/algorithm.h" - -namespace paddle { -namespace operators { - -class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override; -}; - -class MomentumOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), - true, - platform::errors::NotFound( - "Input(param) of Momentum should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), - true, - platform::errors::NotFound( - "Input(grad) of Momentum should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), - true, - platform::errors::NotFound( - "Input(velocity) of Momentum should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("LearningRate"), - true, - platform::errors::NotFound( - "Input(LearningRate) of Momentum should not be null.")); - PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be phi::DenseTensor, " - "but the received is %s", - ctx->GetInputsVarType("Param").front())); - - PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), - true, - platform::errors::NotFound( - "Output(ParamOut) of Momentum should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("VelocityOut"), - true, - platform::errors::NotFound( - "Output(VelocityOut) of Momentum should not be null.")); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), - 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), - 1, - platform::errors::InvalidArgument( - "Learning_rate should be a scalar. But Received " - "LearningRate's dim [%s]", - phi::product(lr_dims))); - - auto param_dim = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dim, - ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of MomentumOp should have the same " - "dimension. But received Param's dim [%s] and Grad's dim [%s].", - param_dim, - ctx->GetInputDim("Grad"))); - PADDLE_ENFORCE_EQ( - param_dim, - ctx->GetInputDim("Velocity"), - platform::errors::InvalidArgument( - "Param and Velocity of MomentumOp should have the same " - "dimension. 
But received Param's dim [%s] and Velocity [%s].", - param_dim, - ctx->GetInputDim("Velocity"))); - } - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("VelocityOut", param_dim); - if (ctx->HasOutput("MasterParamOut")) { - ctx->SetOutputDim("MasterParamOut", param_dim); - } - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 05daf4cad0cf8a..86cf7fed5a9a21 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -8,7 +8,6 @@ register_unity_group( cc ftrl_op.cc lars_momentum_op.cc - momentum_op.cc proximal_adagrad_op.cc adagrad_op.cc adam_op.cc diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 60ab3606853061..620548a90a894b 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -948,17 +948,6 @@ func : mish backward : mish_grad -- op : momentum_ - args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, float mu, bool use_nesterov = false, str regularization_method = "", float regularization_coeff = 0.0, bool multi_precision = false, float rescale_grad = 1.0f) - output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out) - infer_meta: - func : MomentumInferMeta - kernel : - func : momentum - data_type : param - optional : master_param - inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) - - op : multiclass_nms3 args : (Tensor bboxes, Tensor scores, Tensor rois_num, float score_threshold, int nms_top_k, int keep_top_k, float nms_threshold=0.3, bool normalized=true, float nms_eta=1.0, int background_label=0) output : Tensor(out), Tensor(index), Tensor(nms_rois_num) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index aff1486f2ae696..2ab480f517b1ae 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1431,6 +1431,12 @@ outputs : {out : Out, indices : Indices} +- op : momentum_ + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut} + - op : multi_dot backward : multi_dot_grad inputs : diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index b36cd86f78ae8a..e8bdbb28259f9e 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -153,6 +153,27 @@ - add_output : RoisNum comment : The number of RoIs in each image. +- op : momentum + version : + - checkpoint : Upgrade momentum add 4 attributes [regularization_method, regularization_coeff, multi_precision, rescale_grad]. + action : + - add_input : MasterParam + comment : FP32 master weight for AMP. + - add_output : MasterParamOut + comment : The updated FP32 master weight for AMP. It shared memory with Input(MasterParam). 
+ - add_attr : regularization_method + comment : (string) regularization_method, right now only support l2decay or none + default : std::string("") + - add_attr : regularization_coeff + comment : (float) regularization_coeff + default : 0.0 + - add_attr : multi_precision + comment : (bool) Whether to use multi-precision during weight updating. + default : "false" + - add_attr : rescale_grad + comment : (float) Multiply the gradient with `rescale_grad` before updating. Often choose to be `1.0/batch_size`. + default : 1.0 + - op : not_equal version : - checkpoint : Upgrade compare ops, add a new attribute [force_cpu] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 91f7ba04d061a1..23ae6a8baa75b9 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1184,6 +1184,18 @@ func : mode backward : mode_grad +- op : momentum_ + args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, float mu, bool use_nesterov = false, str regularization_method = "", float regularization_coeff = 0.0f, bool multi_precision = false, float rescale_grad = 1.0f) + output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out) + infer_meta: + func : MomentumInferMeta + kernel : + func : momentum {dense, dense, dense, dense, dense -> dense, dense, dense}, + momentum_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense -> dense, dense, dense} + data_type : param + optional : master_param, master_param_out + inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) + - op : multi_dot args : (Tensor[] x) output : Tensor diff --git a/paddle/phi/ops/compat/momentum_sig.cc b/paddle/phi/ops/compat/momentum_sig.cc deleted file mode 100644 index 3511ddc63c891c..00000000000000 --- a/paddle/phi/ops/compat/momentum_sig.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature MomentumOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature( - "momentum", - {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, - {"mu", - "use_nesterov", - "regularization_method", - "regularization_coeff", - "multi_precision", - "rescale_grad"}, - {"ParamOut", "VelocityOut", "MasterParamOut"}); - } else if (ctx.IsSelectedRowsInput("Grad")) { - return KernelSignature( - "momentum_dense_param_sparse_grad", - {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, - {"mu", - "use_nesterov", - "regularization_method", - "regularization_coeff", - "multi_precision", - "rescale_grad"}, - {"ParamOut", "VelocityOut", "MasterParamOut"}); - } - - return KernelSignature("unregistered", {}, {}, {}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(momentum, phi::MomentumOpArgumentMapping); From 5b09dd565465455b0fcfd04a2a5f80a3392ee000 Mon Sep 17 00:00:00 2001 From: Thomas Young <35565423+HexToString@users.noreply.github.com> Date: Tue, 11 Apr 2023 10:42:02 +0800 Subject: [PATCH 039/156] [AMP OP&Test] add bf16 fp16 type support for expand_v2_op and top_k_v2_op (#51263) --- .../phi/kernels/funcs/top_k_function_cuda.h | 29 +++++++++ paddle/phi/kernels/gpu/top_k_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/top_k_kernel.cu | 5 +- .../tests/unittests/test_expand_v2_op.py | 60 +++++++++++++++++-- .../fluid/tests/unittests/test_top_k_v2_op.py | 52 +++++++++++++++- python/paddle/tensor/manipulation.py | 10 +++- 6 files changed, 151 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index de58c05149a53d..26374ca36007a3 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" @@ -49,6 +50,10 @@ namespace detail { template <> struct radix_key_codec_base : radix_key_codec_integral {}; + +template <> +struct radix_key_codec_base + : radix_key_codec_integral {}; } // namespace detail } // namespace rocprim namespace cub = hipcub; @@ -58,6 +63,12 @@ namespace cub { template <> struct NumericTraits : BaseTraits {}; + +template <> +struct NumericTraits + : BaseTraits { +}; + } // namespace cub #endif @@ -586,6 +597,24 @@ struct RadixTypeConfig { } }; +template <> +struct RadixTypeConfig { + typedef uint32_t RadixType; + + static inline __device__ RadixType Convert(phi::dtype::bfloat16 v) { + RadixType x = v.x; + RadixType mask = (x & 0x00008000) ? 0x0000ffff : 0x00008000; + return (v == v) ? (x ^ mask) : 0xffff; + } + + static inline __device__ phi::dtype::bfloat16 Deconvert(RadixType v) { + RadixType mask = (v & 0x00008000) ? 
0x00008000 : 0x0000ffff; + phi::dtype::bfloat16 r; + r.x = (v ^ mask); + return r; + } +}; + /*---------------------------Helper Functions------------------*/ __device__ __forceinline__ int GetLaneId() { int lane_id; diff --git a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu index 638d53c010ce64..6c2e880e9a9efb 100644 --- a/paddle/phi/kernels/gpu/top_k_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_grad_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/top_k_grad_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/top_k_function_cuda.h" @@ -89,4 +89,5 @@ PD_REGISTER_KERNEL(topk_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index 6811b3e31db544..e2793955ef9c17 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -15,11 +15,13 @@ #include "paddle/phi/kernels/top_k_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/top_k_function_cuda.h" + namespace phi { #define FIXED_BLOCK_DIM_BASE(dim, ...) \ @@ -348,6 +350,7 @@ PD_REGISTER_KERNEL(topk, double, int, int64_t, - phi::dtype::float16) { + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::DataType::INT64); } diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 5c0f6ff707fb45..27fc92292f36f9 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -17,7 +17,7 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid @@ -202,6 +202,56 @@ def test_check_output(self): self.check_output() +# Situation 7: input x is Float16 +class TestExpandV2FP16Op(OpTest): + def setUp(self): + self.op_type = "expand_v2" + self.prim_op_type = "prim" + self.dtype = np.float16 + self.python_api = paddle.expand + self.public_python_api = paddle.expand + self.inputs = { + 'X': np.random.randint(10, size=(8, 8, 5)).astype(self.dtype) + } + self.attrs = {'shape': [8, 8, 5]} + output = np.tile(self.inputs['X'], (1, 1, 1)) + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', check_prim=True) + + +# Situation 8: input x is BF16 +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestExpandV2BF16Op(OpTest): + def setUp(self): + self.op_type = "expand_v2" + self.prim_op_type = "prim" + self.dtype = np.uint16 + self.python_api = paddle.expand + self.public_python_api = paddle.expand + x = np.random.randint(10, size=(8, 8, 5)).astype(np.float32) + self.inputs = {'X': 
convert_float_to_uint16(x)} + self.attrs = {'shape': [8, 8, 5]} + output = np.tile(x, (1, 1, 1)).astype(np.float32) + self.outputs = {'Out': convert_float_to_uint16(output)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', check_prim=True) + + class TestExpandV2Error(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -338,7 +388,7 @@ def test_grad(self): self.func(p) -# Situation 7: comp case, shape is a list(without tensor) +# Situation 9: comp case, shape is a list(without tensor) class TestExpandV2CompOpRank1(OpTest): def setUp(self): self.op_type = "expand_v2" @@ -392,7 +442,7 @@ def init_data(self): self.expand_times = (1, 1, 1, 1) -# Situation 8: comp case, input x is Integer +# Situation 10: comp case, input x is Integer class TestExpandV2CompOpInteger(OpTest): def setUp(self): self.op_type = "expand_v2" @@ -410,7 +460,7 @@ def test_check_output(self): self.check_output(check_prim=True) -# Situation 9: comp case, input x is Bool +# Situation 11: comp case, input x is Bool class TestExpandV2CompOpBoolean(OpTest): def setUp(self): self.op_type = "expand_v2" @@ -426,7 +476,7 @@ def test_check_output(self): self.check_output(check_prim=True) -# Situation 10: comp case, input x is Integer +# Situation 12: comp case, input x is Integer class TestExpandV2CompOpInt64_t(OpTest): def setUp(self): self.op_type = "expand_v2" diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py index d64906560dc092..5612703968dad0 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -189,6 +189,56 @@ def setUp(self): self.outputs = {'Out': output, 'Indices': indices} +class TestTopkFP16Op(TestTopkOp): + def setUp(self): + self.op_type = "top_k_v2" + self.python_api = paddle.topk + self.public_python_api = paddle.topk + self.dtype = np.float16 + self.prim_op_type = "prim" + self.input_data = np.random.rand(10, 20).astype(self.dtype) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest + ) + self.outputs = {'Out': output, 'Indices': indices} + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestTopkBF16Op(TestTopkOp): + def setUp(self): + self.op_type = "top_k_v2" + self.python_api = paddle.topk + self.public_python_api = paddle.topk + self.dtype = np.uint16 + self.prim_op_type = "prim" + self.input_data = np.random.rand(10, 20).astype(np.float32) + self.init_args() + self.inputs = {'X': convert_float_to_uint16(self.input_data)} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest + ) + self.outputs = { + 'Out': convert_float_to_uint16(output), + 'Indices': indices, + } + + def test_check_output(self): + place = core.CUDAPlace(0) + 
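        # The skipIf above guarantees a bfloat16-capable CUDA device, so the
        # checks below pin execution to CUDAPlace(0) instead of using the
        # default check_output()/check_grad().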
self.check_output_with_place(place, check_eager=True)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, {'X'}, 'Out', check_eager=True)
+
+
 class TestTopKAPI(unittest.TestCase):
     def setUp(self):
         np.random.seed(123)
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 40a83f6dbf3707..09aaff08c3ca5e 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -3418,7 +3418,15 @@ def expand(x, shape, name=None):
         check_variable_and_dtype(
             x,
             'x',
-            ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+            [
+                'bool',
+                'float16',
+                'float32',
+                'float64',
+                'int32',
+                'int64',
+                'uint16',
+            ],
             'expand',
         )
         check_type(shape, 'shape', (list, tuple, Variable), 'expand')

From b0ebd3447077b4603b530cbacd11fad963852e34 Mon Sep 17 00:00:00 2001
From: mhy-666 <57670156+mhy-666@users.noreply.github.com>
Date: Tue, 11 Apr 2023 10:42:13 +0800
Subject: [PATCH 040/156] [AMP OP&Test] add fp16/bf16 unittest for softmax_with_cross_entropy ops (#52412)

* add softmax_with_cross_entropy bf16 test

* correct default value in testBF16/FP16 op

* fix test check_output/grad, add skipif
---
 .../test_softmax_with_cross_entropy_op.py     | 65 +++++++++++++++++--
 1 file changed, 61 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index ca11dd0a2bdbf7..51aa23306481bd 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -15,7 +15,7 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 from test_softmax_op import stable_softmax

 import paddle
@@ -478,6 +478,7 @@ def initParams(self):
     def setUp(self):
         self.initParams()
         self.op_type = "softmax_with_cross_entropy"
+        self.dtype = np.float16

         # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
         date_type = np.float32 if core.is_compiled_with_rocm() else np.float64
@@ -508,12 +509,12 @@ def setUp(self):

     def test_check_output(self):
         if self.python_api is not None:
-            self.check_output(atol=1e-2)
-        self.check_output(atol=1e-2)
+            self.check_output()
+        self.check_output()

     def test_check_grad(self):
         if self.python_api is not None:
-            self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+            self.check_grad(["Logits"], "Loss")
         self.check_grad(["Logits"], "Loss", max_relative_error=0.1)


@@ -917,6 +918,62 @@ def initParams(self):
         self.use_softmax = True


+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and not support the bfloat16",
+)
+class TestSoftmaxWithCrossEntropyOpBF16(TestSoftmaxWithCrossEntropyOp):
+    def setUp(self):
+        self.initParams()
+        self.op_type = "softmax_with_cross_entropy"
+        self.dtype = np.uint16
+
+        # NOTE: numpy bf16 have very low accuracy, use float32 for numpy check.
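        # Reference values are computed in float32 and then packed into
        # uint16 storage with convert_float_to_uint16 below, which is how
        # OpTest represents bfloat16 tensors.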
+ date_type = np.float32 + logits = getattr( + self, + "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(date_type), + ) + softmax = np.apply_along_axis(stable_softmax, self.axis, logits) + + axis_dim = self.shape[self.axis] + self.shape[self.axis] = 1 + labels = np.random.randint(0, axis_dim, self.shape, dtype="int64") + + loss = cross_entropy(softmax, labels, self.soft_label, self.axis) + + self.inputs = { + "Logits": convert_float_to_uint16(logits), + "Label": labels, + } + self.outputs = { + "Softmax": convert_float_to_uint16(softmax), + "Loss": convert_float_to_uint16(loss), + } + self.attrs = { + "numeric_stable_mode": self.numeric_stable_mode, + "soft_label": self.soft_label, + } + if self.axis != -1: + self.attrs['axis'] = self.axis + + def test_check_output(self): + place = core.CUDAPlace(0) + if self.python_api is not None: + self.check_output_with_place(place) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if self.python_api is not None: + self.check_grad_with_place(place, ["Logits"], "Loss") + self.check_grad_with_place( + place, ["Logits"], "Loss", max_relative_error=0.1 + ) + + if __name__ == "__main__": paddle.enable_static() unittest.main() From c4e1fcba23544afdc9e3d6255991f38e09888883 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 10:45:05 +0800 Subject: [PATCH 041/156] support auto generate for op adagrad optimizer (#52695) --- .../fluid/operators/optimizers/adagrad_op.cc | 92 ------------------- .../optimizers/unity_build_rule.cmake | 1 - paddle/phi/api/yaml/legacy_ops.yaml | 12 --- paddle/phi/api/yaml/op_compat.yaml | 6 ++ paddle/phi/api/yaml/ops.yaml | 12 +++ paddle/phi/ops/compat/adagrad_sig.cc | 39 -------- 6 files changed, 18 insertions(+), 144 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/adagrad_op.cc delete mode 100644 paddle/phi/ops/compat/adagrad_sig.cc diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc deleted file mode 100644 index 95f4092d358037..00000000000000 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/selected_rows_functor.h" - -namespace paddle { -namespace operators { - -class AdagradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"), - ctx.GetPlace()); - } -}; - -class AdagradOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", "(Tensor) Input parameter"); - AddInput("Grad", "(Tensor) Input gradient"); - AddInput("Moment", "(Tensor) Second moment"); - AddInput("LearningRate", "(Tensor) Learning rate"); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("MomentOut", "(Tensor) Output second moment"); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .AsDispensable(); - - AddAttr("epsilon", - "(float, default 1.0e-6) " - "Constant for numerical stability") - .SetDefault(1.0e-6f); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - AddComment(R"DOC( - -Adaptive Gradient Algorithm (Adagrad). - -The update is done as follows: - -$$moment\_out = moment + grad * grad \\ -param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} -$$ - -The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) -does not have the epsilon attribute. It is added here in our implementation -as also proposed here: http://cs231n.github.io/neural-networks-3/#ada -for numerical stability to avoid the division by zero error. 
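
(For reference, a small numpy sketch of the update rule spelled out in the DOC
block above; the numbers are illustrative and not taken from the patch:)

    import numpy as np

    lr, epsilon = 0.01, 1.0e-6
    param = np.array([1.0, 2.0])
    grad = np.array([0.5, -0.5])
    moment = np.zeros_like(param)

    moment_out = moment + grad * grad  # accumulate squared gradients
    param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
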
- -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(adagrad, - AdagradInferShapeFunctor, - PD_INFER_META(phi::AdagradInferMeta)); -REGISTER_OP_WITHOUT_GRADIENT(adagrad, - ops::AdagradOp, - ops::AdagradOpMaker, - AdagradInferShapeFunctor); diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 86cf7fed5a9a21..676d554bc00733 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -9,7 +9,6 @@ register_unity_group( ftrl_op.cc lars_momentum_op.cc proximal_adagrad_op.cc - adagrad_op.cc adam_op.cc adamax_op.cc dgc_momentum_op.cc diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 620548a90a894b..c70d365927f1c1 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -21,18 +21,6 @@ optional : master_param inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out) -- op : adagrad_ - args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon, bool multi_precision) - output : Tensor(param_out), Tensor(moment_out), Tensor(master_param_out) - infer_meta : - func : AdagradInferMeta - kernel : - func : adagrad {dense, dense, dense, dense, dense -> dense, dense, dense} - adagrad_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense-> dense, dense, dense} - data_type : param - optional : master_param - inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out) - - op : adam_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 2ab480f517b1ae..baf9c516a6ca87 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -44,6 +44,12 @@ extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] +- op : adagrad_ + inputs : + { param : Param, grad : Grad, moment : Moment, learning_rate : LearningRate, master_param : MasterParam } + outputs : + { param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut } + - op : add (elementwise_add) backward : add_grad (elementwise_add_grad) extra : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 23ae6a8baa75b9..91e478c008a903 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -32,6 +32,18 @@ func : acosh backward : acosh_grad +- op : adagrad_ + args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, Tensor master_param, float epsilon = 1.0e-6f, bool multi_precision = false) + output : Tensor(param_out), Tensor(moment_out), Tensor(master_param_out) + infer_meta : + func : AdagradInferMeta + kernel : + func : adagrad {dense, dense, dense, dense, dense -> dense, dense, dense} + adagrad_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense -> dense, dense, 
dense} + data_type : param + optional : master_param, master_param_out + inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out) + - op : addmm args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0) output : Tensor diff --git a/paddle/phi/ops/compat/adagrad_sig.cc b/paddle/phi/ops/compat/adagrad_sig.cc deleted file mode 100644 index 2a79261d21feba..00000000000000 --- a/paddle/phi/ops/compat/adagrad_sig.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature AdagradOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature( - "adagrad", - {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, - {"epsilon", "multi_precision"}, - {"ParamOut", "MomentOut", "MasterParamOut"}); - } else if (ctx.IsSelectedRowsInput("Grad")) { - return KernelSignature( - "adagrad_dense_param_sparse_grad", - {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}, - {"epsilon", "multi_precision"}, - {"ParamOut", "MomentOut", "MasterParamOut"}); - } - - return KernelSignature("unregistered", {}, {}, {}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(adagrad, phi::AdagradOpArgumentMapping); From ab754417ae0ff6ae18592cb711245caca0be757c Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Tue, 11 Apr 2023 10:47:38 +0800 Subject: [PATCH 042/156] add autogen code support for reverse op (#52701) * add autogen code support for reverse op * bug fixed --- paddle/fluid/operators/reverse_op.cc | 117 ----------------------- paddle/phi/api/yaml/backward.yaml | 6 ++ paddle/phi/api/yaml/legacy_backward.yaml | 6 -- paddle/phi/api/yaml/legacy_ops.yaml | 9 -- paddle/phi/api/yaml/op_compat.yaml | 11 +++ paddle/phi/api/yaml/ops.yaml | 10 ++ 6 files changed, 27 insertions(+), 132 deletions(-) delete mode 100644 paddle/fluid/operators/reverse_op.cc diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc deleted file mode 100644 index 07c3aac52078a5..00000000000000 --- a/paddle/fluid/operators/reverse_op.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
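
(The reverse_grad entry added to backward.yaml later in this patch simply
re-invokes reverse on out_grad along the same axes; a quick numpy check of
that identity, as a sketch rather than part of the patch:)

    import numpy as np

    x = np.arange(6).reshape(2, 3)
    out = np.flip(x, axis=0)                  # forward: reverse along axis 0
    assert (np.flip(out, axis=0) == x).all()  # reversing again restores x
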
- -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class ReverseOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); - - return phi::KernelKey(input_data_type, ctx.GetPlace()); - } -}; - -class ReverseOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext* ctx) const override { - ctx->SetOutputType("Out", ctx->GetInputType("X")); - ctx->SetOutputDataType("Out", ctx->GetInputDataType("X")); - } -}; - -class ReverseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The phi::DenseTensor to be flipped."); - AddOutput("Out", "The phi::DenseTensor after flipping."); - AddAttr>( - "axis", "The axises that along which order of elements is reversed.") - .SupportTensor(); - AddComment(R"DOC( - Reverse Operator. - - Reverse the order of elements in the input phi::DenseTensor along given axises. - - Case 1: - Given - X = [[1, 2, 3, 4, 5] - [6, 7, 8, 9, 10] - [11, 12, 13, 14, 15]], - and - axis = [0], - we get: - Out = [[11, 12, 13, 14, 15] - [6, 7, 8, 9, 10] - [1, 2, 3, 4, 5]]. - - Case 2: - Given - X = [[[1, 2, 3, 4] - [5, 6, 7, 8]] - [[9, 10, 11, 12] - [13, 14, 15, 16]]], - and - axis = [0, 2], - we get: - Out = [[[12, 11, 10, 9] - [16, 15, 14, 13]] - [[4, 3, 2, 1] - [8, 7, 6, 5]]], - )DOC"); - } -}; - -template -class ReverseGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("reverse"); - grad_op->SetInput("X", this->OutputGrad("Out")); - grad_op->SetOutput("Out", this->InputGrad("X")); - grad_op->SetAttr("axis", this->GetAttr("axis")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(reverse, - ReverseInferShapeFunctor, - PD_INFER_META(phi::ReverseInferMeta)); -REGISTER_OPERATOR(reverse, - ops::ReverseOp, - ops::ReverseOpMaker, - ops::ReverseGradMaker, - ops::ReverseGradMaker, - ops::ReverseOpVarTypeInference, - ReverseInferShapeFunctor); -REGISTER_OPERATOR(reverse_grad, ops::ReverseOp, ops::ReverseOpVarTypeInference); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 1c3599d07a595d..f4608f008535ca 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1328,6 +1328,12 @@ kernel : func : renorm_grad +- backward_op : reverse_grad + forward : reverse (Tensor x, IntArray axis) -> Tensor(out) + args : (Tensor out_grad, IntArray axis) + output : Tensor(x_grad) + invoke : reverse(out_grad, axis) + - backward_op : roll_grad forward : roll(Tensor x, IntArray shifts, int64_t[] axis) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shifts, int64_t[] axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 3cf1904b1ed724..181b819cde96ee 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -884,12 +884,6 @@ backward : reshape_double_grad 
inplace : (out_grad -> x_grad) -- backward_op : reverse_grad - forward : reverse (Tensor x, IntArray axis) -> Tensor(out) - args : (Tensor out_grad, IntArray axis) - output : Tensor(x_grad) - invoke : reverse(out_grad, axis) - - backward_op : rnn_grad forward : rnn (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob, bool is_bidirec, int input_size, int hidden_size, int num_layers, str mode, int seed, bool is_test) -> Tensor(out), Tensor(dropout_state_out), Tensor[](state), Tensor(reserve) args : (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor out, Tensor dropout_state_out, Tensor reserve, Tensor out_grad, Tensor[] state_grad, float dropout_prob, bool is_bidirec, int input_size, int hidden_size, int num_layers, str mode, int seed, bool is_test) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index c70d365927f1c1..89aef2203ccaf2 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1133,15 +1133,6 @@ intermediate : xshape backward: reshape_grad -- op : reverse - args : (Tensor x, IntArray axis) - output : Tensor - infer_meta : - func : ReverseInferMeta - kernel : - func : reverse - backward : reverse_grad - - op : rmsprop_ args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision) output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index baf9c516a6ca87..8e36affbb7596b 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1751,6 +1751,17 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_quantizer = false] +- op : reverse + inputs: + x : X + outputs: + out : Out + int_array: + axis : + data_type : int + support_tensor : true + manual_signature : [reverse] + - op : roll backward : roll_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 91e478c008a903..8de00fc785ca0a 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1436,6 +1436,16 @@ func : renorm backward : renorm_grad +- op : reverse + args : (Tensor x, IntArray axis) + output : Tensor + infer_meta : + func : ReverseInferMeta + kernel : + func : reverse + data_type : x + backward : reverse_grad + - op : roll args : (Tensor x, IntArray shifts={}, int64_t[] axis={}) output : Tensor(out) From f5332cad5d3abf7a05118512cd9de905d55ddcac Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Tue, 11 Apr 2023 10:49:18 +0800 Subject: [PATCH 043/156] [CINN] Reopen some prim+cinn unittest (#52595) * [CINN] Reopen some prim+cinn unittest * revert scatter for ci * change cinn test timeout from 120 to 150 --- .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/test_assign_op.py | 2 -- .../fluid/tests/unittests/test_concat_op.py | 8 +++++- .../fluid/tests/unittests/test_cumsum_op.py | 4 +-- .../fluid/tests/unittests/test_dropout_op.py | 16 ++++++------ .../unittests/test_elementwise_add_op.py | 3 --- .../unittests/test_elementwise_max_op.py | 25 +++++++++++++++++-- .../tests/unittests/test_gather_nd_op.py | 3 +-- .../tests/unittests/test_group_norm_op.py | 10 ++++++-- .../fluid/tests/unittests/test_mean_op.py | 2 
+- .../fluid/tests/unittests/test_split_op.py | 1 - .../fluid/tests/unittests/test_squeeze2_op.py | 1 - .../tests/unittests/test_transpose_op.py | 1 - 13 files changed, 52 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 999006a004431d..6f461538a7c8d9 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1136,6 +1136,7 @@ set(TEST_CINN_OPS foreach(TEST_CINN_OPS ${TEST_CINN_OPS}) if(WITH_CINN) set_tests_properties(${TEST_CINN_OPS} PROPERTIES LABELS "RUN_TYPE=CINN") + set_tests_properties(${TEST_CINN_OPS} PROPERTIES TIMEOUT 150) endif() endforeach() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index c7fc518986d351..22efd0ac661754 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -32,7 +32,6 @@ def setUp(self): self.public_python_api = paddle.assign self.op_type = "assign" self.prim_op_type = "prim" - self.enable_cinn = False x = np.random.random(size=(100, 10)).astype('float64') self.inputs = {'X': x} self.outputs = {'Out': x} @@ -57,7 +56,6 @@ def setUp(self): self.public_python_api = paddle.assign self.op_type = "assign" self.prim_op_type = "prim" - self.enable_cinn = False x = np.random.random(size=(100, 10)).astype('float16') self.inputs = {'X': x} self.outputs = {'Out': x} diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 13135eb31b8de2..664664dc247025 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -30,9 +30,9 @@ def setUp(self): self.python_api = paddle.concat self.public_python_api = paddle.concat self.prim_op_type = "prim" - self.enable_cinn = False self.dtype = self.get_dtype() self.init_test_data() + self.if_enable_cinn() self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} self.attrs = {'axis': self.axis} if self.axis < 0: @@ -82,6 +82,9 @@ def init_test_data(self): self.x2 = np.random.random((5, 3, 4, 5)).astype(self.dtype) self.axis = 1 + def if_enable_cinn(self): + pass + class TestConcatOp2(TestConcatOp): def init_test_data(self): @@ -291,6 +294,9 @@ class TestConcatBf16(parent): def get_dtype(self): return np.uint16 + def if_enable_cinn(self): + self.enable_cinn = False + cls_name = "{}_{}".format(parent.__name__, "Bf16") TestConcatBf16.__name__ = cls_name globals()[cls_name] = TestConcatBf16 diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 98bcc81aeb3a25..b2031a792bd0ec 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -314,7 +314,7 @@ def setUp(self): self.python_api = cumsum_wrapper self.public_python_api = paddle.cumsum self.init_dtype() - self.enable_cinn = False + self.enable_cinn = True self.attrs = {'axis': 2, "exclusive": True} self.x = np.random.random((4, 5, 20)).astype(self.dtype) self.out = np.concatenate( @@ -389,7 +389,7 @@ def init_dtype(self): self.dtype = self.dtype_ = np.float16 def set_enable_cinn(self): - self.enable_cinn = False + self.enable_cinn = True def test_check_output(self): self.check_output() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py 
b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 6c5d88155bcd41..6cae44cc471c22 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -308,7 +308,6 @@ def setUp(self): 'is_test': True, } self.outputs = {'Out': out} - self.enable_cinn = False # Because prim op compare res with dygraph # when p = 0 dropout api return x,in dygraph mode x_grad = out_grad, # but in static mode x_grad = [] @@ -1689,23 +1688,25 @@ def test_jit_comp_with_cinn(self): rev_actual = [] paddle.disable_static() for place in self.places: - if isinstance(place, fluid.CPUPlace): - paddle.set_device("cpu") - if isinstance(place, fluid.CUDAPlace): - paddle.set_device("gpu") + if not isinstance(place, fluid.CUDAPlace): + continue + paddle.set_device("gpu") paddle.seed(self.seed) input_ = paddle.to_tensor( data=self.x, dtype=self.dtype, place=place, stop_gradient=False ) net = PrimNet() - net = apply_to_static(net, False) + net = apply_to_static(net, True) output = net( input_, self.p, training=(not self.is_test), mode=self.mode ) grad = paddle.grad(output, input_) fwd_actual.append(output.numpy()) rev_actual.append(grad[0].numpy()) - for i in range(len(self.places)): + i = 0 + for place in self.places: + if not isinstance(self.places[i], fluid.CUDAPlace): + continue np.testing.assert_allclose( self.fwd_desire[i].sum(), fwd_actual[i].sum(), @@ -1718,6 +1719,7 @@ def test_jit_comp_with_cinn(self): rtol=1e-2, # mean of uniform distribution, scale for avoid random failed atol=0, ) + i += 1 if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index a10c6d186205ae..2ef1a2a1178278 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -497,9 +497,6 @@ def init_input_output(self): self.y = np.random.rand(100, 1).astype(self.dtype) self.out = self.x + self.y.reshape(1, 100, 1) - def if_enable_cinn(self): - self.enable_cinn = False - @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." 
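
(A minimal sketch of the opt-out hook the next hunks introduce; the class
names are illustrative, and eager_op_test is the harness this suite already
imports:)

    import numpy as np
    from eager_op_test import OpTest

    class ExampleMaxOp(OpTest):
        def setUp(self):
            self.op_type = "elementwise_max"
            self.if_enable_cinn()  # hook: subclasses may opt out of CINN
            x = np.random.rand(13, 17).astype("float64")
            y = np.random.rand(13, 17).astype("float64")
            self.inputs = {'X': x, 'Y': y}
            self.outputs = {'Out': np.maximum(x, y)}

        def if_enable_cinn(self):
            pass  # default: leave the CINN pass enabled

    class ExampleZeroDimCase(ExampleMaxOp):
        def if_enable_cinn(self):
            self.enable_cinn = False  # 0-D cases are not compiled by CINN yet
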
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
index 11493ab44e3c5f..d3ff2c4f7c8ced 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
@@ -36,7 +36,7 @@ def setUp(self):
         self.init_data()
         self.op_type = "elementwise_max"
         self.prim_op_type = "prim"
-        self.enable_cinn = False
+        self.if_enable_cinn()
         self.python_api = paddle.maximum
         self.public_python_api = paddle.maximum
         self.inputs = {'X': self.x, 'Y': self.y}
@@ -95,6 +95,9 @@ def test_check_grad_ingore_y(self):
             check_prim=True,
         )
 
+    def if_enable_cinn(self):
+        pass
+
 
 class TestElementwiseFP16Op(TestElementwiseOp):
     def init_data(self):
@@ -108,7 +111,7 @@ def setUp(self):
         self.init_data()
         self.op_type = "elementwise_max"
         self.prim_op_type = "prim"
-        self.enable_cinn = False
+        self.if_enable_cinn()
         self.python_api = paddle.maximum
         self.dtype = np.float16
         self.public_python_api = paddle.maximum
@@ -121,36 +124,54 @@ def init_data(self):
         self.x = np.random.uniform(0.1, 1, []).astype("float64")
         self.y = np.random.uniform(0.1, 1, []).astype("float64")
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 class TestElementwiseMaxFP16Op_ZeroDim1(TestElementwiseFP16Op):
     def init_data(self):
         self.x = np.random.uniform(0.1, 1, []).astype(np.float16)
         self.y = np.random.uniform(0.1, 1, []).astype(np.float16)
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 class TestElementwiseMaxOp_ZeroDim2(TestElementwiseOp):
     def init_data(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype("float64")
         self.y = np.random.uniform(0.1, 1, []).astype("float64")
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 class TestElementwiseMaxFP16Op_ZeroDim2(TestElementwiseFP16Op):
     def init_data(self):
         self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float16)
         self.y = np.random.uniform(0.1, 1, []).astype(np.float16)
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 class TestElementwiseMaxOp_ZeroDim3(TestElementwiseOp):
     def init_data(self):
         self.x = np.random.uniform(0.1, 1, []).astype("float64")
         self.y = np.random.uniform(0.1, 1, [13, 17]).astype("float64")
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 class TestElementwiseMaxFP16Op_ZeroDim3(TestElementwiseFP16Op):
     def init_data(self):
         self.x = np.random.uniform(0.1, 1, []).astype(np.float16)
         self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float16)
 
+    def if_enable_cinn(self):
+        self.enable_cinn = False
+
 
 @unittest.skipIf(
     core.is_compiled_with_cuda()
diff --git a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
index f4f68bbbae0896..c564d2ae8c303e 100644
--- a/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_nd_op.py
@@ -141,7 +141,6 @@ def setUp(self):
         self.prim_op_type = "prim"
         self.python_api = paddle.gather_nd
         self.public_python_api = paddle.gather_nd
-        self.enable_cinn = False
self.config_dtype() if self.dtype == np.float64: target_dtype = "float64" diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py index 45986b231ac493..71d3e687dae1f9 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py @@ -1217,6 +1217,8 @@ def test_jit_comp_with_cinn(self): fwd_actual = [] rev_actual = [] for place in self.places: + if not isinstance(place, fluid.CUDAPlace): + continue input_ = paddle.to_tensor( data=self.x, dtype=self.dtype, place=place, stop_gradient=False ) @@ -1241,13 +1243,16 @@ def test_jit_comp_with_cinn(self): self.data_format, ) # failed in cinn test - net = apply_to_static(net, False) + net = apply_to_static(net, True) output = net(input_) grad = paddle.grad(output, input_) fwd_actual.append(output.numpy()) rev_actual.append(grad[0].numpy()) - for i in range(len(self.places)): + i = 0 + for place in self.places: + if not isinstance(place, fluid.CUDAPlace): + continue atol = self.threshold_list[i][2] rtol = self.threshold_list[i][2] np.testing.assert_allclose( @@ -1269,6 +1274,7 @@ def test_jit_comp_with_cinn(self): atol=atol, err_msg='%s jit_cinn rev' % self.places[i], ) + i += 1 if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 5c1d2c23b18f64..8e48a334ee40c2 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -176,7 +176,7 @@ def set_attrs(self): pass def if_enable_cinn(self): - self.enable_cinn = False + pass def test_check_output(self): if self.dtype != 'float16': diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 18cfbe59fe83d5..3149ca82b3f623 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -196,7 +196,6 @@ def setUp(self): self.python_out_sig = ['out0', 'out1', 'out2'] self._set_op_type() self.prim_op_type = "prim" - self.enable_cinn = False self.dtype = self.get_dtype() self.init_data() self.inputs = {'X': self.x} diff --git a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py index e34879a3c83bfa..8a5c5e74efcaf0 100755 --- a/python/paddle/fluid/tests/unittests/test_squeeze2_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze2_op.py @@ -73,7 +73,6 @@ def setUp(self): self.prim_op_type = "comp" self.python_api = paddle.squeeze self.public_python_api = paddle.squeeze - self.enable_cinn = False self.python_out_sig = [ "Out" ] # python out sig is customized output signature. 
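
(Context for the dropout/group_norm hunks above: the jit+CINN comparison only
runs on CUDA places. A sketch of that gating, assuming the expected values
were recorded once per place; enumerate keeps the index aligned without a
hand-incremented counter:)

    import paddle
    from paddle import fluid

    places = [fluid.CPUPlace()]
    if paddle.is_compiled_with_cuda():
        places.append(fluid.CUDAPlace(0))

    for i, place in enumerate(places):
        if not isinstance(place, fluid.CUDAPlace):
            continue  # the CINN pass is exercised on GPU only
        # ... run the to_static net on `place` and compare against the
        # eager result recorded at index i ...
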
diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 4530bd175a0576..3865476f529b57 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -125,7 +125,6 @@ def setUp(self): self.python_api = paddle.transpose self.public_python_api = paddle.transpose self.prim_op_type = "prim" - self.enable_cinn = False self.inputs = {'X': np.random.random(self.shape).astype("float64")} self.attrs = { 'axis': list(self.axis), From 9eda000cc2a01a2d26b15039da8a5172c4d9940b Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:03:15 +0800 Subject: [PATCH 044/156] Delete the keyword WITH_ASCEND_INT64 in configure.cmake and CMakeList (#52718) * Delete the keyword WITH_ASCEND_INT64 in configure.cmake and CMakeList * try pass Static-Check --- CMakeLists.txt | 1 - cmake/configure.cmake | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aa6ec5e55c07fc..9dc6febdfaaa53 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -288,7 +288,6 @@ option(NEW_RELEASE_ALL OFF) option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) -option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) option(WITH_POCKETFFT "Compile with pocketfft support" ON) option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 294c5519b4befb..973e21fd55056d 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -93,8 +93,8 @@ if(WITH_BOX_PS) add_definitions(-DPADDLE_WITH_BOX_PS) endif() -if(WITH_ASCEND_INT64) - add_definitions(-DPADDLE_WITH_ASCEND_INT64) +if(WITH_ASCEND) + add_definitions(-DPADDLE_WITH_ASCEND) endif() if(WITH_XPU) From da0c7e1454b787b65333750b55de89cfa8dd565e Mon Sep 17 00:00:00 2001 From: LinearTemporalLogic <127285600+LinearTemporalLogic@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:04:45 +0800 Subject: [PATCH 045/156] Add output defs for eigh kernel (#51362) * Add output defs for eigh kernel * fix * update * update * fix * fix --- paddle/phi/infermeta/unary.cc | 2 ++ paddle/phi/kernels/cpu/eigh_kernel.cc | 6 +++++- paddle/phi/kernels/gpu/eigh_kernel.cu | 6 +++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 0aac6f969beb72..8c87abf4fd86a5 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -844,7 +844,9 @@ void EighInferMeta(const MetaTensor& x, values_dim.emplace_back(input_dim[i]); } out_w->set_dims(phi::make_ddim(values_dim)); + out_w->set_dtype(dtype::ToReal(x.dtype())); out_v->set_dims(input_dim); + out_v->set_dtype(dtype::ToReal(x.dtype())); } void EigvalsInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config) { diff --git a/paddle/phi/kernels/cpu/eigh_kernel.cc b/paddle/phi/kernels/cpu/eigh_kernel.cc index 0f0a10c8377921..cfb17589d505dd 100644 --- a/paddle/phi/kernels/cpu/eigh_kernel.cc +++ b/paddle/phi/kernels/cpu/eigh_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/eigh_kernel.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/values_vectors_functor.h" @@ -40,4 +41,7 @@ PD_REGISTER_KERNEL(eigh, float, double, phi::dtype::complex, 
- phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/eigh_kernel.cu b/paddle/phi/kernels/gpu/eigh_kernel.cu index 3ffbb0b95b6665..b5548da2c71416 100644 --- a/paddle/phi/kernels/gpu/eigh_kernel.cu +++ b/paddle/phi/kernels/gpu/eigh_kernel.cu @@ -17,6 +17,7 @@ #include "paddle/phi/kernels/eigh_kernel.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/values_vectors_functor.h" @@ -43,6 +44,9 @@ PD_REGISTER_KERNEL(eigh, // cuda_only float, double, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); +} #endif // not PADDLE_WITH_HIP From 410e25fb9090a31f2ff91a816432317fdda87379 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 11:08:39 +0800 Subject: [PATCH 046/156] support auto generate for flatten (flatten_contiguous_range) (#52512) * support auto generate for flatten (flatten_contiguous_range) * add data_type for flatten_grad --- paddle/fluid/operators/flatten_op.cc | 128 ----------------------- paddle/phi/api/yaml/backward.yaml | 12 +++ paddle/phi/api/yaml/legacy_backward.yaml | 14 --- paddle/phi/api/yaml/legacy_ops.yaml | 13 --- paddle/phi/api/yaml/op_compat.yaml | 4 + paddle/phi/api/yaml/ops.yaml | 13 +++ paddle/phi/ops/compat/flatten_sig.cc | 4 + 7 files changed, 33 insertions(+), 155 deletions(-) diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 6aaa251ead9f93..530b3560bb878a 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -283,123 +283,6 @@ class Flatten2GradOp : public framework::OperatorWithKernel { } }; -class FlattenContiguousRangeOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "FlattenContiguousRange"); - const auto &start_axis = ctx->Attrs().Get("start_axis"); - const auto &stop_axis = ctx->Attrs().Get("stop_axis"); - - // Construct MetaTensor for InferMeta Func - using CompatMetaTensor = framework::CompatMetaTensor; - CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); - CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); - std::unique_ptr xshape(nullptr); - if (ctx->HasOutput("XShape")) { - xshape = std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("XShape")[0], ctx->IsRuntime()))); - } - phi::FlattenWithXShapeInferMeta( - x, start_axis, stop_axis, &out, xshape.get()); - } -}; - -class FlattenContiguousRangeOpMaker : public FlattenOpMaker { - public: - void Make() override { - AddInput("X", "(Tensor) A tensor of rank >= axis."); - AddOutput("Out", - "A 2D tensor is reshaped input tensor. 
The input dimensions" - "up to axis are flattened to the outer dimension of the output" - "and the remaining input dimensions are flattened into the inner" - "dimension of the output."); - AddAttr("start_axis", - "(int)" - "Indicate the input start dimension (exclusive) to flatten") - .SetDefault(1); - AddAttr("stop_axis", - "(int)" - "Indicate the input stop dimension (exclusive) to flatten") - .SetDefault(1); - AddComment(R"DOC( -Flatten Operator - -Flattens the input tensor into a new matrix according to start_axis and stop_axis. - -Examples: -Case 1: - Given - X.shape = (3, 100, 100, 4) - and - start_axis = 2, stop_axis = -1 - We get: - Out.shape = (3, 100, 400) - -Case 2: - Given - X.shape = (3, 100, 100, 4) - and - start_axis = 0, stop_axis = -1 - We get: - Out.shape = (3 * 100 * 100 * 4) -)DOC"); - AddOutput("XShape", - "XShape is just used to store the shape and lod of X, which will " - "be used in FlattenGradOp.") - .AsIntermediate() - .AsExtra(); - } -}; - -template -class FlattenContiguousRangeGradOpMaker - : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("flatten_contiguous_range_grad"); - grad_op->SetInput("XShape", this->Output("XShape")); - grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - grad_op->SetAttrMap(this->Attrs()); - } -}; - -class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("XShape"), - "Input", - "XShape", - "FlattenContiguousRangeGrad"); - OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "FlattenContiguousRangeGrad"); - // Construct MetaTensor for InferMeta Func - using CompatMetaTensor = framework::CompatMetaTensor; - CompatMetaTensor xshape(context->GetInputVarPtrs("XShape")[0], - context->IsRuntime()); - CompatMetaTensor dx( - context->GetOutputVarPtrs(framework::GradVarName("X"))[0], - context->IsRuntime()); - phi::KernelWithXShapeInferMeta(xshape, &dx); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"}); DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer, {framework::GradVarName("Out"), @@ -431,17 +314,6 @@ REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, ops::FlattenGradInplaceInferer); -REGISTER_OPERATOR( - flatten_contiguous_range, - ops::FlattenContiguousRangeOp, - ops::FlattenContiguousRangeOpMaker, - ops::FlattenContiguousRangeGradOpMaker, - ops::FlattenContiguousRangeGradOpMaker, - ops::FlattenOpInplaceInferer); -REGISTER_OPERATOR(flatten_contiguous_range_grad, - ops::FlattenContiguousRangeGradOp, - ops::FlattenGradInplaceInferer); - REGISTER_OP_CPU_KERNEL(flatten, ops::FlattenKernel, ops::FlattenKernel, diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index f4608f008535ca..7bf3b5cd2fcd89 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -627,6 +627,18 @@ func : flash_attn_unpadded_grad data_type: q +- backward_op : 
flatten_grad + forward : flatten(Tensor x, int start_axis = 1, int stop_axis = 1) -> Tensor(out), Tensor(xshape) + args : (Tensor xshape, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : KernelWithXShapeInferMeta + param : [xshape] + kernel : + func : flatten_grad + data_type : out_grad + inplace : (out_grad -> x_grad) + - backward_op : flip_grad forward : flip (Tensor x, int[] axis) -> Tensor(out) args : (Tensor out_grad, int[] axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 181b819cde96ee..4ba99b1b813120 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -397,20 +397,6 @@ func : fill_grad inplace : (out_grad -> x_grad) -- backward_op : flatten_grad - forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) - args : (Tensor xshape, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : KernelWithXShapeInferMeta - param : [xshape] - kernel : - func : flatten_grad - data_type: out_grad - backend: out_grad - layout: out_grad - inplace : (out_grad -> x_grad) - - backward_op : fmax_grad forward : fmax(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 89aef2203ccaf2..53ae099e762ead 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -508,19 +508,6 @@ inplace : (x -> out) backward: fill_grad -- op : flatten - args : (Tensor x, int start_axis, int stop_axis) - output : Tensor(out), Tensor(xshape) - infer_meta : - func : FlattenWithXShapeInferMeta - kernel : - func : flatten - backend : x - inplace : (x -> out) - view : (x -> out) - intermediate : xshape - backward : flatten_grad - - op : floor_divide args : (Tensor x, Tensor y) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 8e36affbb7596b..f807a3d748ba10 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -806,12 +806,16 @@ out : Out - op : flatten (flatten_contiguous_range) + backward : flatten_grad (flatten_contiguous_range_grad) inputs : x : X outputs : {out : Out, xshape : XShape} attrs : {start_axis : start_axis, stop_axis : stop_axis} + extra : + outputs : [xshape] + manual_signature : [flatten, flatten_grad] - op : flip inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 8de00fc785ca0a..3afbf00c049e64 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -660,6 +660,19 @@ intermediate : softmax_lse, seed_offset backward : flash_attn_unpadded_grad +- op : flatten + args : (Tensor x, int start_axis = 1, int stop_axis = 1) + output : Tensor(out), Tensor(xshape) + infer_meta : + func : FlattenWithXShapeInferMeta + kernel : + func : flatten + data_type : x + inplace : (x -> out) + view : (x -> out) + intermediate : xshape + backward : flatten_grad + - op : flip args : (Tensor x, int[] axis) output : Tensor (out) diff --git a/paddle/phi/ops/compat/flatten_sig.cc b/paddle/phi/ops/compat/flatten_sig.cc index b225dc625240b9..cd3ccd136de29a 100644 --- a/paddle/phi/ops/compat/flatten_sig.cc +++ b/paddle/phi/ops/compat/flatten_sig.cc @@ -17,6 +17,10 @@ limitations under the License. 
*/ namespace phi { KernelSignature FlattenOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsForInferShape()) { + return KernelSignature( + "flatten", {"X"}, {"start_axis", "stop_axis"}, {"Out", "XShape"}); + } if (ctx.HasOutput("XShape")) { return KernelSignature( "flatten", {"X"}, {"start_axis", "stop_axis"}, {"Out", "XShape"}); From 2a4200363c0d64b91bb9e92cc5cd781315bc9e4c Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 11:09:57 +0800 Subject: [PATCH 047/156] support auto generate for op merged_momentum optimizer (#52708) * fix error in generator/type_mapping.py * support auto generate for op merged_momentum optimizer --- .../fluid/operators/generator/type_mapping.py | 2 +- .../optimizers/merged_momentum_op.cc | 111 ------------------ paddle/phi/api/yaml/legacy_ops.yaml | 11 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/ops.yaml | 11 ++ paddle/phi/ops/compat/merged_momentum_sig.cc | 40 ------- 6 files changed, 18 insertions(+), 163 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/merged_momentum_op.cc delete mode 100644 paddle/phi/ops/compat/merged_momentum_sig.cc diff --git a/paddle/fluid/operators/generator/type_mapping.py b/paddle/fluid/operators/generator/type_mapping.py index 8aec1bcc49a5e4..e6b59b7823abed 100644 --- a/paddle/fluid/operators/generator/type_mapping.py +++ b/paddle/fluid/operators/generator/type_mapping.py @@ -76,7 +76,7 @@ 'int64_t[]': 'std::vector', 'float[]': 'std::vector', 'double[]': 'std::vector', - 'str[]': 'std::vector<', + 'str[]': 'std::vector', } output_type_map = {'Tensor': 'Tensor', 'Tensor[]': 'std::vector'} diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc deleted file mode 100644 index 17d31e35fdec23..00000000000000 --- a/paddle/fluid/operators/optimizers/merged_momentum_op.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class MergedMomentumOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto param_dtype = - framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return phi::KernelKey(param_dtype, ctx.GetPlace()); - } -}; - -class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Param", - "(Tensor, default Tensor) " - "Input parameter that has to be updated") - .AsDuplicable(); - AddInput("Grad", - "(Tensor, default Tensor) " - "Input gradient of the parameter") - .AsDuplicable(); - AddInput("Velocity", - "(Tensor, default Tensor) " - "Input velocity (corresponding to the parameter) " - "that has to be updated") - .AsDuplicable(); - AddInput("LearningRate", - "(Tensor, default Tensor) " - "Input learning rate") - .AsDuplicable(); - AddInput("MasterParam", "FP32 master weight for AMP.") - .AsDispensable() - .AsDuplicable(); - AddOutput("ParamOut", - "(Tensor) This output is updated parameter. " - "It shared memory with Input(Param).") - .AsDuplicable(); - AddOutput("VelocityOut", - "(Tensor) This output is updated velocity. " - "It shared memory with Input(Velocity).") - .AsDuplicable(); - AddOutput("MasterParamOut", - "The updated FP32 master weight for AMP. " - "It shared memory with Input(MasterParam).") - .AsDispensable() - .AsDuplicable(); - AddAttr("mu", "(float) Momentum coefficient"); - AddAttr("use_nesterov", - "(bool, default false) " - "Use Nesterov Momentum or not.") - .SetDefault(false); - AddAttr>( - "regularization_method", - "(string) regularization_method, right now only " - "support l2decay or none") - .SetDefault({}); - AddAttr>("regularization_coeff", - "(float) regularization_coeff") - .SetDefault({}); - AddAttr("multi_precision", - "(bool, default false) " - "Whether to use multi-precision during weight updating.") - .SetDefault(false); - AddAttr( - "rescale_grad", - "(float, default 1.0) Multiply the gradient with `rescale_grad`" - "before updating. 
Often choose to be `1.0/batch_size`.") - .SetDefault(1.0f); - AddComment(R"DOC(Merged Momentum Optimizer.)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -DECLARE_INFER_SHAPE_FUNCTOR(merged_momentum, - MergedMomentumInferShapeFunctor, - PD_INFER_META(phi::MergedMomentumInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, - ops::MergedMomentumOp, - ops::MergedMomentumOpMaker, - MergedMomentumInferShapeFunctor); diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 53ae099e762ead..e44bbe7e6dd5bd 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -884,17 +884,6 @@ data_type : param inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) -- op : merged_momentum_ - args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) - output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()} - infer_meta : - func : MergedMomentumInferMeta - optional: master_param - kernel : - func : merged_momentum - data_type : param - inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) - - op : min args : (Tensor x, IntArray axis={}, bool keepdim=false) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index f807a3d748ba10..98a00e6f5a9c06 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1421,6 +1421,12 @@ outputs : out : Out +- op : merged_momentum_ + inputs : + {param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam} + outputs : + {param_out : ParamOut, velocity_out : VelocityOut, master_param_out : MasterParamOut} + - op : meshgrid backward : meshgrid_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 3afbf00c049e64..31f4a114b7142f 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1190,6 +1190,17 @@ kernel : func : merge_selected_rows {selected_rows -> selected_rows} +- op : merged_momentum_ + args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) + output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()} + infer_meta : + func : MergedMomentumInferMeta + kernel : + func : merged_momentum + data_type : param + optional: master_param, master_param_out + inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) + - op : meshgrid args : (Tensor[] inputs) output : Tensor[]{inputs.size()} diff --git a/paddle/phi/ops/compat/merged_momentum_sig.cc b/paddle/phi/ops/compat/merged_momentum_sig.cc deleted file mode 100644 index 3444d5e2d3097b..00000000000000 --- a/paddle/phi/ops/compat/merged_momentum_sig.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature MergedMomentumOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "merged_momentum", - {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}, - {"mu", - "use_nesterov", - "regularization_method", - "regularization_coeff", - "multi_precision", - "rescale_grad"}, - { - "ParamOut", - "VelocityOut", - "MasterParamOut", - }); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(merged_momentum, - phi::MergedMomentumOpArgumentMapping); From 3e66845f017033669bccca6a385cde64937544bb Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:16:53 +0800 Subject: [PATCH 048/156] [Dy2St]Add backend for to_static API (#52596) * Add backend for to_static API --- .../fluid/tests/unittests/test_input_spec.py | 2 +- python/paddle/jit/api.py | 24 ++++++- .../paddle/jit/dy2static/partial_program.py | 13 ++-- .../jit/dy2static/program_translator.py | 68 +++++++++++-------- python/paddle/jit/dy2static/utils.py | 21 +++++- test/dygraph_to_static/test_cinn_prim.py | 15 ++++ .../test_partial_program_hook.py | 2 +- 7 files changed, 105 insertions(+), 40 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_input_spec.py b/python/paddle/fluid/tests/unittests/test_input_spec.py index dad821438afb86..2bdce8b4b58c5f 100644 --- a/python/paddle/fluid/tests/unittests/test_input_spec.py +++ b/python/paddle/fluid/tests/unittests/test_input_spec.py @@ -349,7 +349,7 @@ def test_run(self): ) x = paddle.randn([2, 10]) out = net(x) - np.testing.assert_equal(out.shape, [2, 5]) + np.testing.assert_equal(net.forward._input_spec, None) if __name__ == '__main__': diff --git a/python/paddle/jit/api.py b/python/paddle/jit/api.py index bc07609a111eee..bde75f6ad73a02 100644 --- a/python/paddle/jit/api.py +++ b/python/paddle/jit/api.py @@ -218,8 +218,23 @@ def ignore_module(modules: list[Any]): add_ignore_module(modules) +def _check_and_set_backend(backend, build_strategy): + if backend not in ['CINN', None]: + raise ValueError( + "The backend of to_static should be 'CINN' or None, but received {}.".format( + backend + ) + ) + if backend == 'CINN': + build_strategy.build_cinn_pass = True + + def to_static( - function=None, input_spec=None, build_strategy=None, property=False + function=None, + input_spec=None, + build_strategy=None, + backend=None, + **kwargs, ): """ Converts imperative dygraph APIs into declarative function APIs. Decorator @@ -228,7 +243,6 @@ def to_static( Tensor(s) to do imperative training, inference, or other operations. If the decorated function calls other imperative function, the called one will be converted into declarative function as well. - Args: function (callable): callable imperative function. 
         input_spec(list[InputSpec]|tuple[InputSpec]): list/tuple of InputSpec to specific the shape/dtype/name
@@ -238,7 +252,8 @@ def to_static(
             in the computational graph and memory optimization during the execution
             of the computational graph. For more information about build_strategy,
             please refer to :code:`paddle.static.BuildStrategy`. The default is None.
-        property(bool, Optional): whether the fucntion is python property. The default is False.
+        backend(str, Optional): Specifies compilation backend, which can be `CINN` or None. When backend is `CINN`, the CINN compiler will be used to speed up training and inference.
+        kwargs: Support keys including `property`, set `property` to True if the function is a python property.
 
 
     Returns:
@@ -263,6 +278,7 @@ def func(x):
             print(x_v) # [[2. 2.]]
 
     """
+    property = kwargs.get("property", False)
 
     def decorated(python_func):
         """
@@ -279,6 +295,7 @@ def decorated(python_func):
                 input_spec=input_spec,
                 build_strategy=build_strategy,
                 property=property,
+                backend=backend,
             ),
         )
 
@@ -291,6 +308,7 @@ def decorated(python_func):
                     type(build_strategy).__name__
                 )
             )
+    _check_and_set_backend(backend, build_strategy)
 
     # for usage: `to_static(foo, ...)`
     if function is not None:
diff --git a/python/paddle/jit/dy2static/partial_program.py b/python/paddle/jit/dy2static/partial_program.py
index 9538bb9300742c..7a6afc82b1bf07 100644
--- a/python/paddle/jit/dy2static/partial_program.py
+++ b/python/paddle/jit/dy2static/partial_program.py
@@ -27,7 +27,12 @@
 from paddle.optimizer.lr import LRScheduler
 
 from . import logging_utils
-from .utils import RETURN_NO_VALUE_MAGIC_NUM, _out_grad_names, _param_grad_names
+from .utils import (
+    RETURN_NO_VALUE_MAGIC_NUM,
+    _out_grad_names,
+    _param_grad_names,
+    backend_guard,
+)
 
 __all__ = []
 
@@ -197,6 +202,7 @@ def __init__(
         # program_id -> list(scope)
         self._scope_cache = {}
         self._hooker = None
+        self._backend = kwargs.get('backend', None)
 
     def __call__(self, inputs):
         """
@@ -636,10 +642,9 @@ def _append_backward_desc(self, main_program):
         start_idx = len(program.block(0).ops) + len(self._outputs.tolist())
 
         if targets:
-            # TODO(CZ): later when use cinn, set_prim_all_enabled and check_and_set_prim_all_enabled will be set at else branch.
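            # (Aside, not part of this hunk: backend_guard below scopes the
            #  prim flags to the backend chosen at to_static time. A usage
            #  sketch of the new argument, with an illustrative layer:
            #      net = paddle.jit.to_static(
            #          paddle.nn.Linear(10, 3), backend='CINN'
            #      )
            #      out = net(paddle.randn([4, 10]))
            #  backend=None keeps the previous non-CINN behaviour.)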
- core.check_and_set_prim_all_enabled() start_idx = len(program.block(0).ops) + len(self._outputs.tolist()) - backward.gradients(targets=targets, inputs=[]) + with backend_guard(self._backend): + backward.gradients(targets=targets, inputs=[]) if self._hooker: program, start_idx = self._hooker.after_append_backward( diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 3777af8879d9a9..a8be1abb2a10fd 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -48,6 +48,7 @@ NO_SHAPE_VAR_TYPE, ast_to_func, ast_to_source_code, + backend_guard, func_to_source_code, input_specs_compatible, is_paddle_func, @@ -334,7 +335,7 @@ def __init__(self, function, input_spec=None, **kwargs): self._class_instance = None if input_spec is not None and prim_or_cinn_is_enabled( - kwargs.get("build_strategy", None) + kwargs.get("build_strategy", None), kwargs.get("backend", None) ): from paddle.static import InputSpec @@ -1184,11 +1185,9 @@ def __init__(self): def _build_once(self, cache_key): # TODO(Aurelius84): Need a gloabl FLAGS to enable/disable to_prim enable_prim = cache_key.kwargs['build_strategy'].build_cinn_pass - # TODO(CZ): later when use cinn, set_prim_all_enabled and check_and_set_prim_all_enabled will be set at else branch. # NOTE(xiongkun): Need a global FLAGS to enable/disable fallback enable_fallback = enable_prim - core.check_and_set_prim_all_enabled() try: concrete_program = ConcreteProgram.from_func_spec( func_spec=cache_key.function_spec, @@ -1216,7 +1215,8 @@ def _build_once(self, cache_key): else: raise - if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy']): + backend = cache_key.kwargs['backend'] + if prim_or_cinn_is_enabled(cache_key.kwargs['build_strategy'], backend): for var in concrete_program.main_program.list_vars(): if var.type not in NO_SHAPE_VAR_TYPE and -1 in var.shape: warnings.warn( @@ -1228,10 +1228,11 @@ def _build_once(self, cache_key): partial_program = partial_program_from( concrete_program, cache_key.class_instance is not None ) - if core._is_fwd_prim_enabled(): - partial_program.set_hooker( - PrimHooker(concrete_program.main_program) - ) + with backend_guard(backend): + if core._is_fwd_prim_enabled(): + partial_program.set_hooker( + PrimHooker(concrete_program.main_program, backend) + ) return concrete_program, partial_program def __getitem__(self, item): @@ -1291,39 +1292,46 @@ def clear(self): class PrimHooker(PartialProgramLayerHook): - def __init__(self, original_program): + def __init__(self, original_program, backend): if len(original_program.blocks) > 1: raise ValueError( 'The primitive mode only support one block currently.' 
 )
+ self.backend = backend
 self.custom_vjps = set()
- if core._is_all_prim_enabled():
- self.custom_vjps = {
- op.type
- for op in original_program.block(0).ops
- if core.has_comp_grad_op_maker(op.type)
- }
+ with backend_guard(self.backend):
+ if core._is_all_prim_enabled():
+ self.custom_vjps = {
+ op.type
+ for op in original_program.block(0).ops
+ if core.has_comp_grad_op_maker(op.type)
+ }

 def before_append_backward(self, forward_program):
- if core._is_fwd_prim_enabled():
- _to_prim(forward_program.blocks, blacklist=self.custom_vjps)
- return forward_program
+ with backend_guard(self.backend):
+ if core._is_fwd_prim_enabled():
+ _to_prim(forward_program.blocks, blacklist=self.custom_vjps)
+ return forward_program

 def after_append_backward(self, whole_program, backward_start_idx):
- backward_length = len(whole_program.block(0).ops) - backward_start_idx
- if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0:
- # only process backward part of block
- _to_prim(whole_program.blocks, backward_length=backward_length)
- new_start_index = len(whole_program.block(0).ops) - backward_length
- if backward_length > 0:
- # only process forward part of block
- _to_prim(whole_program.blocks, start_idx=new_start_index)
- return whole_program, new_start_index
+ with backend_guard(self.backend):
+ backward_length = (
+ len(whole_program.block(0).ops) - backward_start_idx
+ )
+ if core._is_fwd_prim_enabled() and len(self.custom_vjps) != 0:
+ # only process backward part of block
+ _to_prim(whole_program.blocks, backward_length=backward_length)
+ new_start_index = len(whole_program.block(0).ops) - backward_length
+ if backward_length > 0:
+ # only process forward part of block
+ _to_prim(whole_program.blocks, start_idx=new_start_index)
+ return whole_program, new_start_index

 def after_infer(self, infer_program):
- if core._is_fwd_prim_enabled():
- _to_prim(infer_program.block(0))
- return infer_program
+ with backend_guard(self.backend):
+ if core._is_fwd_prim_enabled():
+ _to_prim(infer_program.block(0))
+ return infer_program


 class ProgramTranslator:
diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py
index 3608b8d0641a55..28c8c739f2efca 100644
--- a/python/paddle/jit/dy2static/utils.py
+++ b/python/paddle/jit/dy2static/utils.py
@@ -35,6 +35,7 @@
 from paddle.fluid import core, unique_name
 from paddle.fluid.data_feeder import convert_dtype
 from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
 from paddle.utils import gast

 from .ast_utils import ast_to_source_code
@@ -1498,7 +1499,10 @@ def _out_grad_names(program_desc, fwd_end_op_index, out_size):
 return names

-def prim_or_cinn_is_enabled(build_strategy):
+def prim_or_cinn_is_enabled(build_strategy, backend):
+ if backend == 'CINN':
+ return True
+
 if build_strategy is not None and build_strategy.build_cinn_pass:
 return True

@@ -1534,3 +1538,18 @@ def name_judge():
 return True
 else:
 return False
+
+
+@signature_safe_contextmanager
+def backend_guard(backend):
+ core.check_and_set_prim_all_enabled()
+ orig_fwd = core._is_fwd_prim_enabled()
+ orig_bwd = core._is_bwd_prim_enabled()
+
+ if backend == 'CINN':
+ core._set_prim_all_enabled(True)
+ try:
+ yield
+ finally:
+ core._set_prim_forward_enabled(orig_fwd)
+ core._set_prim_backward_enabled(orig_bwd)
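# [Editor's aside] backend_guard above is a plain context manager: it
# snapshots the prim flags, force-enables them for the CINN backend, and
# restores the snapshot on exit. Illustrative use, mirroring how the
# PrimHooker methods wrap their bodies:
from paddle.fluid import core
from paddle.jit.dy2static.utils import backend_guard

with backend_guard('CINN'):
    assert core._is_fwd_prim_enabled()  # prim passes active inside the guard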
diff --git a/test/dygraph_to_static/test_cinn_prim.py b/test/dygraph_to_static/test_cinn_prim.py
index 6ace7696c383a3..c5527e85238b6b 100644
--- a/test/dygraph_to_static/test_cinn_prim.py
+++ b/test/dygraph_to_static/test_cinn_prim.py
@@ -163,5 +163,20 @@ def test_cinn_prim(self):
 )

+class TestBackend(unittest.TestCase):
+ def test_backend(self):
+ x = paddle.randn([2, 4])
+ out1 = self.forward(x, 'CINN')
+ out2 = self.forward(x, None)
+ np.testing.assert_allclose(out1, out2, rtol=1e-6)
+
+ def forward(self, x, backend=None):
+ paddle.seed(2022)
+ net = PrimeNet()
+ net = paddle.jit.to_static(net, backend=backend)
+ out = net(x)
+ return out
+
+
 if __name__ == '__main__':
 unittest.main()
diff --git a/test/dygraph_to_static/test_partial_program_hook.py b/test/dygraph_to_static/test_partial_program_hook.py
index 896dde419bf200..b9a64d3d0993a3 100644
--- a/test/dygraph_to_static/test_partial_program_hook.py
+++ b/test/dygraph_to_static/test_partial_program_hook.py
@@ -44,7 +44,7 @@ def f():
 f
 ).get_concrete_program()
 self._hook = program_translator.PrimHooker(
- concrete_program.main_program
+ concrete_program.main_program, None
 )
 self._forward = partial_program.forward_program
 self._whole = partial_program._train_program

From 17fec4e9aae895c513d9b07af55adf56059c5e42 Mon Sep 17 00:00:00 2001
From: cyberslack_lee
Date: Tue, 11 Apr 2023 11:19:51 +0800
Subject: [PATCH 049/156] =?UTF-8?q?=E3=80=90Hackathon4=20No58=E3=80=91empt?=
 =?UTF-8?q?y=5Flike=20fp16&bf16=20API=20test=20(#52668)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../tests/unittests/test_empty_like_op.py | 90 +++++++++++++++++--
 python/paddle/tensor/attribute.py | 1 +
 python/paddle/tensor/creation.py | 20 ++++-
 3 files changed, 102 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
index 8ccaabd7c2cf06..164275b1a7d838 100644
--- a/python/paddle/fluid/tests/unittests/test_empty_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py
@@ -15,6 +15,7 @@
 import unittest

 import numpy as np
+from eager_op_test import convert_uint16_to_float

 import paddle
 from paddle.fluid import core
@@ -38,7 +39,7 @@ def __check_out__(self, out):
 f'shape should be {self.dst_shape}, but get {shape}',
 )

- if data_type in ['float32', 'float64', 'int32', 'int64']:
+ if data_type in ['float16', 'float32', 'float64', 'int32', 'int64']:
 max_value = np.nanmax(out)
 min_value = np.nanmin(out)
 always_non_full_zero = max_value >= min_value
@@ -47,6 +48,16 @@
 always_full_zero or always_non_full_zero,
 'always_full_zero or always_non_full_zero.',
 )
+ elif data_type in ['uint16']:
+ uout = convert_uint16_to_float(out)
+ max_value = np.nanmax(uout)
+ min_value = np.nanmin(uout)
+ always_non_full_zero = max_value >= min_value
+ always_full_zero = max_value == 0.0 and min_value == 0.0
+ self.assertTrue(
+ always_full_zero or always_non_full_zero,
+ 'always_full_zero or always_non_full_zero.',
+ )
 elif data_type in ['bool']:
 total_num = out.size
 true_num = np.sum(out)
@@ -154,16 +165,13 @@ def setUp(self):

 def test_static_graph(self):
 paddle.enable_static()
-
- dtype = 'float32'
-
 train_program = Program()
 startup_program = Program()

 with program_guard(train_program, startup_program):
- x = np.random.random(self.x_shape).astype(dtype)
+ x = np.random.random(self.x_shape).astype(self.dtype)
 data_x = paddle.static.data(
- 'x', shape=self.data_x_shape, dtype=dtype
+ 'x', shape=self.data_x_shape, dtype=self.dtype
 )

 out = paddle.empty_like(data_x)
@@ -176,7 +184,7 @@
 exe = paddle.static.Executor(place)
 res =
exe.run(train_program, feed={'x': x}, fetch_list=[out]) - self.dst_dtype = dtype + self.dst_dtype = self.dtype self.dst_shape = x.shape self.__check_out__(res[0]) @@ -185,12 +193,80 @@ def test_static_graph(self): def init_config(self): self.x_shape = (200, 3) self.data_x_shape = [200, 3] + self.dtype = 'float32' class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static): def init_config(self): self.x_shape = (3, 200, 3) self.data_x_shape = [-1, 200, 3] + self.dtype = 'float32' + + +class TestEmptyLikeAPI_StaticForFP16Op(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + self.dtype = 'float16' + + def test_static_graph(self): + paddle.enable_static() + if paddle.fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = np.random.random([200, 3]).astype(self.dtype) + data_x = paddle.static.data( + name="x", shape=[200, 3], dtype=self.dtype + ) + out = paddle.empty_like(data_x) + exe = paddle.static.Executor(place) + res = exe.run( + paddle.static.default_main_program(), + feed={'x': x}, + fetch_list=[out], + ) + + self.dst_dtype = self.dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) + + +class TestEmptyLikeAPI_StaticForBF16Op(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + self.dtype = 'uint16' + + def test_static_graph(self): + paddle.enable_static() + if paddle.fluid.core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = np.random.random([200, 3]).astype(np.uint16) + data_x = paddle.static.data( + name="x", shape=[200, 3], dtype=np.uint16 + ) + out = paddle.empty_like(data_x) + exe = paddle.static.Executor(place) + res = exe.run( + paddle.static.default_main_program(), + feed={'x': x}, + fetch_list=[out], + ) + + self.dst_dtype = self.dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) class TestEmptyError(unittest.TestCase): diff --git a/python/paddle/tensor/attribute.py b/python/paddle/tensor/attribute.py index 7a859d64d0c517..63af833747b1b4 100644 --- a/python/paddle/tensor/attribute.py +++ b/python/paddle/tensor/attribute.py @@ -120,6 +120,7 @@ def shape(input): 'int64', 'complex64', 'complex128', + 'uint16', ], 'shape', ) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 602fa7186ec84e..99d9ad594c1196 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1954,13 +1954,29 @@ def empty_like(x, dtype=None, name=None): check_variable_and_dtype( x, 'x', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], 'empty_like', ) check_dtype( dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], 'empty_like', ) out = helper.create_variable_for_type_inference(dtype=dtype) From 230325906483b3e3b473f5177ede1a0de2132415 Mon Sep 17 00:00:00 2001 From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Tue, 11 Apr 2023 11:20:35 +0800 Subject: [PATCH 050/156] [BUG Fixs] adadelta lr support (#49732) --- .../fluid/operators/optimizers/adadelta_op.cc | 1 + 
paddle/fluid/pybind/eager_generator.h | 7 ++- paddle/phi/api/yaml/legacy_ops.yaml | 2 +- paddle/phi/infermeta/multiary.cc | 6 +++ paddle/phi/infermeta/multiary.h | 1 + paddle/phi/kernels/adadelta_kernel.h | 1 + .../phi/kernels/impl/adadelta_kernel_impl.h | 47 ++++++++++++------- paddle/phi/kernels/xpu/adadelta_kernel.cc | 1 + paddle/phi/ops/compat/adadelta_sig.cc | 20 ++++---- python/paddle/fluid/optimizer.py | 2 + .../fluid/tests/unittests/test_adadelta_op.py | 9 +++- python/paddle/optimizer/adadelta.py | 2 + 12 files changed, 69 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 2df8ff971cef10..cb2c374d017fd3 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -39,6 +39,7 @@ class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient"); AddInput("AvgSquaredUpdate", "(Tensor) Input average of squared parameter updates"); + AddInput("LearningRate", "(Tensor) Learning rate"); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); diff --git a/paddle/fluid/pybind/eager_generator.h b/paddle/fluid/pybind/eager_generator.h index 2eb7934c911c5a..03b8690569c226 100644 --- a/paddle/fluid/pybind/eager_generator.h +++ b/paddle/fluid/pybind/eager_generator.h @@ -220,7 +220,12 @@ std::map> op_ins_map = { {"sgd", {"Param", "LearningRate", "Grad", "MasterParam"}}, {"adagrad", {"Param", "Grad", "Moment", "LearningRate", "MasterParam"}}, {"adadelta", - {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}}, + {"Param", + "Grad", + "AvgSquaredGrad", + "AvgSquaredUpdate", + "LearningRate", + "MasterParam"}}, {"graph_khop_sampler", {"Row", "Eids", "Col_Ptr", "X"}}, {"nce", {"Input", diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index e44bbe7e6dd5bd..2d0aadcf5362ca 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -11,7 +11,7 @@ backward : abs_grad - op : adadelta_ - args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor master_param, float rho, float epsilon, bool multi_precision) + args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, Tensor learning_rate, Tensor master_param, float rho, float epsilon, bool multi_precision) output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out) infer_meta : func : AdadeltaInferMeta diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ea93a5874932ed..7364f85e75155b 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -40,6 +40,7 @@ void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& learning_rate, const MetaTensor& master_param, float rho, float epsilon, @@ -48,6 +49,11 @@ void AdadeltaInferMeta(const MetaTensor& param, MetaTensor* avg_squared_grad_out, MetaTensor* avg_squared_update_out, MetaTensor* master_param_out) { + auto lr_dims = learning_rate.dims(); + PADDLE_ENFORCE_EQ( + phi::product(lr_dims), + 1, + phi::errors::InvalidArgument("LearningRate should have one element")); auto param_dims = param.dims(); PADDLE_ENFORCE_EQ( param_dims, diff --git 
a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index cf6ca3c2a9fb6f..178910e3620c9a 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -43,6 +43,7 @@ void AdadeltaInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& avg_squared_grad, const MetaTensor& avg_squared_update, + const MetaTensor& learning_rate, const MetaTensor& master_param, float rho, float epsilon, diff --git a/paddle/phi/kernels/adadelta_kernel.h b/paddle/phi/kernels/adadelta_kernel.h index 15c07b3e6f9675..16f4e6ca269809 100644 --- a/paddle/phi/kernels/adadelta_kernel.h +++ b/paddle/phi/kernels/adadelta_kernel.h @@ -24,6 +24,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h index b0c0a072acd55d..c432c72d832c60 100644 --- a/paddle/phi/kernels/impl/adadelta_kernel_impl.h +++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h @@ -13,11 +13,14 @@ // limitations under the License. #pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/adadelta_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -27,6 +30,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, @@ -56,29 +60,36 @@ void AdadeltaKernel(const Context& dev_ctx, auto eigen_avg_squared_update_out = EigenVector::Flatten(*avg_squared_update_out); auto& place = *dev_ctx.eigen_device(); - auto eigen_grad_cast = eigen_grad.template cast(); - eigen_avg_squared_grad_out.device(place) = rho_ * eigen_avg_squared_grad + (1 - rho_) * eigen_grad_cast.square(); - auto update = -((eigen_avg_squared_update + epsilon_) / - (eigen_avg_squared_grad_out + epsilon_)) - .sqrt() * - eigen_grad_cast; - eigen_avg_squared_update_out.device(place) = - rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); - - if (multi_precision) { - auto eigen_master_param_out = - EigenVector::Flatten(*master_param_outs); - auto eigen_master_param = EigenVector::Flatten(*master_param); - - eigen_master_param_out.device(place) = eigen_master_param + update; + auto update = + -(((eigen_avg_squared_update + epsilon_).sqrt()) / + ((eigen_avg_squared_grad_out + epsilon_).sqrt()) * eigen_grad_cast); + Eigen::DSizes m_dsize(avg_squared_update_out->numel()); + if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { + auto* lr = learning_rate.data(); eigen_param_out.device(place) = - (eigen_param.template cast() + update).template cast(); + eigen_param + lr[0] * update.template cast(); } else { - eigen_param_out.device(place) = eigen_param + update.template cast(); + auto lr = EigenVector::Flatten(learning_rate); + if (multi_precision) { + auto eigen_master_param_out = + EigenVector::Flatten(*master_param_outs); + auto eigen_master_param = EigenVector::Flatten(*master_param); + + eigen_master_param_out.device(place) = + 
eigen_master_param + lr.broadcast(m_dsize) * update; + eigen_param_out.device(place) = (eigen_param.template cast() + + lr.broadcast(m_dsize) * update) + .template cast(); + } else { + eigen_param_out.device(place) = + eigen_param + (lr.broadcast(m_dsize) * update).template cast(); + } } + eigen_avg_squared_update_out.device(place) = + rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); } } // namespace phi diff --git a/paddle/phi/kernels/xpu/adadelta_kernel.cc b/paddle/phi/kernels/xpu/adadelta_kernel.cc index e02a5aeabad2e3..b87ec1afbdc362 100644 --- a/paddle/phi/kernels/xpu/adadelta_kernel.cc +++ b/paddle/phi/kernels/xpu/adadelta_kernel.cc @@ -25,6 +25,7 @@ void AdadeltaKernel(const Context& dev_ctx, const DenseTensor& grad, const DenseTensor& avg_squared_grad, const DenseTensor& avg_squared_update, + const DenseTensor& learning_rate, const paddle::optional& master_param, float rho, float epsilon, diff --git a/paddle/phi/ops/compat/adadelta_sig.cc b/paddle/phi/ops/compat/adadelta_sig.cc index fd285e7e5d0e53..da7e4229a0d22f 100644 --- a/paddle/phi/ops/compat/adadelta_sig.cc +++ b/paddle/phi/ops/compat/adadelta_sig.cc @@ -18,14 +18,18 @@ namespace phi { KernelSignature AdadeltaOpArgumentMapping(const ArgumentMappingContext& ctx) { if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature( - "adadelta", - {"Param", "Grad", "AvgSquaredGrad", "AvgSquaredUpdate", "MasterParam"}, - {"rho", "epsilon", "multi_precision"}, - {"ParamOut", - "AvgSquaredGradOut", - "AvgSquaredUpdateOut", - "MasterParamOut"}); + return KernelSignature("adadelta", + {"Param", + "Grad", + "AvgSquaredGrad", + "AvgSquaredUpdate", + "LearningRate", + "MasterParam"}, + {"rho", "epsilon", "multi_precision"}, + {"ParamOut", + "AvgSquaredGradOut", + "AvgSquaredUpdateOut", + "MasterParamOut"}); } return KernelSignature("unregistered", {}, {}, {}); diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 6ed9e674689eea..db483b151e4eb2 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3215,6 +3215,7 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + self._create_param_lr(param_and_grad), master_weight, self._rho, self._epsilon, @@ -3227,6 +3228,7 @@ def _append_optimize_op(self, block, param_and_grad): "Grad": param_and_grad[1], "AvgSquaredGrad": avg_squared_grad_acc, "AvgSquaredUpdate": avg_squared_update_acc, + "LearningRate": self._create_param_lr(param_and_grad), } outputs = { "ParamOut": param_and_grad[0], diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 11db47b2475b9c..f3eca8fec9cc78 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -26,6 +26,7 @@ def adadelta_wrapper( Grad, AvgSquaredGrad, AvgSquaredUpdate, + LearningRate, master_weight=None, rho=0.95, epsilon=1e-6, @@ -35,12 +36,13 @@ def adadelta_wrapper( Grad, AvgSquaredGrad, AvgSquaredUpdate, + LearningRate, None, rho, epsilon, False, ) - return Param, AvgSquaredGrad, AvgSquaredUpdate + return Param, AvgSquaredGrad, AvgSquaredUpdate, LearningRate class TestAdadeltaOp1(OpTest): @@ -58,11 +60,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } 
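# [Editor's aside] A NumPy reference for the update the revised kernel
# computes, handy for checking the expected values in these tests; the names
# below are illustrative, not part of the patched files:
import numpy as np

def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, lr, rho=0.95, eps=1e-6):
    # Accumulate squared gradients, form the scaled update, then apply it
    # with the newly added learning-rate input.
    avg_sq_grad = rho * avg_sq_grad + (1 - rho) * np.square(grad)
    update = -(np.sqrt(avg_sq_update + eps) / np.sqrt(avg_sq_grad + eps)) * grad
    param = param + lr * update
    avg_sq_update = rho * avg_sq_update + (1 - rho) * np.square(update)
    return param, avg_sq_grad, avg_sq_update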
self.attrs = {'rho': rho, 'epsilon': epsilon} @@ -113,12 +117,13 @@ def setUp(self): epsilon = 1e-6 self.attrs = {'rho': rho, 'epsilon': epsilon} - + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * np.square( diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index 1cdb61f698e6be..c760c535da0220 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -197,6 +197,7 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[1], avg_squared_grad_acc, avg_squared_update_acc, + self._create_param_lr(param_and_grad), master_weight, self._rho, self._epsilon, @@ -213,6 +214,7 @@ def _append_optimize_op(self, block, param_and_grad): "Grad": param_and_grad[1], "AvgSquaredGrad": avg_squared_grad_acc, "AvgSquaredUpdate": avg_squared_update_acc, + "LearningRate": self._create_param_lr(param_and_grad), } outputs = { "ParamOut": param_and_grad[0], From f80a0fe9d81513957020b5bfd82cb4249101f0d3 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Tue, 11 Apr 2023 12:01:31 +0800 Subject: [PATCH 051/156] fix_mac_m1_error (#52720) --- .../fleet/base/distributed_strategy.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 194e4bd6675555..0f09440e4337c9 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -24,12 +24,6 @@ from paddle.fluid.framework import _global_flags from paddle.fluid.wrapped_decorator import wrap_decorator -protobuf_version = google.protobuf.__version__ -if protobuf_version >= "4.21.0": - from google._upb import _message -else: - from google.protobuf.pyext import _message - __all__ = [] non_auto_func_called = True @@ -2512,10 +2506,19 @@ def __repr__(self): self.strategy, f.name + "_configs" ) config_fields = my_configs.DESCRIPTOR.fields + protobuf_version = google.protobuf.__version__ + if protobuf_version >= "4.21.0": + RepeatedScalarContainer = ( + google._upb._message.RepeatedScalarContainer + ) + else: + RepeatedScalarContainer = ( + google.protobuf.pyext._message.RepeatedScalarContainer + ) for ff in config_fields: if isinstance( getattr(my_configs, ff.name), - _message.RepeatedScalarContainer, + RepeatedScalarContainer, ): values = getattr(my_configs, ff.name) for i, v in enumerate(values): From 6b74cf76cbaf521cd34633a572acb6abbbd124d8 Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Tue, 11 Apr 2023 12:11:47 +0800 Subject: [PATCH 052/156] mp sync params & grads & opt states. 
(#51428) --- .../framework/distributed_strategy.proto | 8 + .../fleet/base/distributed_strategy.py | 6 + .../hybrid_parallel_optimizer.py | 81 +++++++++- .../fleet/hybrid_parallel_mp_model.py | 144 ++++++++++++++++++ 4 files changed, 238 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b9055d38d38c52..de2e38c2f11650 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -50,11 +50,19 @@ message ShardingConfig { optional bool enable_tuning = 15 [ default = false ]; // incubate for auto parallel } +// for dygraph +message MpConfig { + optional bool sync_param= 1 [ default = false ]; + optional bool sync_grad= 2 [ default = false ]; + optional bool sync_moment= 3 [ default = false ]; +} + message HybridConfig { optional int32 dp_degree = 1 [ default = -1 ]; optional int32 mp_degree = 2 [ default = 1 ]; optional int32 pp_degree = 3 [ default = 1 ]; optional int32 sharding_degree = 4 [ default = 1 ]; + optional MpConfig mp_configs = 5; } message AMPConfig { diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 0f09440e4337c9..86292a2d90e794 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1696,6 +1696,12 @@ def hybrid_configs(self, configs): check_configs_key( self.strategy.hybrid_configs, hybrid_config, "hybrid_configs" ) + + if "mp_configs" in configs: + assign_configs_value( + self.strategy.hybrid_configs.mp_configs, configs["mp_configs"] + ) + configs.pop("mp_configs") assign_configs_value(self.strategy.hybrid_configs, configs) @property diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 98604b8db3d8cd..acd34f1b1d5b89 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -12,9 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
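# [Editor's aside] Before the optimizer-side changes below: with the MpConfig
# proto and DistributedStrategy plumbing above in place, the new sync switches
# are enabled from user code roughly as follows (mirrors the test added at the
# end of this patch; degrees and flag values are illustrative):
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": 1,
    "mp_degree": 2,
    "pp_degree": 1,
    # broadcast params / grads / optimizer moments from the mp source rank
    "mp_configs": {
        "sync_param": True,
        "sync_grad": True,
        "sync_moment": True,
    },
}
fleet.init(is_collective=True, strategy=strategy)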
+ import paddle from paddle import framework from paddle.autograd import no_grad +from paddle.distributed import fleet from paddle.framework import core from paddle.nn import ClipGradByGlobalNorm, clip @@ -292,6 +294,83 @@ def __init__(self, optimizer, hcg, strategy): self._inner_opt._grad_clip, hcg ) + def _filter_fn(self, param): + p_name = param.name + tar_param = ["embedding", "layer_norm", ".b_"] + if param.is_distributed is False: + for tar in tar_param: + if tar in p_name: + return True + return False + + def _step(self, parameters_list): + mp_group = self._hcg.get_model_parallel_group() + src_rank = self._hcg.get_model_parallel_group_src_rank() + params = None + mp_configs = None + + if mp_group.nranks > 1: + mp_configs = fleet.fleet._user_defined_strategy.hybrid_configs[ + "mp_configs" + ] + + if mp_configs and ( + mp_configs.sync_param + or mp_configs.sync_grad + or mp_configs.sync_moment + ): + params = sorted( + [p for p in parameters_list if self._filter_fn(p)], + key=lambda p: p.name, + ) + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_grad: + for p in params: + if p.grad is None: + continue + paddle.distributed.broadcast( + p.grad, src=src_rank, group=mp_group, sync_op=True + ) + + self._inner_opt.step() + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_param: + for p in params: + paddle.distributed.broadcast( + p, src=src_rank, group=mp_group, sync_op=True + ) + + if mp_group.nranks > 1 and mp_configs and mp_configs.sync_moment: + for p in params: + # support opt state of adam and adamw to broadcast now. + if isinstance( + self._inner_opt, + (paddle.optimizer.Adam, paddle.optimizer.AdamW), + ): + if ( + self._inner_opt._multi_precision + and p.name in self._master_weights + ): + paddle.distributed.broadcast( + self._inner_opt._master_weights[p.name], + src=src_rank, + group=mp_group, + sync_op=True, + ) + + moment1 = self._inner_opt._get_accumulator( + self._inner_opt._moment1_acc_str, p + ) + moment2 = self._inner_opt._get_accumulator( + self._inner_opt._moment2_acc_str, p + ) + paddle.distributed.broadcast( + moment1, src=src_rank, group=mp_group, sync_op=True + ) + paddle.distributed.broadcast( + moment2, src=src_rank, group=mp_group, sync_op=True + ) + @no_grad() @framework.dygraph_only def step(self): @@ -302,7 +381,7 @@ def step(self): if self._dp_enable: fused_allreduce_gradients(list(parameters_list), self._hcg) - self._inner_opt.step() + self._step(parameters_list) @no_grad() def minimize( diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py index dec1eb949ddb85..26e740bfa6b79b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_mp_model.py @@ -181,6 +181,150 @@ def forward(self, x): return x +class TestDistMPSyncTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + self.data_parallel_size = 1 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + "mp_configs": { + "sync_param": False, + "sync_grad": False, + "sync_moment": False, + }, + } + fleet.init(is_collective=True, strategy=strategy) + + def build_model_optimizer_train( + self, + batchs, + fp16=False, + mp_sync_param=False, + mp_sync_grad=False, + mp_sync_moment=False, + ): + hcg = 
fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + mp_id = hcg.get_model_parallel_rank() + dp_id = hcg.get_data_parallel_rank() + rank_id = dist.get_rank() + paddle.seed(2023) + np.random.seed(2023) + random.seed(2023) + set_random_seed(1024, dp_id, rank_id) + + np_fc1 = np.random.random_sample((hidden_size, inner_size)) + np_fc2 = np.random.random_sample((inner_size, hidden_size)) + + model = SimpleMPNet( + vocab_size, + hidden_size, + inner_size, + output_size, + np_fc1, + np_fc2, + mp_id, + ) + optimizer = paddle.optimizer.AdamW( + learning_rate=0.1, parameters=model.parameters() + ) + + strategy = fleet.fleet._user_defined_strategy + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": 1, + "mp_configs": { + "sync_param": mp_sync_param, + "sync_grad": mp_sync_grad, + "sync_moment": mp_sync_moment, + }, + } + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + return self.train_batch(batchs, model, optimizer, fp16) + + def train_batch(self, batchs, model, optimizer, fp16=False): + losses = [] + if fp16: + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + scaler = fleet.distributed_scaler(scaler) + for batch in batchs: + with paddle.amp.auto_cast(enable=fp16, level='O1'): + output = model(batch) + loss = output.mean() + losses.append(loss.numpy()) + if fp16: + scaled = scaler.scale(loss) + scaled.backward() + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + optimizer.step() + optimizer.clear_grad() + return losses + + def mp_sync_base( + self, mp_sync_param=False, mp_sync_grad=False, mp_sync_moment=False + ): + batchs = [] + for _ in range(5): + np_data = np.random.randint( + 0, + vocab_size, + ( + batch_size, + seq_length, + ), + ) + batchs.append(paddle.to_tensor(np_data)) + + losses = self.build_model_optimizer_train(batchs) + losses_sync = self.build_model_optimizer_train( + batchs, + mp_sync_param=mp_sync_param, + mp_sync_grad=mp_sync_grad, + mp_sync_moment=mp_sync_moment, + ) + + for i in range(len(losses)): + np.testing.assert_allclose(losses[i], losses_sync[i], rtol=1e-6) + + # test fp16 + losses_fp16 = self.build_model_optimizer_train(batchs, fp16=True) + losses_sync_fp16 = self.build_model_optimizer_train( + batchs, + fp16=True, + mp_sync_param=mp_sync_param, + mp_sync_grad=mp_sync_grad, + mp_sync_moment=mp_sync_moment, + ) + + for i in range(len(losses_fp16)): + np.testing.assert_allclose( + losses_fp16[i], losses_sync_fp16[i], rtol=1e-6 + ) + + def test_mp_sync_param(self): + self.mp_sync_base(mp_sync_param=True) + + def test_mp_sync_grad(self): + self.mp_sync_base(mp_sync_grad=True) + + def test_mp_sync_moment(self): + self.mp_sync_base(mp_sync_moment=True) + + def test_mp_sync_all(self): + self.mp_sync_base( + mp_sync_param=True, mp_sync_grad=True, mp_sync_moment=True + ) + + class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() From aaf873b2859f8e70c8ed5be830674be211b2df8d Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Tue, 11 Apr 2023 13:17:40 +0800 Subject: [PATCH 053/156] [AMP OP&Test]Add fp16/bf16 support isnan/isfinite/isinf op (#52259) * add bfp16 test for isfinite * fixed for ci * deal with comments * fixed test * skip test in cpu * deal with comments * fixed for ci * fixed testcase * fixed for ci * fixed for testcase --- paddle/fluid/framework/data_type.h | 13 ++-- paddle/fluid/operators/isfinite_op.cu | 12 ++-- 
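At the Python level, the effect of this patch is that the three checks accept half-precision inputs once the fp16/bf16 kernels below are registered. A quick illustrative snippet (an editor's sketch, assuming a build with those kernels):

import paddle

x = paddle.to_tensor([1.0, float('inf'), float('nan')], dtype='float16')
print(paddle.isinf(x))      # -> [False, True, False]
print(paddle.isnan(x))      # -> [False, False, True]
print(paddle.isfinite(x))   # -> [True, False, False]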
paddle/phi/kernels/cpu/isfinite_kernel.cc | 3 + paddle/phi/kernels/funcs/isfinite_functor.h | 21 ++++++ paddle/phi/kernels/gpu/isfinite_kernel.cu | 3 + .../fluid/tests/unittests/test_isfinite_op.py | 68 ++++++++++++++++++- python/paddle/tensor/math.py | 33 ++++++++- 7 files changed, 139 insertions(+), 14 deletions(-) mode change 100644 => 100755 paddle/fluid/operators/isfinite_op.cu mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_isfinite_op.py diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index a05f2858c0df3b..7e002c8154147d 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -83,12 +83,13 @@ struct DataTypeTrait { _ForEachDataTypeHelper_( \ callback, ::paddle::platform::complex, COMPLEX128); -#define _ForEachDataTypeNormal_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); +#define _ForEachDataTypeNormal_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); // For the use of thrust, as index-type elements can be only integers. #define _ForEachDataTypeTiny_(callback) \ diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu old mode 100644 new mode 100755 index d8e18f58fa9f2d..80a65cbda916b7 --- a/paddle/fluid/operators/isfinite_op.cu +++ b/paddle/fluid/operators/isfinite_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -22,18 +23,21 @@ REGISTER_OP_CUDA_KERNEL( ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( isnan, ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); REGISTER_OP_CUDA_KERNEL( isfinite, ops::OverflowKernel, ops::OverflowKernel, ops::OverflowKernel, - ops::OverflowKernel); + ops::OverflowKernel, + ops::OverflowKernel); diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc index 85d125794871d3..c9f69c5f7e4f5e 100644 --- a/paddle/phi/kernels/cpu/isfinite_kernel.cc +++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc @@ -25,6 +25,7 @@ PD_REGISTER_KERNEL(isinf, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -37,6 +38,7 @@ PD_REGISTER_KERNEL(isnan, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -49,6 +51,7 @@ PD_REGISTER_KERNEL(isfinite, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); diff --git a/paddle/phi/kernels/funcs/isfinite_functor.h b/paddle/phi/kernels/funcs/isfinite_functor.h index 1dc4fd57b48574..795b8f275c87ea 100644 --- a/paddle/phi/kernels/funcs/isfinite_functor.h +++ b/paddle/phi/kernels/funcs/isfinite_functor.h @@ -45,6 +45,13 @@ struct IsNanFunctor { } }; +template <> +struct IsNanFunctor { + HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isnan(a); + } +}; + template struct IsInfFunctor { HOSTDEVICE bool operator()(const T& a) const { @@ -69,6 +76,13 @@ struct IsInfFunctor { } }; +template <> +struct IsInfFunctor { + HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isinf(a); + } +}; + template struct IsFiniteFunctor { HOSTDEVICE bool operator()(const T& a) const { @@ -94,5 +108,12 @@ struct IsFiniteFunctor { } }; +template <> +struct IsFiniteFunctor { + HOSTDEVICE bool operator()(const phi::dtype::bfloat16& a) const { + return phi::dtype::isfinite(a); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu index e8c2fa022ec7a5..9bde1d7a5bd387 100644 --- a/paddle/phi/kernels/gpu/isfinite_kernel.cu +++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu @@ -25,6 +25,7 @@ PD_REGISTER_KERNEL(isinf, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -37,6 +38,7 @@ PD_REGISTER_KERNEL(isnan, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); @@ -49,6 +51,7 @@ PD_REGISTER_KERNEL(isfinite, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) { kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py old mode 100644 new mode 100755 index 6599f66140c229..efda5d502c6a6a --- a/python/paddle/fluid/tests/unittests/test_isfinite_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_isfinite_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from paddle.fluid import core @@ -48,6 +48,28 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isinf Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestInfBF16(OpTest): + def setUp(self): + self.op_type = "isinf" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.inf + x[-1] = np.inf + + out = np.array(True) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + class TestNAN(OpTest): def setUp(self): self.op_type = "isnan" @@ -76,6 +98,28 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isnan Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestNANBF16(OpTest): + def setUp(self): + self.op_type = "isnan" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.nan + x[-1] = np.nan + + out = np.array(True) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + class TestIsfinite(OpTest): def setUp(self): self.op_type = "isfinite" @@ -105,5 +149,27 @@ def init_dtype(self): self.dtype = np.float16 +# BFP16 isfinite Test +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestIsfiniteBF16(OpTest): + def setUp(self): + self.op_type = "isfinite" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [11, 17]).astype(np.float32) + x[0] = np.inf + x[-1] = np.nan + + out = np.array(False) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + + def test_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index ba7efb7956f77d..1e969be880401e 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3466,7 +3466,14 @@ def isfinite(x, name=None): check_variable_and_dtype( x, 'x', - ['float16', 'float32', 'float64', 'int32', 'int64'], + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], 'isfinite', ) out = helper.create_variable_for_type_inference('bool') @@ -3502,7 +3509,17 @@ def isinf(x, name=None): else: helper = LayerHelper("isinf_v2", **locals()) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isinf' + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], + 'isinf', ) out = helper.create_variable_for_type_inference(dtype='bool') helper.append_op(type="isinf_v2", inputs={"X": x}, outputs={"Out": out}) @@ -3535,7 +3552,17 @@ def isnan(x, name=None): else: helper = LayerHelper("isnan_v2", **locals()) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], 'isnan' + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint16', + ], + 
'isnan', ) out = helper.create_variable_for_type_inference(dtype='bool') helper.append_op(type="isnan_v2", inputs={"X": x}, outputs={"Out": out}) From dee7d78d53a7a84b660df93b617d7b8ca2d53ec0 Mon Sep 17 00:00:00 2001 From: YuhangLi <104877312+piDack@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:19:35 +0800 Subject: [PATCH 054/156] [AMP OP&Test]stack & unstack ops fp16 bf16 support (#50999) * stack fp16 & bf16 support * unstack fp16 support * unstack bf16 support * append stack fp16 ut * add unstack * recover unstack cpu kernel * fix some issue for unstack ut * delete unuse var * add check_place * fix inference err --- .../fluid/tests/unittests/test_stack_op.py | 41 ++++++++ .../fluid/tests/unittests/test_unstack_op.py | 98 ++++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index d2411dda4b95a4..b6a19615a6edab 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -105,6 +105,47 @@ def initParameters(self): self.enable_cinn = False +class TestStackFP16Op(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + + +class TestStackFP16Op1(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.num_inputs = 8 + + +class TestStackFP16Op2(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.num_inputs = 10 + + +class TestStackFP16Op3(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -1 + + +class TestStackFP16Op4(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -4 + + +class TestStackFP16Op5(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 1 + + +class TestStackFP16Op6(TestStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 3 + + class TestStackBF16Op(OpTest): def initDefaultParameters(self): self.num_inputs = 4 diff --git a/python/paddle/fluid/tests/unittests/test_unstack_op.py b/python/paddle/fluid/tests/unittests/test_unstack_op.py index 34c6950d7f1d8c..9e20a78011c9df 100755 --- a/python/paddle/fluid/tests/unittests/test_unstack_op.py +++ b/python/paddle/fluid/tests/unittests/test_unstack_op.py @@ -15,9 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle +from paddle import fluid +from paddle.fluid import core class TestUnStackOpBase(OpTest): @@ -64,6 +66,35 @@ def test_check_grad(self): self.check_grad(['X'], self.get_y_names()) +class TestUnStackFP16Op(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + + +class TestStackFP16Op3(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -1 + + +class TestStackFP16Op4(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = -3 + + +class TestStackFP16Op5(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 1 + + +class TestStackFP16Op6(TestUnStackOpBase): + def initParameters(self): + self.dtype = np.float16 + self.axis = 2 + + class TestStackOp3(TestUnStackOpBase): def initParameters(self): self.axis = -1 @@ -84,6 +115,71 @@ def initParameters(self): self.axis = 2 +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and 
do not support bfloat16", +) +class TestUnStackBF16Op(OpTest): + def initDefaultParameters(self): + self.input_dim = (5, 6, 7) + self.axis = 0 + self.dtype = np.uint16 + + def initParameters(self): + pass + + def get_y_names(self): + y_names = [] + for i in range(self.input_dim[self.axis]): + y_names.append(f'y{i}') + return y_names + + def setUp(self): + self.initDefaultParameters() + self.initParameters() + self.op_type = 'unstack' + self.python_api = paddle.unstack + self.x = np.random.random(size=self.input_dim).astype(np.float32) + outs = np.split(self.x, self.input_dim[self.axis], self.axis) + new_shape = list(self.input_dim) + del new_shape[self.axis] + y_names = self.get_y_names() + tmp = [] + tmp_names = [] + for i in range(self.input_dim[self.axis]): + tmp.append( + ( + y_names[i], + np.reshape(convert_float_to_uint16(outs[i]), new_shape), + ) + ) + tmp_names.append(y_names[i]) + + self.x = convert_float_to_uint16(self.x) + self.python_out_sig = tmp_names + self.inputs = {'X': self.x} + self.outputs = {'Y': tmp} + self.attrs = {'axis': self.axis, 'num': self.input_dim[self.axis]} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor(self.inputs['X']) + x.stop_gradient = False + y = paddle.unstack( + x, axis=self.attrs['axis'], num=self.attrs['num'] + ) + dx = paddle.grad(y, x)[0].numpy() + dx_expected = convert_float_to_uint16( + np.ones(self.input_dim, np.float32) + ) + np.testing.assert_array_equal(dx, dx_expected) + + class TestUnstackZeroInputOp(unittest.TestCase): def unstack_zero_input_static(self): From f352c23ed51c050f7273bd0b1ba54edd23f1be68 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 11 Apr 2023 13:31:19 +0800 Subject: [PATCH 055/156] [CustomOP Unittest] Polish unit test of custom operator, kCPU->CPU (#52725) * [CustomOP Unittest] Polish unit test of custom operator, kCPU->CPU * AllocationType::CPU -> is_cpu() --- test/custom_op/attr_test_op.cc | 8 +++--- test/custom_op/context_pool_test_op.cc | 3 +-- test/custom_op/custom_concat_op.cc | 3 +-- test/custom_op/custom_conj_op.cc | 3 +-- test/custom_op/custom_inplace.cc | 31 ++++++++++++------------ test/custom_op/custom_optional.cc | 18 ++++++++------ test/custom_op/custom_relu_op.cc | 16 ++++++------ test/custom_op/custom_relu_op_xpu.cc | 2 +- test/custom_op/custom_simple_slice_op.cc | 3 +-- test/custom_op/custom_tanh_op.cc | 3 +-- test/custom_op/dispatch_test_op.cc | 12 ++++----- test/custom_op/multi_out_test_op.cc | 6 ++--- 12 files changed, 53 insertions(+), 55 deletions(-) diff --git a/test/custom_op/attr_test_op.cc b/test/custom_op/attr_test_op.cc index 14cb0aa7c716d8..819d5e0ea3a2d8 100644 --- a/test/custom_op/attr_test_op.cc +++ b/test/custom_op/attr_test_op.cc @@ -132,7 +132,7 @@ std::vector AttrTestForward( std::vector float_vec_attr, std::vector int64_vec_attr, std::vector str_vec_attr) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -173,7 +173,7 @@ std::vector AttrTestBackward( int int_attr, const std::vector& float_vec_attr, const std::vector& str_vec_attr) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); + auto grad_x = paddle::empty_like(grad_out); PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( @@ -198,7 +198,7 @@ std::vector ConstAttrTestForward( const std::vector& 
float_vec_attr, const std::vector& int64_vec_attr, const std::vector& str_vec_attr) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -239,7 +239,7 @@ std::vector ConstAttrTestBackward( const int& int_attr, const std::vector& float_vec_attr, const std::vector& str_vec_attr) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, grad_out.shape()); + auto grad_x = paddle::empty_like(grad_out); PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { assign_cpu_kernel( diff --git a/test/custom_op/context_pool_test_op.cc b/test/custom_op/context_pool_test_op.cc index 1687bdccc9227d..72b28064f0a3f8 100644 --- a/test/custom_op/context_pool_test_op.cc +++ b/test/custom_op/context_pool_test_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" #include "paddle/phi/backends/context_pool.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") std::vector ContextPoolTest(const paddle::Tensor& x) { // 1. test cpu context diff --git a/test/custom_op/custom_concat_op.cc b/test/custom_op/custom_concat_op.cc index 80f76e2df54fea..e34fffff7b2bb4 100644 --- a/test/custom_op/custom_concat_op.cc +++ b/test/custom_op/custom_concat_op.cc @@ -17,8 +17,7 @@ #include "concat_and_split.h" // NOLINT #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") int64_t ComputeAxis(int64_t axis, int64_t rank) { PD_CHECK(axis >= -rank && axis < rank, diff --git a/test/custom_op/custom_conj_op.cc b/test/custom_op/custom_conj_op.cc index 56938552420e73..0f76f715c427fb 100644 --- a/test/custom_op/custom_conj_op.cc +++ b/test/custom_op/custom_conj_op.cc @@ -18,8 +18,7 @@ #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template using EnableComplex = typename std::enable_if< diff --git a/test/custom_op/custom_inplace.cc b/test/custom_op/custom_inplace.cc index fbbe10b513ece5..f7db7922bf3f72 100644 --- a/test/custom_op/custom_inplace.cc +++ b/test/custom_op/custom_inplace.cc @@ -18,6 +18,8 @@ #include "paddle/extension.h" +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + template void add_data_pointer(const data_t* x_data, data_t* out_data, int64_t numel) { for (size_t i = 0; i < numel; ++i) { @@ -52,7 +54,7 @@ void relu_backward_kernel(const data_t* out_data, } void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "AddForward", ([&] { @@ -63,8 +65,8 @@ void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT std::vector AddBackward(const paddle::Tensor& x, const paddle::Tensor& y, paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(y); paddle::Tensor y_grad = paddle::empty(x.shape(), x.dtype(), x.place()); @@ -92,7 +94,7 @@ PD_BUILD_GRAD_OP(custom_add) // out[i] = x[i] + y void AddVectorForward(std::vector& 
x, // NOLINT const paddle::Tensor& y) { - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(y); PD_DISPATCH_FLOATING_TYPES(y.type(), "AddVectorForward", ([&] { for (size_t i = 0; i < x.size(); ++i) { @@ -109,9 +111,8 @@ std::vector AddVectorBackward( const std::vector& x, const paddle::Tensor& y, std::vector& out_grad) { // NOLINT - PD_CHECK(x[0].place() == paddle::PlaceType::kCPU, - "x[0] must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); + CHECK_INPUT(x[0]); + CHECK_INPUT(y); PD_CHECK(x.size() == out_grad.size(), "x must have the same size as out_grad."); @@ -145,8 +146,8 @@ void MultiInplaceForward(paddle::Tensor& x, // NOLINT const paddle::Tensor& y, paddle::Tensor& a, // NOLINT const paddle::Tensor& b) { - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(a.place() == paddle::PlaceType::kCPU, "a must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(a); PD_DISPATCH_FLOATING_TYPES( x.type(), "MultiInplaceForward", ([&] { @@ -162,10 +163,10 @@ std::vector MultiInplaceBackward( const paddle::Tensor& a, const paddle::Tensor& b, paddle::Tensor& outab_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); - PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor."); - PD_CHECK(a.place() == paddle::PlaceType::kCPU, "a must be a CPU Tensor."); - PD_CHECK(b.place() == paddle::PlaceType::kCPU, "b must be a CPU Tensor."); + CHECK_INPUT(x); + CHECK_INPUT(y); + CHECK_INPUT(a); + CHECK_INPUT(b); paddle::Tensor y_grad = paddle::empty(x.shape(), x.dtype(), x.place()); paddle::Tensor b_grad = paddle::empty(a.shape(), a.dtype(), a.place()); @@ -200,7 +201,7 @@ PD_BUILD_GRAD_OP(custom_multi_inplace) .SetKernelFn(PD_KERNEL(MultiInplaceBackward)); void ReluForwardInplace(paddle::Tensor& x) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); PD_DISPATCH_FLOATING_TYPES(x.type(), "ReluForward", ([&] { relu_forward_kernel(x.data(), @@ -211,7 +212,7 @@ void ReluForwardInplace(paddle::Tensor& x) { // NOLINT void ReluBackwardInplace(const paddle::Tensor& x, const paddle::Tensor& out, paddle::Tensor& grad_out) { // NOLINT - PD_CHECK(out.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(out); PD_DISPATCH_FLOATING_TYPES( grad_out.type(), "ReluBackward", ([&] { diff --git a/test/custom_op/custom_optional.cc b/test/custom_op/custom_optional.cc index 0e28ce84d5a357..9d247f4a27694d 100644 --- a/test/custom_op/custom_optional.cc +++ b/test/custom_op/custom_optional.cc @@ -18,6 +18,8 @@ #include "paddle/extension.h" +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + template void add_one_pointer(const data_t* x_data, data_t* out_data, int64_t numel) { for (size_t i = 0; i < numel; ++i) { @@ -45,7 +47,7 @@ if (y) { std::vector AddForward( const paddle::Tensor& x, const paddle::optional& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor out = paddle::empty(x.shape(), x.dtype(), x.place()); if (y) { @@ -85,7 +87,7 @@ std::vector AddBackward( const paddle::Tensor& x, const paddle::optional& y, const paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); if (y) { @@ -118,7 +120,7 @@ if (y) { std::vector 
AddVectorForward( const paddle::Tensor& x, const paddle::optional>& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor out = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -167,7 +169,7 @@ std::vector AddVectorBackward( const paddle::Tensor& x, const paddle::optional>& y, const paddle::Tensor& out_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); @@ -208,7 +210,7 @@ if (y) { std::vector AddOptionalInplaceForward( const paddle::Tensor& x, paddle::optional& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor outX = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -252,7 +254,7 @@ std::vector AddOptionalInplaceBackward( const paddle::optional& y, const paddle::Tensor& outx_grad, paddle::optional& outy_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); @@ -313,7 +315,7 @@ if (y) { std::vector AddOptionalInplaceVectorForward( const paddle::Tensor& x, paddle::optional>& y) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor outX = paddle::zeros(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( @@ -359,7 +361,7 @@ std::vector AddOptionalInplaceVectorBackward( const paddle::optional>& y, const paddle::Tensor& outx_grad, paddle::optional>& outy_grad) { // NOLINT - PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor."); + CHECK_INPUT(x); paddle::Tensor x_grad = paddle::zeros(x.shape(), x.dtype(), x.place()); diff --git a/test/custom_op/custom_relu_op.cc b/test/custom_op/custom_relu_op.cc index 7575887318ce35..5627bb28b921f4 100644 --- a/test/custom_op/custom_relu_op.cc +++ b/test/custom_op/custom_relu_op.cc @@ -128,9 +128,9 @@ std::vector ReluBackward(const paddle::Tensor& x, std::vector ReluDoubleBackward(const paddle::Tensor& out, const paddle::Tensor& ddx) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_double_backward(out, ddx); - } else if (out.place() == paddle::PlaceType::kGPU) { + } else if (out.is_gpu()) { return relu_cuda_double_backward(out, ddx); } else { PD_THROW("Not implemented."); @@ -179,9 +179,9 @@ std::vector relu_cuda_backward_without_x( std::vector ReluBackwardWithoutX( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_backward_without_x(out, grad_out); - } else if (out.place() == paddle::PlaceType::kGPU) { + } else if (out.is_gpu()) { return relu_cuda_backward_without_x(out, grad_out); } else { PD_THROW("Not implemented."); @@ -235,9 +235,9 @@ void relu_cuda_backward_out(const paddle::Tensor& x, paddle::Tensor* grad_x); void ReluForwardOut(const paddle::Tensor& x, paddle::Tensor* out) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_forward_out(x, out); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_forward_out(x, out); } else { PD_THROW("Not implemented."); @@ -248,9 +248,9 @@ void ReluBackwardOut(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, 
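// For context on the hunks in this file: the patch replaces direct PlaceType
// comparisons with the Tensor::is_cpu()/is_gpu() predicates. A minimal sketch
// of the resulting dispatch idiom, assuming only the public extension API
// (the function name is illustrative and not part of the patch):
//
//   void DispatchByPlace(const paddle::Tensor& t) {
//     if (t.is_cpu()) {
//       // run the CPU kernel
//     } else if (t.is_gpu()) {
//       // run the CUDA kernel
//     } else {
//       PD_THROW("Not implemented.");
//     }
//   }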
paddle::Tensor* grad_x) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_backward_out(x, out, grad_out, grad_x); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_backward_out(x, out, grad_out, grad_x); } else { PD_THROW("Not implemented."); diff --git a/test/custom_op/custom_relu_op_xpu.cc b/test/custom_op/custom_relu_op_xpu.cc index c38f8b877da2c3..ee717785ad8486 100644 --- a/test/custom_op/custom_relu_op_xpu.cc +++ b/test/custom_op/custom_relu_op_xpu.cc @@ -161,7 +161,7 @@ std::vector ReluBackward(const paddle::Tensor& x, std::vector ReluDoubleBackward(const paddle::Tensor& out, const paddle::Tensor& ddx) { - if (out.place() == paddle::PlaceType::kCPU) { + if (out.is_cpu()) { return relu_cpu_double_backward(out, ddx); } else if (out.place().GetType() == phi::AllocationType::XPU) { return relu_xpu_double_backward(out, ddx); diff --git a/test/custom_op/custom_simple_slice_op.cc b/test/custom_op/custom_simple_slice_op.cc index 783e0cd96fdd95..21bd1b8ada27de 100644 --- a/test/custom_op/custom_simple_slice_op.cc +++ b/test/custom_op/custom_simple_slice_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" -#define CHECK_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") std::vector SimpleSliceFunction(const paddle::Tensor& x, int64_t begin_index, diff --git a/test/custom_op/custom_tanh_op.cc b/test/custom_op/custom_tanh_op.cc index 399eb5b6366d77..a7a61b95283520 100644 --- a/test/custom_op/custom_tanh_op.cc +++ b/test/custom_op/custom_tanh_op.cc @@ -18,8 +18,7 @@ #include "paddle/extension.h" -#define CHECK_CPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template void tanh_cpu_forward_kernel(const data_t* x_data, diff --git a/test/custom_op/dispatch_test_op.cc b/test/custom_op/dispatch_test_op.cc index 0f7d323b5451ef..39e1a24fe2327f 100644 --- a/test/custom_op/dispatch_test_op.cc +++ b/test/custom_op/dispatch_test_op.cc @@ -27,7 +27,7 @@ void assign_cpu_kernel(const data_t* x_data, } std::vector DispatchTestInterger(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -45,7 +45,7 @@ PD_BUILD_OP(dispatch_test_integer) std::vector DispatchTestFloatAndInteger( const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -62,7 +62,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)); std::vector DispatchTestComplex(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -80,7 +80,7 @@ PD_BUILD_OP(dispatch_test_complex) std::vector DispatchTestFloatAndComplex( const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -98,7 +98,7 @@ PD_BUILD_OP(dispatch_test_float_and_complex) std::vector DispatchTestFloatAndIntegerAndComplex( const paddle::Tensor& x) { - auto out 
= paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -115,7 +115,7 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)); std::vector DispatchTestFloatAndHalf(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "assign_cpu_kernel", ([&] { diff --git a/test/custom_op/multi_out_test_op.cc b/test/custom_op/multi_out_test_op.cc index d9e0526e4206ea..7007058cbb93ec 100644 --- a/test/custom_op/multi_out_test_op.cc +++ b/test/custom_op/multi_out_test_op.cc @@ -34,7 +34,7 @@ void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) { } std::vector MultiOutCPU(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "assign_cpu_kernel", ([&] { @@ -43,13 +43,13 @@ std::vector MultiOutCPU(const paddle::Tensor& x) { })); // fake multi output: Fake_float64 with float64 dtype - auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto fake_float64 = paddle::empty_like(x); fill_constant_cpu_kernel( fake_float64.mutable_data(x.place()), x.size(), 0.); // fake multi output: ZFake_int32 with int32 dtype - auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto zfake_int32 = paddle::empty_like(x); fill_constant_cpu_kernel( zfake_int32.mutable_data(x.place()), x.size(), 1); From 9b88eef1a1b6ecb5a3c305d57b98e6c0d87bb318 Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Tue, 11 Apr 2023 13:31:29 +0800 Subject: [PATCH 056/156] [Polish CustomOP] Polish python codes, delete useless variable (#52728) --- python/paddle/utils/cpp_extension/extension_utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index e78cc85f73ca0a..8ff70ca4c0e6f9 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -1041,7 +1041,6 @@ def _gen_output_content( ): # ' ' * tab space * tab number indent = ' ' * 4 * 2 - inplace_idx = {v: k for k, v in inplace_reverse_idx.items()} dynamic_content = f""" {indent}res = [] {indent}start_idx = 0""" @@ -1134,7 +1133,6 @@ def _custom_api_content(op_name): attrs_map, inplace_reverse_idx, ) - lower_in_list = [p.split("@")[0].lower() for p in in_names] API_TEMPLATE = textwrap.dedent( """ import paddle.fluid.core as core @@ -1161,11 +1159,6 @@ def {op_name}({params_list}): api_content = API_TEMPLATE.format( op_name=op_name, params_list=params_list, - ins_map=ins_map, - attrs_map=attrs_map, - # "[x, y, z]"" - in_names="[" + ",".join(lower_in_list) + "]", - attr_names="[" + ",".join(attr_names) + "]", outs_list=outs_list, dynamic_content=dynamic_content, static_content=static_content, From 439551bd6edb9191b09da101203c55fc211298fd Mon Sep 17 00:00:00 2001 From: Kai Xing Date: Tue, 11 Apr 2023 14:05:15 +0800 Subject: [PATCH 057/156] [Test MV] fft (#52634) * [Test MV] fft * Update test_spectral_op.py --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ---- test/CMakeLists.txt | 4 +++- .../paddle/fluid/tests/unittests => test}/fft/CMakeLists.txt | 0 {python/paddle/fluid/tests/unittests => test}/fft/__init__.py | 0 
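Before the fft test-move file list continues, a note on the custom-op patches above: they consistently swap the deprecated paddle::Tensor(paddle::PlaceType::kCPU, x.shape()) constructor for paddle::empty_like(x), and raw place comparisons for is_cpu(). A self-contained sketch of a kernel in the new style; the copy loop is illustrative, while the check, allocation, and dispatch calls mirror the patched tests:

#include "paddle/extension.h"

std::vector<paddle::Tensor> AssignCPU(const paddle::Tensor& x) {
  PD_CHECK(x.is_cpu(), "x must be a CPU Tensor.");
  auto out = paddle::empty_like(x);  // replaces the PlaceType constructor
  PD_DISPATCH_FLOATING_TYPES(x.type(), "assign_cpu_kernel", ([&] {
    // data_t is supplied by the dispatch macro
    const data_t* src = x.data<data_t>();
    data_t* dst = out.mutable_data<data_t>(x.place());
    for (int64_t i = 0; i < x.size(); ++i) {
      dst[i] = src[i];
    }
  }));
  return {out};
}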
.../fluid/tests/unittests => test}/fft/spectral_op_np.py | 0 {python/paddle/fluid/tests/unittests => test}/fft/test_fft.py | 0 .../unittests => test}/fft/test_fft_with_static_graph.py | 0 .../fluid/tests/unittests => test}/fft/test_spectral_op.py | 2 +- 8 files changed, 4 insertions(+), 6 deletions(-) rename {python/paddle/fluid/tests/unittests => test}/fft/CMakeLists.txt (100%) rename {python/paddle/fluid/tests/unittests => test}/fft/__init__.py (100%) rename {python/paddle/fluid/tests/unittests => test}/fft/spectral_op_np.py (100%) rename {python/paddle/fluid/tests/unittests => test}/fft/test_fft.py (100%) rename {python/paddle/fluid/tests/unittests => test}/fft/test_fft_with_static_graph.py (100%) rename {python/paddle/fluid/tests/unittests => test}/fft/test_spectral_op.py (99%) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6f461538a7c8d9..491cae679975f4 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -774,10 +774,6 @@ add_subdirectory(sequence) add_subdirectory(rnn) add_subdirectory(distribution) -if(NOT WIN32 OR NOT WITH_GPU) - add_subdirectory(fft) -endif() - if(WITH_XPU) add_subdirectory(xpu) endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d9d9cb5504f1cf..215771713ecc20 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -107,7 +107,9 @@ if(WITH_TESTING) # add_subdirectory(distributed_passes) # add_subdirectory(distribution) add_subdirectory(dygraph_to_static) - # add_subdirectory(fft) + if(NOT WIN32 OR NOT WITH_GPU) + add_subdirectory(fft) + endif() # add_subdirectory(fleet) if(WITH_IPU) add_subdirectory(ipu) diff --git a/python/paddle/fluid/tests/unittests/fft/CMakeLists.txt b/test/fft/CMakeLists.txt similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/CMakeLists.txt rename to test/fft/CMakeLists.txt diff --git a/python/paddle/fluid/tests/unittests/fft/__init__.py b/test/fft/__init__.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/__init__.py rename to test/fft/__init__.py diff --git a/python/paddle/fluid/tests/unittests/fft/spectral_op_np.py b/test/fft/spectral_op_np.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/spectral_op_np.py rename to test/fft/spectral_op_np.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/test/fft/test_fft.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/test_fft.py rename to test/fft/test_fft.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py b/test/fft/test_fft_with_static_graph.py similarity index 100% rename from python/paddle/fluid/tests/unittests/fft/test_fft_with_static_graph.py rename to test/fft/test_fft_with_static_graph.py diff --git a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py b/test/fft/test_spectral_op.py similarity index 99% rename from python/paddle/fluid/tests/unittests/fft/test_spectral_op.py rename to test/fft/test_spectral_op.py index 6b8ab6cc2ff045..075d68b68ed472 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_spectral_op.py +++ b/test/fft/test_spectral_op.py @@ -29,7 +29,7 @@ from paddle import _C_ops sys.path.append("../") -from eager_op_test import OpTest +from paddle.fluid.tests.unittests.eager_op_test import OpTest paddle.enable_static() From 6366cffe07644d32c0d542e0374ef206d86fdb8d Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Tue, 11 Apr 2023 14:06:16 
+0800 Subject: [PATCH 058/156] fix check nan bug (#52729) --- .../auto_code_generator/generator/eager_gen.py | 3 +++ paddle/fluid/eager/nan_inf_utils.cc | 13 +++++++++++++ paddle/fluid/eager/nan_inf_utils.h | 8 ++++++++ 3 files changed, 24 insertions(+) diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 278fbf127036b7..e22355d88d3290 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -276,6 +276,8 @@ class {} : public egr::GradNodeBase {{ // Before log info {} // Forward API Call +{} + // Check NaN and Inf if needed {} // Get Outputs {} @@ -1675,6 +1677,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): forward_api_name, before_log_str, forward_call_str, + check_nan_inf_str, get_outputs_str, forward_api_name, check_inplace_str, diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 17cf8825d5c151..6eae40fca36cfd 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -122,6 +122,11 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { } } +void CheckTensorHasNanOrInf(const std::string& api_name, + const paddle::optional& tensor) { + CheckTensorHasNanOrInf(api_name, tensor.get()); +} + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTwoTensors& tensors) { CheckTensorHasNanOrInf(api_name, std::get<0>(tensors)); @@ -169,6 +174,14 @@ void CheckTensorHasNanOrInf(const std::string& api_name, } } +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::optional>& tensors) { + if (tensors) { + CheckTensorHasNanOrInf(api_name, tensors.get()); + } +} + void CheckTensorHasNanOrInf( const std::string& api_name, const paddle::small_vector, diff --git a/paddle/fluid/eager/nan_inf_utils.h b/paddle/fluid/eager/nan_inf_utils.h index cb19fd2f9d7947..8d7ed7ffb76b20 100644 --- a/paddle/fluid/eager/nan_inf_utils.h +++ b/paddle/fluid/eager/nan_inf_utils.h @@ -20,6 +20,7 @@ #include "paddle/fluid/eager/type_defs.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace egr { @@ -36,6 +37,9 @@ using TupleOfTensorAndVector = void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor); +void CheckTensorHasNanOrInf(const std::string& api_name, + const paddle::optional& tensor); + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTwoTensors& tensors); @@ -54,6 +58,10 @@ void CheckTensorHasNanOrInf(const std::string& api_name, void CheckTensorHasNanOrInf(const std::string& api_name, const std::vector& tensors); +void CheckTensorHasNanOrInf( + const std::string& api_name, + const paddle::optional>& tensors); + void CheckTensorHasNanOrInf(const std::string& api_name, const TupleOfTensorAndVector& tensors); From 29ab75b638e151c424c711b3b1b2543fda3a0e88 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 14:10:49 +0800 Subject: [PATCH 059/156] move test_*tokenizer to /test/tokenizer (#52658) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 -- test/CMakeLists.txt | 2 +- test/tokenizer/CMakeLists.txt | 12 ++++++++++++ .../tests/unittests => test}/tokenizer/__init__.py | 0 .../unittests => test}/tokenizer/bert_tokenizer.py | 0 .../tokenizer}/test_faster_tokenizer_op.py | 5 +---- .../unittests => test}/tokenizer/tokenizer_utils.py | 0 7 files 
changed, 14 insertions(+), 7 deletions(-) create mode 100644 test/tokenizer/CMakeLists.txt rename {python/paddle/fluid/tests/unittests => test}/tokenizer/__init__.py (100%) rename {python/paddle/fluid/tests/unittests => test}/tokenizer/bert_tokenizer.py (100%) rename {python/paddle/fluid/tests/unittests => test/tokenizer}/test_faster_tokenizer_op.py (99%) rename {python/paddle/fluid/tests/unittests => test}/tokenizer/tokenizer_utils.py (100%) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 491cae679975f4..63279cffc3e51b 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -552,8 +552,6 @@ if((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)) endif() set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") -set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS - "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_conv2d_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 215771713ecc20..4d50fe16b9b051 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -126,7 +126,7 @@ if(WITH_TESTING) add_subdirectory(rpc) # add_subdirectory(sequence) add_subdirectory(standalone_executor) - # add_subdirectory(tokenizer) + add_subdirectory(tokenizer) # add_subdirectory(white_list) add_subdirectory(xpu) endif() diff --git a/test/tokenizer/CMakeLists.txt b/test/tokenizer/CMakeLists.txt new file mode 100644 index 00000000000000..1cf384df660b38 --- /dev/null +++ b/test/tokenizer/CMakeLists.txt @@ -0,0 +1,12 @@ +file( + GLOB TEST_OPS + RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" + "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() + +set_tests_properties(test_faster_tokenizer_op PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") diff --git a/python/paddle/fluid/tests/unittests/tokenizer/__init__.py b/test/tokenizer/__init__.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/__init__.py rename to test/tokenizer/__init__.py diff --git a/python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py b/test/tokenizer/bert_tokenizer.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/bert_tokenizer.py rename to test/tokenizer/bert_tokenizer.py diff --git a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py b/test/tokenizer/test_faster_tokenizer_op.py similarity index 99% rename from python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py rename to test/tokenizer/test_faster_tokenizer_op.py index 6972505bf3cbb6..37bb09a514a187 100755 --- a/python/paddle/fluid/tests/unittests/test_faster_tokenizer_op.py +++ b/test/tokenizer/test_faster_tokenizer_op.py @@ -13,20 +13,17 @@ # limitations under the License. 
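# For context on the hunk below: the test now lives in test/tokenizer/ next to
# bert_tokenizer.py, so the sys.path workaround is no longer needed and the
# import collapses to a plain local one. Before and after, as a sketch:
#
#   sys.path.append("./tokenizer")                       # old
#   from tokenizer.bert_tokenizer import BertTokenizer   # old
#
#   from bert_tokenizer import BertTokenizer             # new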
import os -import sys import tempfile import unittest import numpy as np +from bert_tokenizer import BertTokenizer import paddle from paddle import _legacy_C_ops, nn from paddle.fluid.framework import _non_static_mode, core from paddle.fluid.layer_helper import LayerHelper -sys.path.append("./tokenizer") -from tokenizer.bert_tokenizer import BertTokenizer - def to_string_tensor(string_values, name): """ diff --git a/python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py b/test/tokenizer/tokenizer_utils.py similarity index 100% rename from python/paddle/fluid/tests/unittests/tokenizer/tokenizer_utils.py rename to test/tokenizer/tokenizer_utils.py From f03dcff7a57d83bf6997990ced8bcd85e6aadd70 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 11 Apr 2023 14:13:11 +0800 Subject: [PATCH 060/156] Update approver list of checking file diff, test=document_fix (#52756) --- tools/check_file_diff_approvals.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 50f9344c66fe41..8cfcb63e84c700 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -248,8 +248,8 @@ fi NO_NPU_FILE=`git diff --name-only upstream/$BRANCH | grep -v "_npu.py"` HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH ${NO_NPU_FILE} | grep "^+[[:space:]]\{0,\}@unittest.skip" || true` if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1, QingshuChen or qili93) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" - check_approval 1 22165420 6836917 46661762 26922892 16605440 2002279 + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1, QingshuChen, qili93 or ZzSean) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + check_approval 1 22165420 6836917 46661762 26922892 16605440 2002279 32410583 fi HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true` @@ -456,8 +456,8 @@ if [ "${NEW_OP_TEST_ADDED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then CHECK_WHOLE=$CHECK_OUTPUT$CHECK_OUTPUT_WITH_PLACE$CHECK_GRAD$CHECK_GRAD_CHECK if [ "${CHECK_WHOLE}" != "" ] ; then CHECK_OP=${CHECK_WHOLE//+/'\n+'} - echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4, QingshuChen(Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit or phlrain) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. The error line is ${CHECK_OP}\n" - check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 2002279 + echo_line="Please use the default precision parameters of 'atol, rtol, eps, max_relative_error'. If you don't use the default value, you must have one RD (Xreki (Recommend), fuyinno4, QingshuChen(Recommend for kunlun), zhiqiu or qili93 (Recommend for NPU) , luotao1, lanxianghit, phlrain or ZzSean) approval for the usage of other values. The detailed information is in the link: https://github.cor/PaddlePaddle/Paddle/wiki/OP-test-accuracy-requirements. 
The error line is ${CHECK_OP}\n" + check_approval 1 6836917 47554610 12538138 43953930 35824027 6888866 16605440 2002279 32410583 fi fi From 3951c40d911554966726ec575ac303ea89899e0b Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Date: Tue, 11 Apr 2023 15:12:12 +0800 Subject: [PATCH 061/156] delete remote_prefetch (#52748) --- paddle/fluid/operators/hierarchical_sigmoid_op.cc | 1 - paddle/phi/api/yaml/legacy_backward.yaml | 4 ++-- paddle/phi/api/yaml/legacy_ops.yaml | 2 +- paddle/phi/infermeta/multiary.cc | 1 - paddle/phi/infermeta/multiary.h | 1 - paddle/phi/kernels/cpu/hsigmoid_loss_grad.h | 1 - paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc | 2 -- paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc | 1 - paddle/phi/kernels/hsigmoid_loss_grad_kernel.h | 1 - paddle/phi/kernels/hsigmoid_loss_kernel.h | 1 - .../phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc | 2 -- .../phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h | 1 - paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc | 6 +++--- python/paddle/fluid/tests/unittests/test_hsigmoid_op.py | 1 - python/paddle/nn/functional/loss.py | 1 - 15 files changed, 6 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index e1de4a9a4d312c..e73d2a2b5ce399 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -124,7 +124,6 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); // for parameter prefetch - AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); AddAttr>("height_sections", "Height for each output SelectedRows.") diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4ba99b1b813120..4e21865c23b317 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -475,8 +475,8 @@ func : heaviside_grad - backward_op : hsigmoid_loss_grad - forward : hsigmoid_loss (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) - args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, bool is_sparse) + forward : hsigmoid_loss (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) + args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool is_sparse) output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad) infer_meta : func : GeneralTernaryGradInferMeta diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 2d0aadcf5362ca..217afd146f8442 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -684,7 +684,7 @@ backward : heaviside_grad - op : hsigmoid_loss - args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool remote_prefetch, bool is_sparse) + args : (Tensor x, Tensor label, Tensor w, Tensor bias, Tensor path, Tensor code, int num_classes, bool is_sparse) output : 
Tensor(out), Tensor(pre_out), Tensor(w_out) infer_meta : func : HSigmoidLossInferMeta diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7364f85e75155b..71fe149e7c0c0f 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1432,7 +1432,6 @@ void HSigmoidLossInferMeta(const MetaTensor& x, const MetaTensor& path, const MetaTensor& code, int num_classes, - bool remote_prefetch, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 178910e3620c9a..307e6115cfd566 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -312,7 +312,6 @@ void HSigmoidLossInferMeta(const MetaTensor& x, const MetaTensor& path, const MetaTensor& code, int num_classes, - bool remote_prefetch, bool is_sparse, MetaTensor* out, MetaTensor* pre_out, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h index 8c8b40c8d9fd0e..f4b35c91018362 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad.h @@ -35,7 +35,6 @@ void HSigmoidLossGradKernelImpl(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc index bc741b32b3afc9..9b7a2fd574ea85 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_grad_kernel.cc @@ -31,7 +31,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, @@ -46,7 +45,6 @@ void HSigmoidLossGradKernel(const Context& ctx, pre_out, out_grad, num_classes, - remote_prefetch, is_sparse, x_grad, w_grad, diff --git a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc index c6ee49ef34786a..2a611a8d541ca4 100644 --- a/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc +++ b/paddle/phi/kernels/cpu/hsigmoid_loss_kernel.cc @@ -34,7 +34,6 @@ void HSigmoidLossKernel(const Context& ctx, const paddle::optional& path, const paddle::optional& code, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h index c36b343017fd50..254264b8c276e7 100644 --- a/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_grad_kernel.h @@ -29,7 +29,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, DenseTensor* w_grad, diff --git a/paddle/phi/kernels/hsigmoid_loss_kernel.h b/paddle/phi/kernels/hsigmoid_loss_kernel.h index 33a90c637e4e43..f1b659a5ba1295 100644 --- a/paddle/phi/kernels/hsigmoid_loss_kernel.h +++ b/paddle/phi/kernels/hsigmoid_loss_kernel.h @@ -27,7 +27,6 @@ void HSigmoidLossKernel(const Context& ctx, const paddle::optional& path, const paddle::optional& code, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* out, DenseTensor* pre_out, diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc 
b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc index 4bb0352528e4e4..9d450f1d5dbed0 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.cc @@ -48,7 +48,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, @@ -74,7 +73,6 @@ void HSigmoidLossGradKernel(const Context& ctx, pre_out, out_grad, num_classes, - remote_prefetch, is_sparse, x_grad, w_grad_value, diff --git a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h index 94ac63183fbfb8..50719408acf111 100644 --- a/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h +++ b/paddle/phi/kernels/selected_rows/hsigmoid_loss_grad_kernel.h @@ -31,7 +31,6 @@ void HSigmoidLossGradKernel(const Context& ctx, const DenseTensor& pre_out, const DenseTensor& out_grad, int num_classes, - bool remote_prefetch, bool is_sparse, DenseTensor* x_grad, SelectedRows* w_grad, diff --git a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc index a8db0b33242bd9..9499e0b9fc0dd6 100644 --- a/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc +++ b/paddle/phi/ops/compat/hierarchical_sigmoid_sig.cc @@ -20,7 +20,7 @@ KernelSignature HierarchicalSigmoidOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("hsigmoid_loss", {"X", "Label", "W", "Bias", "PathTable", "PathCode"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"Out", "PreOut", "W_Out"}); } @@ -36,7 +36,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else if (ctx.IsSelectedRowsOutput("W@GRAD")) { return KernelSignature("hsigmoid_loss_grad_sr", @@ -48,7 +48,7 @@ KernelSignature HierarchicalSigmoidGradOpArgumentMapping( "Bias", "PreOut", "Out@GRAD"}, - {"num_classes", "remote_prefetch", "is_sparse"}, + {"num_classes", "is_sparse"}, {"X@GRAD", "W@GRAD", "Bias@GRAD"}); } else { return KernelSignature("unregistered", {}, {}, {}); diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 9698fe9c54c05a..752fbab31d57a8 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -177,7 +177,6 @@ def python_api( path_code=None, num_classes=-1, is_sparse=False, - remote_prefetch=False, ): return paddle.nn.functional.hsigmoid_loss( input, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 4b57c9d936123c..c2c98361c75e75 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1016,7 +1016,6 @@ def hsigmoid_loss( attrs = { "num_classes": num_classes, "is_sparse": is_sparse, - "remote_prefetch": is_sparse, } inputs = { From 5ab7927325ea7e77d950a3cf38d9f7cd6d1a483e Mon Sep 17 00:00:00 2001 From: wangxiaoning <71813629+wangxn12138@users.noreply.github.com> Date: Tue, 11 Apr 2023 15:13:13 +0800 Subject: [PATCH 062/156] fix save inf (#52632) --- .../distributed/fleet/runtime/parameter_server_runtime.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git 
a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 24df3203183f5c..3776583371526d 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -24,7 +24,6 @@ Variable, default_main_program, default_startup_program, - save_inference_model, ) from ..base.private_helper_function import wait_server_ready @@ -735,7 +734,7 @@ def _ps_inference_save_inference_model( raise TypeError( "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed" ) - save_inference_model( + paddle.fluid.io.save_inference_model( dirname, feeded_var_names, target_vars, @@ -746,7 +745,7 @@ def _ps_inference_save_inference_model( export_for_deployment, ) else: - save_inference_model( + paddle.fluid.io.save_inference_model( dirname, feeded_var_names, target_vars, From 4a74f4c5aaca9dc36fe2abb7990fe3bd056d87ec Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 15:39:33 +0800 Subject: [PATCH 063/156] support auto generate static for randperm (#52531) * support auto generate static for randperm * remove enforce in randperm infermeta --- paddle/fluid/operators/randperm_op.cc | 98 ------------------- paddle/fluid/operators/unity_build_rule.cmake | 2 - paddle/phi/api/yaml/op_compat.yaml | 6 ++ paddle/phi/api/yaml/static_ops.yaml | 11 +++ paddle/phi/ops/compat/randperm_sig.cc | 25 ----- 5 files changed, 17 insertions(+), 125 deletions(-) delete mode 100644 paddle/fluid/operators/randperm_op.cc delete mode 100644 paddle/phi/ops/compat/randperm_sig.cc diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc deleted file mode 100644 index 187b227f331707..00000000000000 --- a/paddle/fluid/operators/randperm_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/randperm_op.h" - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -class RandpermOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), - true, - platform::errors::NotFound( - "The output(Out) of randperm op must not be null.")); - int n = ctx->Attrs().Get("n"); - PADDLE_ENFORCE_GT( - n, - 0, - platform::errors::InvalidArgument( - "The input 'n' of randperm op should be greater than 0. 
" - "But received %d.", - n)); - - ctx->SetOutputDim("Out", phi::make_ddim({n})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - auto data_type = - static_cast(ctx.Attr("dtype")); - return phi::KernelKey(data_type, ctx.GetPlace()); - } -}; - -class RandpermOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("Out", "The output tensor of randperm op."); - - AddAttr( - "n", "The upper bound (exclusive), and it should be greater than 0."); - AddAttr("dtype", - "The data type of output tensor. " - "Default: 3[int64].") - .SetDefault(framework::proto::VarType::INT64); - AddAttr("seed", - "Random seed used for permute samples. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random permutation every time. " - "Default: 0.") - .SetDefault(0); - - AddComment(R"DOC( -This operator returns a random permutation of integers from 0 to n-1. -)DOC"); - } -}; - -class RandpermOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(framework::InferVarTypeContext *ctx) const override { - auto var_data_type = static_cast( - PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); - ctx->SetOutputDataType("Out", var_data_type); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR( - randperm, - paddle::operators::RandpermOp, - paddle::operators::RandpermOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::operators::RandpermOpVarTypeInference); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 7ca431e8ea5d10..91033e2fa67074 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -222,7 +222,6 @@ register_unity_group( mkldnn/quantize_mkldnn_op.cc queue_generator_op.cc random_crop_op.cc - randperm_op.cc range_op.cc rank_attention_op.cc rank_loss_op.cc @@ -500,7 +499,6 @@ register_unity_group( register_unity_group( cu random_crop_op.cu - randperm_op.cu range_op.cu reverse_op.cu partial_concat_op.cu diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 98a00e6f5a9c06..90c75a8dcc6cd7 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1684,6 +1684,12 @@ tensors_name : ShapeTensorList manual_signature : [randint] +- op : randperm + outputs : + out : Out + extra : + attrs : [int seed = 0] + - op : real backward : real_grad inputs : diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml index 4e0d4cfc931c50..f0f26e27c1f2c4 100644 --- a/paddle/phi/api/yaml/static_ops.yaml +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -260,6 +260,17 @@ param : [low, high, shape, dtype] data_type : dtype +- op : randperm + args : (int n, DataType dtype = DataType::INT64) + output : Tensor(out) + infer_meta : + func : RandpermInferMeta + param : [n, dtype] + kernel : + func : randperm + param : [n, dtype] + data_type : dtype + - op : reduce args : (Tensor x, int ring_id = 0, int root_id = 0, int reduce_type = 0) output : Tensor(out) diff --git a/paddle/phi/ops/compat/randperm_sig.cc b/paddle/phi/ops/compat/randperm_sig.cc deleted file mode 100644 index 14b28512e402a3..00000000000000 --- a/paddle/phi/ops/compat/randperm_sig.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(randperm, phi::RandpermOpArgumentMapping); From 6741dd22f90d82a230b3d0083f496e1ae64e2f50 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Tue, 11 Apr 2023 15:40:34 +0800 Subject: [PATCH 064/156] support auto generate for op average_accumulates (#52704) --- .../fluid/operators/average_accumulates_op.cc | 142 ------------------ paddle/fluid/operators/unity_build_rule.cmake | 2 - paddle/phi/api/yaml/legacy_ops.yaml | 10 -- paddle/phi/api/yaml/ops.yaml | 10 ++ .../phi/ops/compat/average_accumulates_sig.cc | 39 ----- 5 files changed, 10 insertions(+), 193 deletions(-) delete mode 100644 paddle/fluid/operators/average_accumulates_op.cc delete mode 100644 paddle/phi/ops/compat/average_accumulates_sig.cc diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc deleted file mode 100644 index a59b78c3cd44b5..00000000000000 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/infermeta/multiary.h" - -namespace paddle { -namespace operators { - -class AverageAccumulatesOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "param"), - ctx.GetPlace()); - } -}; - -class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("param", "(Tensor), The parameter to be accumulated."); - AddInput("in_sum_1", - "(Tensor), A tensor used to store the parameter " - "sums with the same shape as input(param)."); - AddInput("in_sum_2", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param). 
It is used to avoid loss of precision due to too " - "many sums."); - AddInput("in_sum_3", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param)."); - AddInput("in_num_accumulates", - "(Tensor), The accumulating times of current window with " - "shape [1]."); - AddInput( - "in_old_num_accumulates", - "(Tensor), The accumulating times of previous window with " - "shape [1]."); - AddInput("in_num_updates", - "(Tensor), The total number of batches used by training " - "before this batch with shape [1]."); - - AddOutput("out_sum_1", - "(Tensor), A tensor used to store the " - "parameter sums with the same shape as input(param)."); - AddOutput("out_sum_2", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param). It is used to avoid loss of precision due to too " - "many sums."); - AddOutput("out_sum_3", - "(Tensor), A auxiliary tensor to help " - "accumulating sums of parameter values with the same shape as " - "input(param)."); - AddOutput( - "out_num_accumulates", - "(Tensor), The accumulating times of current window with " - "shape [1]."); - AddOutput( - "out_old_num_accumulates", - "(Tensor) The accumulating times of previous window with " - "shape [1]."); - AddOutput("out_num_updates", - "(Tensor), The total number of batches used by training " - "before this batch with shape [1]."); - - AddAttr("average_window", - "(float, default 0) " - "The rate of average window size relative to num_updates.") - .SetDefault(0); - AddAttr("max_average_window", - "(int64_t) " - "Maximum size of average window. It suggests that the " - "number of mini-batches " - "in one pass is appropriate value to set."); - AddAttr("min_average_window", - "(int64_t, default 10000L) " - "Minimu size of average window.") - .SetDefault(10000L); - - AddComment(R"DOC( -AverageAccumulates Operator. -Accumulate the sum of parameter within sliding window. The size of sliding window is -determined by 'average_window', 'max_average_window' and 'min_average_window'. -Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. -'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. - -All the accumulators were inited to zero before training. 
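The per-batch update steps listed next read naturally as the following sketch
(plain Python; kMaxNumAccumulates is the kernel-side constant, and the value
used here is an assumption):

def average_accumulates_step(param, state, average_window,
                             max_average_window, min_average_window,
                             k_max_num_accumulates=16384):  # value assumed
    state['num_updates'] += 1
    state['num_accumulates'] += 1
    state['sum_1'] += param
    if state['num_updates'] % k_max_num_accumulates == 0:
        state['sum_2'] += state['sum_1']
        state['sum_1'] = 0
    window = min(max_average_window, state['num_updates'] * average_window)
    if (state['num_accumulates'] >= min_average_window
            and state['num_accumulates'] >= window):
        state['sum_3'] = state['sum_1'] + state['sum_2']
        state['sum_1'] = 0
        state['sum_2'] = 0
        state['old_num_accumulates'] = state['num_accumulates']
        state['num_accumulates'] = 0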
- -And for a mini-batch in training, accumulators were computed as below steps: - num_updates += 1 - num_accumulates += 1 - sum_1 += param - if num_updates % kMaxNumAccumulates == 0: - sum_2 += sum_1 - sum_1 = 0 - if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): - sum_3 = sum_1 + sum_2 - sum_1 = 0 - sum_2 = 0 - old_num_accumulates = num_accumulates - num_accumulates = 0 - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(average_accumulates, - AverageAccumulatesInferShapeFunctor, - PD_INFER_META(phi::AverageAccumulatesInferMeta)); - -REGISTER_OPERATOR( - average_accumulates, - ops::AverageAccumulatesOp, - ops::AverageAccumulatesOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AverageAccumulatesInferShapeFunctor); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 91033e2fa67074..af90cbdfc1639a 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -17,7 +17,6 @@ register_unity_group( assert_op.cc assign_value_op.cc attention_lstm_op.cc - average_accumulates_op.cc batch_fc_op.cc bce_loss_op.cc beam_search_op.cc @@ -385,7 +384,6 @@ register_unity_group( conv_transpose_op.cu cos_sim_op.cu crop_op.cu - average_accumulates_op.cu conj_op.cu correlation_op.cu) register_unity_group( diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 217afd146f8442..b075b1935e1bb3 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -153,16 +153,6 @@ data_type : dtype backend : place > output -- op : average_accumulates_ - args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window) - output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) - infer_meta: - func : AverageAccumulatesInferMeta - kernel : - func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} - data_type : param - inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) - - op : batch_norm args : (Tensor x, Tensor mean, Tensor variance, Tensor scale, Tensor bias, bool is_test, float momentum, float epsilon, str data_layout, bool use_global_stats, bool trainable_statistics) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 31f4a114b7142f..aed95190bcfe59 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -185,6 +185,16 @@ data_type : x optional : ins_tag_weight +- op : average_accumulates_ + args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window = 0, int64_t max_average_window = INT64_MAX, int64_t min_average_window = 10000L) + output : Tensor(out_sum_1), Tensor(out_sum_2), 
Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) + infer_meta: + func : AverageAccumulatesInferMeta + kernel : + func : average_accumulates {dense, dense, dense, dense, dense ,dense, dense -> dense, dense, dense, dense, dense, dense} + data_type : param + inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) + - op : bce_loss args : (Tensor input, Tensor label) output : Tensor diff --git a/paddle/phi/ops/compat/average_accumulates_sig.cc b/paddle/phi/ops/compat/average_accumulates_sig.cc deleted file mode 100644 index c14e8ab3575531..00000000000000 --- a/paddle/phi/ops/compat/average_accumulates_sig.cc +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { -KernelSignature AverageAccumulatesOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "average_accumulates", - {"param", - "in_sum_1", - "in_sum_2", - "in_sum_3", - "in_num_accumulates", - "in_old_num_accumulates", - "in_num_updates"}, - {"average_window", "max_average_window", "min_average_window"}, - {"out_sum_1", - "out_sum_2", - "out_sum_3", - "out_num_accumulates", - "out_old_num_accumulates", - "out_num_updates"}); -} -} // namespace phi -PD_REGISTER_ARG_MAPPING_FN(average_accumulates, - phi::AverageAccumulatesOpArgumentMapping); From dd74b3d1859d3349b32f64ff966d4d50c85c81ad Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Tue, 11 Apr 2023 15:46:42 +0800 Subject: [PATCH 065/156] [prim]use Operator to reconstruct the primitive operator defined in c++ (#51997) --- .../utils/static/composite_grad_desc_maker.h | 3 + paddle/fluid/pybind/protobuf.cc | 3 +- python/paddle/fluid/backward.py | 69 ++++++++++++++----- python/paddle/fluid/framework.py | 35 ++++++++-- .../unittests/prim/test_comp_dispensable.py | 45 ++++++++++++ 5 files changed, 129 insertions(+), 26 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py diff --git a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h index 83b18814b19d44..b1b24af231f68d 100644 --- a/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h +++ b/paddle/fluid/prim/utils/static/composite_grad_desc_maker.h @@ -575,6 +575,9 @@ class CompositeGradOpMakerBase { const std::unordered_map& RuntimeAttrs() const { + LOG(WARNING) << "CompositeGradOpMaker doesn't support use runtime attrs, " + "but find the op" + << fwd_op_.Type() << "use runtime attr."; return fwd_op_.GetRuntimeAttrMap(); } diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 9661d5524140bb..5493cc945cf4c2 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -425,7 +425,8 @@ void 
BindOpDesc(pybind11::module *m) {
          &pd::OpDesc::SetDistAttr,
          pybind11::return_value_policy::reference)
      .def("inputs", [](pd::OpDesc &self) { return self.Inputs(); })
-      .def("outputs", &pd::OpDesc::Outputs);
+      .def("outputs", &pd::OpDesc::Outputs)
+      .def("get_attr_map", &pd::OpDesc::GetAttrMap);

   pybind11::class_ scalar(*m, "Scalar", "");
   scalar.def(py::init())
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 9a6572db727788..46f225e0d09105 100755
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -1715,35 +1715,68 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):


 def infershape_for_composite(block, grad_op_desc):
-    # pruning empty output
+    # NOTE: why prune the operator with empty output here?
+    # Some backward operators output an empty var, which causes an infer
+    # shape error, e.g. assign with input's stop_gradient=True
     if len(grad_op_desc.output_arg_names()) == 0:
         return

-    # append op to block
-    op_desc = block.desc.append_op()
-    op_desc.copy_from(grad_op_desc)
-    op_desc._set_attr(
-        core.op_proto_and_checker_maker.kOpRoleAttrName(),
-        core.op_proto_and_checker_maker.OpRole.Backward,
-    )
-
-    # create output var
+    # create output variable
     new_vars = set()
-    # create new gradient variables
-    for grad_var_name in op_desc.output_arg_names():
+    for grad_var_name in grad_op_desc.output_arg_names():
         if not (
             block.desc.has_var_recursive(grad_var_name.encode())
             or grad_var_name == core.empty_var_name()
         ):
-            block.desc.var(grad_var_name.encode())
+            # NOTE: stop_gradient will be set in append_op
+            desc = block.desc.var(grad_var_name.encode())
+            block.create_var(name=grad_var_name, desc=desc, type=desc.type())
             new_vars.add(grad_var_name)

-    # infer shape and infer dtype
-    op_desc.check_attrs()
-    op_desc.infer_var_type(block.desc)
-    op_desc.infer_shape(block.desc)
+    # NOTE: For the primitive operators generated by decomposing a phi grad
+    # kernel, we use Operator to reconstruct the op_desc so that we can reuse
+    # some complex logic, such as processing dispensable inputs, intermediate
+    # outputs, extra attrs, etc...
+    if framework.OpProtoHolder.instance().has_op_proto(grad_op_desc.type()):
+        op = block.append_op(
+            type=grad_op_desc.type(),
+            inputs={
+                name: [block._find_var_recursive(arg) for arg in args]
+                for name, args in grad_op_desc.inputs().items()
+            },
+            outputs={
+                name: [block._find_var_recursive(arg) for arg in args]
+                for name, args in grad_op_desc.outputs().items()
+            },
+            # NOTE: Runtime attrs will be ignored as the c++ GetRuntimeAttr
+            # interface can't be exported to python. Please note the WARNING
+            # message logged in RuntimeAttrs of composite_grad_desc_maker.h
+            attrs=grad_op_desc.get_attr_map(),
+        )
+        op.desc._set_attr(
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.OpRole.Backward,
+        )
+        grad_op_desc.copy_from(op.desc)
+    # For the backward operator, we reuse the logic of _append_backward_var
+    else:
+        op_desc = block.desc.append_op()
+        op_desc.copy_from(grad_op_desc)
+        op_desc._set_attr(
+            core.op_proto_and_checker_maker.kOpRoleAttrName(),
+            core.op_proto_and_checker_maker.OpRole.Backward,
+        )
+        op_desc.check_attrs()
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_shape_(arg, block)
+
+        grad_op_desc.copy_from(op_desc)

-    for arg in op_desc.output_arg_names():
+    # NOTE: Some operators don't infer dtype correctly; this patch sets the
+    # grad_var dtype to be the same as the corresponding forward variable.
+    for arg in grad_op_desc.output_arg_names():
         if arg in new_vars:
             _infer_var_data_type_shape_(arg, block)

diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 708cc462e78ea9..db17ea368849d1 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -2916,14 +2916,35 @@ def find_name(var_list, name):
         for m in proto.outputs:
             if (m.name not in outputs) and m.dispensable:
                 continue
-            if not ((m.name in outputs) or m.dispensable):
-                raise ValueError(
-                    (
-                        "Incorrect setting for output(s) of "
-                        "operator \"%s\", should set: [%s]."
+
+            # FIXME: The outputs of primitive operators currently don't
+            # include intermediate outputs, as those are dropped in operator
+            # codegen, such as the xshape output of reshape2. It will be
+            # fixed when the operator codegen supports intermediate outputs.
+            if core._is_bwd_prim_enabled():
+                if not (
+                    (m.name in outputs)
+                    or m.dispensable
+                    or m.intermediate
+                ):
+                    raise ValueError(
+                        (
+                            "Incorrect setting for output(s) of "
+                            "operator \"%s\", should set: [%s]."
+                        )
+                        % (type, m.name)
                     )
-                    % (type, m.name)
-                )
+            else:
+                if not ((m.name in outputs) or m.dispensable):
+                    raise ValueError(
+                        (
+                            "Incorrect setting for output(s) of "
+                            "operator \"%s\", should set: [%s]."
+                        )
+                        % (type, m.name)
+                    )
+
         for out_proto in proto.outputs:
             if out_proto.name not in outputs:
                 continue
diff --git a/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py b/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py
new file mode 100644
index 00000000000000..a4f4df5fdd1c5d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/prim/test_comp_dispensable.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
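
(Editor's note) The rewritten infershape_for_composite above routes any grad op with a registered OpProto through Block.append_op, so the Operator machinery handles dispensable inputs, intermediate outputs, and default attrs. A minimal, self-contained sketch of that append_op pattern follows; the op and variable names here are illustrative only and are not part of the patch:

import paddle

paddle.enable_static()
main_program = paddle.static.Program()
with paddle.static.program_guard(main_program):
    block = main_program.global_block()
    x = block.create_var(name='x', shape=[2, 3], dtype='float32')
    out = block.create_var(name='x_scaled', shape=[2, 3], dtype='float32')
    # append_op consults the OpProto of 'scale', checking attrs and filling
    # defaults, just like the reconstruction path above.
    op = block.append_op(
        type='scale',
        inputs={'X': [x]},
        outputs={'Out': [out]},
        attrs={'scale': 2.0},
    )
    print(op.type, op.input('X'), op.output('Out'))
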
+ +import unittest + +import paddle + + +class TestDispensable(unittest.TestCase): + def setUp(self): + paddle.fluid.core._set_prim_all_enabled(True) + + def tearDown(self): + paddle.fluid.core._set_prim_all_enabled(False) + + def test_dispensable(self): + @paddle.jit.to_static + def f(x): + return paddle.split(x, num_or_sections=2) + + f = paddle.jit.to_static(f) + x = paddle.rand((8,)) + x.stop_gradient = False + + op = f.get_concrete_program(x)[1].backward_program.block(0).ops[-1] + self.assertEqual( + op.attr('op_role'), + int(paddle.fluid.core.op_proto_and_checker_maker.OpRole.Backward), + ) + self.assertIn('AxisTensor', op.input_names) + + +if __name__ == '__main__': + unittest.main() From 327c0e4dfc8c32cceed653499422dea4713e2c54 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:28:46 +0800 Subject: [PATCH 066/156] [Prim] Reset base prim resnet ci result (#52752) * change base res * fix prim cinn res --- test/prim/model/test_resnet_prim_cinn.py | 37 ++++++++++++------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py index deda6671d52a86..2012d84546e642 100644 --- a/test/prim/model/test_resnet_prim_cinn.py +++ b/test/prim/model/test_resnet_prim_cinn.py @@ -70,15 +70,15 @@ # The results in ci as as follows: DY2ST_PRIM_GT = [ 5.82879114151001, - 8.333706855773926, - 5.07769250869751, - 8.66937255859375, - 8.411705017089844, - 7.252340793609619, - 9.683248519897461, - 8.177335739135742, - 8.195427894592285, - 10.219732284545898, + 8.33370590209961, + 5.091761589050293, + 8.776082992553711, + 8.274380683898926, + 7.546653747558594, + 9.607137680053711, + 8.27371597290039, + 8.429732322692871, + 10.362630844116211, ] DY2ST_CINN_GT = [ 5.828789710998535, @@ -92,17 +92,18 @@ 8.383116722106934, 10.120304107666016, ] + DY2ST_PRIM_CINN_GT = [ 5.828784942626953, - 8.341737747192383, - 5.113619327545166, - 8.625601768493652, - 8.082450866699219, - 7.4913249015808105, - 9.858025550842285, - 8.287693977355957, - 8.435812950134277, - 10.372406005859375, + 8.34173583984375, + 5.116049289703369, + 8.511833190917969, + 7.9524407386779785, + 7.395752906799316, + 9.666715621948242, + 8.277752876281738, + 8.718518257141113, + 10.199666023254395, ] if core.is_compiled_with_cuda(): From 259b0aadb60c1530bd3996eae22870ca7e749bf5 Mon Sep 17 00:00:00 2001 From: wz1qqx <55830058+wz1qqx@users.noreply.github.com> Date: Tue, 11 Apr 2023 16:51:20 +0800 Subject: [PATCH 067/156] [XPU] fix error pattern and rename max name (#52726) --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 56 ++++++++++++------- paddle/phi/api/yaml/fused_ops.yaml | 8 +-- paddle/phi/infermeta/fusion.cc | 14 ++--- paddle/phi/infermeta/fusion.h | 8 +-- .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 25 ++++----- 5 files changed, 63 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index f124c3cc44adf0..0b591120014e35 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -99,13 +99,15 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, auto conv = pattern->NewNode(conv_repr())->assert_is_op(conv_type_); auto input = pattern->NewNode(input_repr()) ->assert_is_op_input(conv_type_, "Input") - ->AsInput(); + ->AsInput() + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 4; 
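+                        // (editor's note) The rank-4 check above restricts
+                        // this fusion pattern to NCHW conv inputs; nodes of
+                        // any other rank simply never match the pattern.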
+ }); auto conv_filter = pattern->NewNode(conv_filter_repr()) ->assert_is_op_input(conv_type_, "Filter") ->AsInput(); auto conv_out = pattern->NewNode(conv_out_repr()) - ->assert_is_op_output(conv_type_, "Output") - ->assert_var_not_persistable(); + ->assert_is_op_output(conv_type_, "Output"); conv->LinksFrom({input, conv_filter}).LinksTo({conv_out}); // ew_bias_add op PDNode* ew_bias_add = nullptr; @@ -116,11 +118,17 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, ew_bias_add_y = pattern->NewNode(ew_bias_add_y_repr()) ->assert_is_op_input("elementwise_add", "Y") ->assert_is_persistable_var() - ->assert_has_n_outputs(1); + ->assert_has_n_outputs(1) + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 1; + }); ew_bias_add = pattern->NewNode(ew_bias_add_repr())->assert_is_op("elementwise_add"); ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr()) ->assert_is_op_output("elementwise_add", "Out"); + if (with_bn_ || with_branch_ || !act_type_.empty()) { + ew_bias_add_out->assert_has_n_outputs(1); + } ew_bias_add->LinksFrom({conv_out, ew_bias_add_y}) .LinksTo({ew_bias_add_out}); } else { @@ -159,6 +167,9 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm"); bn_out = pattern->NewNode(bn_out_repr())->assert_is_op_output("batch_norm", "Y"); + if (with_branch_ || !act_type_.empty()) { + bn_out->assert_has_n_outputs(1); + } bn_mean_out = pattern->NewNode(bn_mean_out_repr()) ->assert_is_op_output("batch_norm", "MeanOut"); bn_saved_mean = pattern->NewNode(bn_saved_mean_repr()) @@ -179,23 +190,27 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, bn_out->assert_is_op_input("elementwise_add", "Y")->AsIntermediate(); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "X") - ->AsInput() - ->assert_more([](Node* node) { - return node->Var()->GetShape().size() == 4; - }); + ->AsInput(); } else if (with_branch_y_) { bn_out->assert_is_op_input("elementwise_add", "X")->AsIntermediate(); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "Y") - ->AsInput() - ->assert_more([](Node* node) { - return node->Var()->GetShape().size() == 4; - }); + ->AsInput(); } - ew_branch_add = - pattern->NewNode(ew_branch_add_repr())->assert_is_op("elementwise_add"); + ew_branch_add = pattern->NewNode(ew_branch_add_repr()) + ->assert_is_op("elementwise_add") + ->assert_more([](Node* node) { + if (node->inputs.size() != 2) { + return false; + } + return node->inputs[0]->Var()->GetShape() == + node->inputs[1]->Var()->GetShape(); + }); ew_branch_add_out = pattern->NewNode(ew_branch_add_out_repr()) ->assert_is_op_output("elementwise_add", "Out"); + if (!act_type_.empty()) { + ew_branch_add_out->assert_has_n_outputs(1); + } ew_branch_add->LinksFrom({bn_out, ew_branch_add_in}) .LinksTo({ew_branch_add_out}); } else { @@ -401,6 +416,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, scope->FindVar(conv_filter->Name())->GetMutable(); auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_conv_bias; + bool has_branch = with_branch_x || with_branch_y; // Create conv_fusion_bias (conv bias) variable Node* fusion_bias_node = nullptr; if (has_bias) { @@ -501,18 +517,17 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, framework::OpDesc conv2d_xpu_op_desc(block); // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); - conv2d_xpu_op_desc.SetInput("input", {input->Name()}); + 
conv2d_xpu_op_desc.SetInput("x", {input->Name()}); conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()}); conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()}); - conv2d_xpu_op_desc.SetOutput("output", {conv2d_xpu_out_name}); - conv2d_xpu_op_desc.SetOutput("output_max", {conv_out_max_name}); + conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name}); + conv2d_xpu_op_desc.SetOutput("out_max", {conv_out_max_name}); // set fusion_bias input node if (has_bias) { conv2d_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()}); - conv2d_xpu_op_desc.SetAttr("has_bias", has_bias); } // set ew_branch_add input node - if (ew_branch_add_in != nullptr) { + if (ew_branch_add != nullptr) { conv2d_xpu_op_desc.SetInput("branch", {ew_branch_add_in->Name()}); } // set attrs of conv2d_xpu @@ -566,7 +581,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, conv2d_xpu_op_desc.SetAttr("place_z", std::vector{10}); conv2d_xpu_op_desc.SetAttr("paddings", conv_paddings); conv2d_xpu_op_desc.SetAttr("block_lod", std::vector{1}); - conv2d_xpu_op_desc.SetAttr("has_branch", with_branch_x || with_branch_y); + conv2d_xpu_op_desc.SetAttr("has_branch", has_branch); + conv2d_xpu_op_desc.SetAttr("has_bias", has_bias); auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); IR_NODE_LINK_TO(input, conv2d_xpu); diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index c9fae2a81e3b74..b43d02fced54ce 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -5,14 +5,14 @@ # otherwise the operator only could be used in static mode. - op : conv2d_xpu - args : (Tensor input, Tensor input_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) - output : Tensor(output), Tensor(output_max) + args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) + output : Tensor(out), Tensor(out_max) infer_meta : func : Conv2dXPUInferMeta kernel : func : conv2d_xpu - data_type : input - optional : bias, branch, input_max + data_type : x + optional : bias, branch, x_max - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx) diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index ad8409487bb589..5c0aa3b8e89fdb 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -35,8 +35,8 @@ inline int ConvOutSize(int input_size, return output_size; } -void Conv2dXPUInferMeta(const MetaTensor& input, - const MetaTensor& input_max, +void Conv2dXPUInferMeta(const MetaTensor& x, + const MetaTensor& x_max, const MetaTensor& filter, const MetaTensor& filter_max, const MetaTensor& bias, @@ -50,9 +50,9 @@ void Conv2dXPUInferMeta(const MetaTensor& input, bool has_branch, int act_type, float act_param, - MetaTensor* output, - MetaTensor* output_max) { - auto in_dims = input.dims(); + MetaTensor* out, + MetaTensor* out_max) { + auto in_dims = x.dims(); auto filter_dims = filter.dims(); // do some checks PADDLE_ENFORCE_EQ( @@ -157,8 +157,8 @@ void Conv2dXPUInferMeta(const MetaTensor& input, strides[i])); } // set output and output max dims - output->set_dims(DDim(out_shape.data(), out_shape.size())); - output_max->set_dims(phi::make_ddim({4})); + 
out->set_dims(DDim(out_shape.data(), out_shape.size()));
+  out_max->set_dims(phi::make_ddim({4}));
 }

 void EmbeddingWithEltwiseAddXPUInferMeta(
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 9dcf7342ae1936..3105ea8a6d5781 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -22,8 +22,8 @@ namespace phi {
 // Common InferMeta Functions for fusion operators.
 // NOTE: The InferMeta Functions in this file are arranged in alphabetic order.

-void Conv2dXPUInferMeta(const MetaTensor& input,
-                        const MetaTensor& input_max,
+void Conv2dXPUInferMeta(const MetaTensor& x,
+                        const MetaTensor& x_max,
                         const MetaTensor& filter,
                         const MetaTensor& filter_max,
                         const MetaTensor& bias,
@@ -37,8 +37,8 @@ void Conv2dXPUInferMeta(const MetaTensor& input,
                         bool has_branch,
                         int act_type,
                         float act_param,
-                        MetaTensor* output,
-                        MetaTensor* output_max);
+                        MetaTensor* out,
+                        MetaTensor* out_max);

 void EmbeddingWithEltwiseAddXPUInferMeta(
     const std::vector<const MetaTensor*>& ids,
diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc
index 9da39097e0f8d7..0f7d8902de3284 100644
--- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc
+++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc
@@ -21,8 +21,8 @@ namespace fusion {

 template <typename T, typename Context>
 void Conv2dXPUKernel(const Context& ctx,
-                     const DenseTensor& input,
-                     const paddle::optional<DenseTensor>& input_max,
+                     const DenseTensor& x,
+                     const paddle::optional<DenseTensor>& x_max,
                      const DenseTensor& filter,
                      const DenseTensor& filter_max,
                      const paddle::optional<DenseTensor>& bias,
@@ -36,10 +36,10 @@ void Conv2dXPUKernel(const Context& ctx,
                      bool has_branch,
                      int act_type,
                      float act_param,
-                     DenseTensor* output,
-                     DenseTensor* output_max) {
+                     DenseTensor* out,
+                     DenseTensor* out_max) {
   using XPUType = typename XPUTypeTrait<T>::Type;
-  auto input_dims = input.dims();
+  auto input_dims = x.dims();
   auto filter_dims = filter.dims();
   // update paddings and dilations according to padding_algorithm
   std::vector<int> paddings_vec = paddings;
@@ -62,17 +62,16 @@ void Conv2dXPUKernel(const Context& ctx,
   int win_h = static_cast<int>(filter_dims[2]);
   int win_w = static_cast<int>(filter_dims[3]);

-  auto* input_data = reinterpret_cast<const XPUType*>(input.data<T>());
-  const float* input_max_data = input_max.get_ptr() == nullptr
-                                    ? nullptr
-                                    : input_max.get_ptr()->data<float>();
+  auto* input_data = reinterpret_cast<const XPUType*>(x.data<T>());
+  const float* input_max_data =
+      x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data<float>();
   auto* branch_data =
       branch.get_ptr() == nullptr
           ? nullptr
           : reinterpret_cast<const XPUType*>(branch.get_ptr()->data<T>());
   const float* bias_data = bias.get_ptr() == nullptr ?
nullptr : bias.get_ptr()->data(); - auto* out_data = reinterpret_cast(ctx.template Alloc(output)); + auto* out_data = reinterpret_cast(ctx.template Alloc(out)); xpu::Activation_t act(static_cast(act_type)); if (act_type == xpu::Activation_t::LEAKY_RELU) { @@ -98,13 +97,13 @@ void Conv2dXPUKernel(const Context& ctx, /* int64_t groups */ groups, /* const float* in_maxptr */ input_max_data, /* const float* filter_maxptr */ filter_max.data(), - /* float* out_maxptr */ ctx.template Alloc(output_max), + /* float* out_maxptr */ ctx.template Alloc(out_max), /* bool is_nchw */ true, /* const float* bias */ bias_data, /* const TY* branch */ branch_data, /* const baidu::xpu::api::Activation_t& act */ act, - /* const float* branch_maxptr */ nullptr); - // /* const float* scale */ nullptr); + /* const float* branch_maxptr */ nullptr, + /* const float* scale */ nullptr); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); } From 10fd4a95b30195e0cff4fbaa6d2a0cd8039ce129 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Tue, 11 Apr 2023 17:04:52 +0800 Subject: [PATCH 068/156] [Paddle Inference] Predictor support paddle::Tensor (#50445) --- cmake/phi_header.cmake | 34 ++-- paddle/fluid/framework/feed_fetch_method.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 151 ++++++++++++++++-- .../fluid/inference/api/analysis_predictor.h | 42 ++++- .../inference/api/demo_ci/CMakeLists.txt | 2 +- paddle/fluid/inference/api/paddle_api.h | 10 ++ .../inference/api/paddle_inference_api.h | 11 ++ paddle/fluid/inference/api/paddle_tensor.h | 2 + paddle/fluid/jit/engine/predictor_engine.cc | 136 +--------------- paddle/fluid/pybind/eager_utils.cc | 4 + paddle/fluid/pybind/inference_api.cc | 31 ++-- paddle/phi/api/include/tensor.h | 6 +- paddle/phi/api/lib/api_custom_impl.cc | 1 + .../inference/test_inference_predictor_run.py | 128 +++++++++++++++ 14 files changed, 382 insertions(+), 178 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake index d5000eadbd14de..b1476761897eaa 100644 --- a/cmake/phi_header.cmake +++ b/cmake/phi_header.cmake @@ -17,24 +17,21 @@ set(PADDLE_INFERENCE_INSTALL_DIR function(phi_header_path_compat TARGET_PATH) message(STATUS "phi header path compat processing: ${TARGET_PATH}") - string(FIND ${TARGET_PATH} "experimental" pos) - if(pos GREATER 1) - file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") - foreach(header ${HEADERS}) - if(${header} MATCHES ".*.h$") - file(READ ${header} HEADER_CONTENT) - string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" - HEADER_CONTENT "${HEADER_CONTENT}") - string(REPLACE "paddle/fluid/platform/" - "paddle/include/experimental/phi/" HEADER_CONTENT - "${HEADER_CONTENT}") - string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" - HEADER_CONTENT "${HEADER_CONTENT}") - file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "phi header path compat processing complete: ${header}") - endif() - endforeach() - endif() + file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") + foreach(header ${HEADERS}) + if(${header} MATCHES ".*.h$") + file(READ ${header} HEADER_CONTENT) + string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" + HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/fluid/platform/" + "paddle/include/experimental/phi/" HEADER_CONTENT + "${HEADER_CONTENT}") + string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" + HEADER_CONTENT "${HEADER_CONTENT}") + file(WRITE ${header} "${HEADER_CONTENT}") + 
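+      # (editor's note) Illustration, assuming a typical phi header: an
+      # include line such as
+      #   #include "paddle/phi/core/dense_tensor.h"
+      # is rewritten by the string(REPLACE ...) calls above into
+      #   #include "paddle/include/experimental/phi/core/dense_tensor.h"
+      # so the installed headers resolve against the inference package layout.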
message(STATUS "phi header path compat processing complete: ${header}") + endif() + endforeach() endfunction() phi_header_path_compat( @@ -51,6 +48,7 @@ phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index f21ca0c858acc0..0294e1ca54b437 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -95,7 +95,7 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope, PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( - "Only support lod tensor in GetVariableTensor now.")); + "Only support DenseTensor in GetVariableTensor now.")); return *var->GetMutable(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b07c47b81eff45..38222b797f14fd 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -155,11 +155,10 @@ phi::Backend ConvertBackend(paddle_infer::PlaceType backend) { return phi::Backend::CPU; } } -} // namespace -bool PaddleTensorToLoDTensor(const PaddleTensor &pt, - phi::DenseTensor *t, - const platform::Place &place) { +bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + phi::DenseTensor *t, + const platform::Place &place) { framework::DDim ddim = phi::make_ddim(pt.shape); void *input_ptr; if (pt.dtype == PaddleDType::INT64) { @@ -270,6 +269,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, t->set_lod(lod); return true; } +} // namespace bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, @@ -919,6 +919,17 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #endif } +void AnalysisPredictor::MkldnnPreSet( + const std::vector &inputs) { +#ifdef PADDLE_WITH_MKLDNN + std::vector> inputs_shape; + for (size_t i = 0; i < inputs.size(); ++i) { + inputs_shape.emplace_back(phi::vectorize(inputs[i].dims())); + } + MkldnnPreSet(inputs_shape); +#endif +} + void AnalysisPredictor::MkldnnPreSet( const std::vector> &inputs_shape) { #ifdef PADDLE_WITH_MKLDNN @@ -1033,6 +1044,70 @@ bool AnalysisPredictor::Run(const std::vector &inputs, return true; } +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *outputs) { + inference::DisplayMemoryInfo(place_, "before run"); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPreSet(inputs); +#endif + VLOG(3) << "predict start"; + // set feed variable + framework::Scope *scope = sub_scope_ ? 
sub_scope_ : scope_.get(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::PreconditionNotMet("The scope should not be nullptr.")); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + +#ifdef PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled()) { + inference::tensorrt::TensorRTEngine::predictor_id_per_thread = + predictor_id_; + VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: " + << inference::tensorrt::TensorRTEngine::predictor_id_per_thread; + } +#endif + + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); + + inference::DisplayMemoryInfo(place_, "after run"); + + // get fetch variable + if (!GetFetch(outputs, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + + // All the containers in the scope will be hold in inference, but the + // operators assume that the container will be reset after each batch. + // Here is a bugfix, collect all the container variables, and reset then to a + // bool; the next time, the operator will call MutableData and construct a new + // container again, so that the container will be empty for each batch. + if (sub_scope_) { + tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); + } + tensor_array_batch_cleaner_.ResetNoTensorVars(); + + // recover the cpu_math_library_num_threads to 1, in order to avoid thread + // conflict when integrating it into deployment service. + paddle::platform::SetNumThreads(1); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPostReset(); +#endif +#if defined(PADDLE_WITH_MKLML) + // Frees unused memory allocated by the Intel® MKL Memory Allocator to + // avoid memory leak. See: + // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers + platform::dynload::MKL_Free_Buffers(); +#endif + return true; +} + bool AnalysisPredictor::SetFeed(const std::vector &inputs, framework::Scope *scope) { VLOG(3) << "Predictor::set_feed"; @@ -1047,7 +1122,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, for (size_t i = 0; i < inputs.size(); ++i) { phi::DenseTensor *input = &feed_tensors_[i]; - if (!PaddleTensorToLoDTensor(inputs[i], input, place_)) { + if (!PaddleTensorToDenseTensor(inputs[i], input, place_)) { return false; } int idx = -1; @@ -1061,7 +1136,41 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } else { idx = PADDLE_GET_CONST(int, feeds_[i]->GetAttr("col")); } - framework::SetFeedVariable(scope, *input, "feed", idx); + framework::SetFeedVariable(scope, *input, framework::kFeedOpType, idx); + } + return true; +} + +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + PADDLE_ENFORCE_EQ(inputs.size(), + feeds_.size(), + platform::errors::InvalidArgument( + "wrong feed input size, need %d but get %d.", + feeds_.size(), + inputs.size())); + for (size_t i = 0; i < inputs.size(); ++i) { + PADDLE_ENFORCE_EQ(inputs[i].initialized(), + true, + paddle::platform::errors::InvalidArgument( + "The input Tensor expected to be initialized.")); + } + + if (std::all_of(inputs.cbegin(), inputs.cend(), [&](const paddle::Tensor &t) { + return !t.name().empty() && feed_names_.count(t.name()); + })) { + for (size_t i = 0; i < inputs.size(); ++i) { + auto &t = framework::GetVariableTensor(*scope, inputs[i].name()); + t.ShareDataWith( + *std::dynamic_pointer_cast(inputs[i].impl())); + } + } else { + for (size_t i = 0; i < inputs.size(); ++i) { + auto &t = 
framework::GetVariableTensor(*scope, idx2feeds_[i]); + t.ShareDataWith( + *std::dynamic_pointer_cast(inputs[i].impl())); + } } return true; } @@ -1100,7 +1209,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, idx, i)); framework::FetchType &fetch_var = - framework::GetFetchVariable(*scope, "fetch", idx); + framework::GetFetchVariable(*scope, framework::kFetchOpType, idx); auto &fetch = PADDLE_GET(phi::DenseTensor, fetch_var); auto type = framework::TransToProtoVarType(fetch.dtype()); auto output = &(outputs->at(i)); @@ -1125,6 +1234,19 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, return true; } +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + auto const &name = idx2fetches_[i]; + auto &t = framework::GetVariableTensor(*scope, name); + (*outputs)[i] = + std::move(paddle::Tensor(std::make_shared(t), name)); + } + return true; +} + void AnalysisPredictor::PrepareArgument() { VLOG(3) << "AnalysisPredictor::PrepareArgument"; // Init std::unique_ptr argument_. @@ -1579,7 +1701,7 @@ void AnalysisPredictor::PrepareFeedFetch() { "The sub_scope should not be nullptr.")); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { - if (op->Type() == "feed") { + if (op->Type() == framework::kFeedOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (feeds_.size() <= static_cast(idx)) { feeds_.resize(idx + 1); @@ -1587,7 +1709,7 @@ void AnalysisPredictor::PrepareFeedFetch() { feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; idx2feeds_[idx] = op->Output("Out")[0]; - } else if (op->Type() == "fetch") { + } else if (op->Type() == framework::kFetchOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); @@ -1602,9 +1724,9 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("The scope should not be nullptr.")); - auto *var = scope->Var("feed"); + auto *var = scope->Var(framework::kFeedOpType); var->GetMutable(); - var = scope->Var("fetch"); + var = scope->Var(framework::kFetchOpType); var->GetMutable(); } @@ -2186,7 +2308,7 @@ void AnalysisPredictor::ClearIntermediateTensor() { const std::string name = var->Name(); auto *variable = executor_->GetScope()->FindVar(name); if (variable != nullptr && variable->IsType() && - name != "feed" && name != "fetch") { + name != framework::kFeedOpType && name != framework::kFetchOpType) { VLOG(3) << "Clear Intermediate Tensor: " << name; auto *t = variable->GetMutable(); t->clear(); @@ -2653,6 +2775,11 @@ std::map Predictor::GetOutputTypes() { bool Predictor::Run() { return predictor_->ZeroCopyRun(); } +bool Predictor::Run(const std::vector &inputs, + std::vector *outputs) { + return predictor_->Run(inputs, outputs); +} + std::unique_ptr Predictor::Clone(void *stream) { auto analysis_pred = predictor_->Clone(stream); std::unique_ptr pred(new Predictor(std::move(analysis_pred))); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5a578a9b94fcb7..83207a8bfd654c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -31,15 +31,14 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/resource_manager.h" #include 
"paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_TESTING #include #include #endif namespace paddle_infer { -using float16 = paddle::platform::float16; namespace experimental { class InternalUtils; }; @@ -150,6 +149,16 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + /// + /// \brief Run the prediction engine (Recommended). + /// + /// \param[in] inputs input tensors + /// \param[out] outputs output tensors + /// \return Whether the function executed successfully + /// + bool Run(const std::vector &inputs, + std::vector *outputs) override; + /// /// \brief Get the input names /// @@ -378,6 +387,17 @@ class AnalysisPredictor : public PaddlePredictor { /// bool SetFeed(const std::vector &input_datas, framework::Scope *scope); + + /// + /// \brief Prepare input data, only used in Run() + /// + /// \param[in] inputs inpute tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool SetFeed(const std::vector &inputs, + framework::Scope *scope); + /// /// \brief Get the output data, only used in Run() /// @@ -387,6 +407,16 @@ class AnalysisPredictor : public PaddlePredictor { /// bool GetFetch(std::vector *output_data, framework::Scope *scope); + + /// + /// \brief Get the output data, only used in Run() + /// + /// \param[out] outputs output tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool GetFetch(std::vector *outputs, framework::Scope *scope); + /// /// \brief Get the output data, only used in GetFetch() /// @@ -404,6 +434,14 @@ class AnalysisPredictor : public PaddlePredictor { /// \param[in] inputs tensors /// void MkldnnPreSet(const std::vector &inputs); + /// + /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. + /// + /// Used in AnalysisPredictor::Run(). + /// + /// \param[in] inputs tensors + /// + void MkldnnPreSet(const std::vector &inputs); /// /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index fc23caee656380..11f214bc45d535 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -83,7 +83,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index e83c1a9f9444c9..3a51f91b3afc22 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -221,6 +221,16 @@ class PD_INFER_DECL PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + /// \brief This interface takes input and runs the network (Recommended). + /// \param[in] inputs An list of Tensor as the input to the network. + /// \param[out] output_data Pointer to the tensor list, which holds the output + /// Tensor + /// \return Whether the run is successful + virtual bool Run(const std::vector& inputs, + std::vector* outputs) { + return false; + } + /// \brief Used to get the name of the network input. 
/// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. /// \return Input tensor names. diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index d7f15e0529894f..54a9d9af117caa 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -128,6 +128,17 @@ class PD_INFER_DECL Predictor { /// bool Run(); + /// + /// \brief Run the prediction engine (Recommended) + /// + /// \param[in] inputs An list of Tensor as the input to the network. + /// \param[out] outputs Pointer to the tensor list, which holds the output + /// Tensor + /// + /// \return Whether the run is successful + bool Run(const std::vector& inputs, + std::vector* outputs); + /// /// \brief Get the output names /// diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 0301892792dc30..b9c86a60f55b84 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -21,6 +21,8 @@ #include "paddle_infer_declare.h" // NOLINT +#include "paddle/phi/api/include/tensor.h" // expose paddle::Tensor + #ifdef PADDLE_WITH_ONNXRUNTIME #include "onnxruntime_c_api.h" // NOLINT #include "onnxruntime_cxx_api.h" // NOLINT diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index d18f4f487dbe2e..54e35bc0f69dd6 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -22,11 +22,6 @@ namespace paddle { namespace jit { -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t); -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place); - PredictorEngine::PredictorEngine( const std::shared_ptr &info, const std::shared_ptr ¶ms_dict, @@ -52,6 +47,7 @@ PredictorEngine::PredictorEngine( config.SetSkipLoadParams(true); config.SetApplyOptim(true); config.SwitchIrOptim(true); + config.SwitchUseFeedFetchOps(false); predictor_.reset(new AnalysisPredictor(config)); @@ -78,135 +74,15 @@ std::unique_ptr PredictorEngine::Clone(void *stream) { std::vector PredictorEngine::operator()( const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); -} - -std::vector PredictorEngine::operator()( - const std::vector &inputs) { - std::vector pt_inputs; - std::vector pt_outputs; - for (auto &t : inputs) { - auto non_const_t = const_cast(&t); - pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t)); - } - - predictor_->Run(pt_inputs, &pt_outputs); - - std::vector outputs; - for (auto &pt : pt_outputs) { - DenseTensor t; - PaddleTensorToDenseTensor(pt, &t, place_); - outputs.emplace_back(t); - } + std::vector outputs; + predictor_->Run(inputs, &outputs); return outputs; } -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { - PaddleTensor pt; - switch (framework::TransToProtoVarType(t->dtype())) { - case framework::proto::VarType::INT32: { - pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); - pt.dtype = PaddleDType::INT32; - } break; - case framework::proto::VarType::INT64: { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); - pt.dtype = PaddleDType::INT64; - } break; - case framework::proto::VarType::FP32: { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); - pt.dtype = PaddleDType::FLOAT32; - } break; - default: - PADDLE_THROW( - 
platform::errors::Unimplemented("Unsupported tensor date type. Now " - "only supports INT64, FP32, INT32.")); - } - pt.shape = phi::vectorize(t->dims()); - return pt; -} - -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place) { - framework::DDim ddim = phi::make_ddim(pt.shape); - void *input_ptr; - switch (pt.dtype) { - case PaddleDType::INT64: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::INT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT16: - input_ptr = t->mutable_data(ddim, place); - break; - default: - LOG(ERROR) << "unsupported feed type " << pt.dtype; - return false; - } - - PADDLE_ENFORCE_NOT_NULL( - input_ptr, - paddle::platform::errors::Fatal( - "Cannot convert to LoDTensor because LoDTensor creation failed.")); - PADDLE_ENFORCE_NOT_NULL( - pt.data.data(), - paddle::platform::errors::InvalidArgument( - "The data contained in the input PaddleTensor is illegal.")); - - if (platform::is_cpu_place(place)) { - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); - } else if (platform::is_ipu_place(place)) { -#ifdef PADDLE_WITH_IPU - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with WITH_IPU, should not reach here.")); -#endif - } else if (platform::is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), - false, - platform::errors::InvalidArgument( - "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = static_cast(pool.Get(place)); - auto dst_gpu_place = place; - memory::Copy(dst_gpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length(), - dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with CUDA, should not reach here.")); -#endif - } else if (platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - auto dst_xpu_place = place; - memory::Copy(dst_xpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with XPU, should not reach here.")); -#endif - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The analysis predictor supports CPU, GPU and XPU now.")); - } - return true; +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + return utils::ToDenseTensors(this->operator()(utils::ToTensors(inputs))); } } // namespace jit diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index f2d1c396617b12..93030c9138fa84 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -770,7 +770,11 @@ PyObject* ToPyObject(const std::vector>& value) { PyObject* ToPyObject(const std::vector& value, bool return_py_none_if_not_initialize) { + // NOTE(liuyuanle): I encountered a bug(access violation) in windows. 
ref to + // https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin + PyGILState_STATE gstate = PyGILState_Ensure(); PyObject* result = PyList_New((Py_ssize_t)value.size()); + PyGILState_Release(gstate); for (size_t i = 0; i < value.size(); i++) { if (!value[i].initialized() && return_py_none_if_not_initialize) { diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 44966f930d3f15..0546dd84b6882b 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -65,7 +65,7 @@ constexpr int NPY_UINT16_ = 4; // paddle::platform::float16 as numpy.float16. // Ref: https://github.com/pybind/pybind11/issues/1776 template <> -struct npy_format_descriptor { +struct npy_format_descriptor { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); return reinterpret_borrow(ptr); @@ -180,7 +180,7 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { dt = py::dtype::of(); break; case PaddleDType::FLOAT16: - dt = py::dtype::of(); + dt = py::dtype::of(); break; case PaddleDType::UINT8: dt = py::dtype::of(); @@ -264,7 +264,7 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::FLOAT16) { tensor.ShareExternalData( - static_cast(input_tensor.data()), + static_cast(input_tensor.data()), shape, ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::INT32) { @@ -353,7 +353,7 @@ size_t PaddleGetDTypeSize(PaddleDType dt) { size = sizeof(float); break; case PaddleDType::FLOAT16: - size = sizeof(paddle_infer::float16); + size = sizeof(phi::dtype::float16); break; case PaddleDType::INT8: size = sizeof(int8_t); @@ -392,8 +392,8 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT tensor.copy_to_cpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.copy_to_cpu( - static_cast(array.mutable_data())); + tensor.copy_to_cpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.copy_to_cpu(static_cast(array.mutable_data())); @@ -432,8 +432,8 @@ py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT tensor.CopyToCpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.CopyToCpu( - static_cast(array.mutable_data())); + tensor.CopyToCpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.CopyToCpu(static_cast(array.mutable_data())); @@ -1062,6 +1062,16 @@ void BindPaddleInferPredictor(py::module *m) { .def("get_output_names", &paddle_infer::Predictor::GetOutputNames) .def("get_input_handle", &paddle_infer::Predictor::GetInputHandle) .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) + .def( + "run", + [](paddle_infer::Predictor &self, py::handle py_in_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + std::vector outputs; + self.Run(in_tensor_list, &outputs); + return py::handle(ToPyObject(outputs)); + }, + py::arg("inputs")) .def("run", [](paddle_infer::Predictor &self) { self.Run(); }) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) @@ -1091,9 +1101,9 @@ void BindZeroCopyTensor(py::module *m) { .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) + 
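+      // (editor's note) py::array_t<T> defaults to forcecast, so a numeric
+      // array can convert to any of these chained overloads; pybind11 tries
+      // them in registration order, which is presumably why double has to
+      // come after the float/float16 bindings below.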
.def("copy_from_cpu", &ZeroCopyTensorCreate) // NOTE(liuyuanle): double must be bound after float. .def("copy_from_cpu", &ZeroCopyTensorCreate) - .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) @@ -1116,10 +1126,9 @@ void BindPaddleInferTensor(py::module *m) { .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) + .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) // NOTE(liuyuanle): double must be bound after float. .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) - .def("_copy_from_cpu_bind", - &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("_share_external_data_bind", &PaddleInferShareExternalData) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index d3943750fd21ef..24bcc63dbd278f 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -416,7 +416,7 @@ class PADDLE_API Tensor final { /** * @brief Return the name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @return const std::string& */ @@ -425,7 +425,7 @@ class PADDLE_API Tensor final { /** * @brief Set name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @param const std::string& name */ @@ -657,7 +657,7 @@ class PADDLE_API Tensor final { /** * Tensor name: used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. */ std::string name_{""}; diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 3cebef1588ea5a..6a409b64196239 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -136,6 +136,7 @@ Tensor add_n_impl(const std::vector& x) { Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); + out.set_name(x.name()); return out; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py new file mode 100644 index 00000000000000..99ba29956c5dad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
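
(Editor's note) A minimal usage sketch of the list-of-Tensor predictor.run() overload added by this patch; the model file paths are placeholders, and feed matching follows the SetFeed logic shown earlier:

import numpy as np
import paddle
from paddle.inference import Config, create_predictor

config = Config('inference.pdmodel', 'inference.pdiparams')  # placeholder paths
predictor = create_predictor(config)

x = paddle.to_tensor(np.random.rand(2, 4).astype('float32'))
# If every input carries a known name, feeds are matched by name;
# otherwise they are matched by position, as in SetFeed above.
x.name = predictor.get_input_names()[0]
outputs = predictor.run([x])  # list of paddle.Tensor in, list of paddle.Tensor out
print(outputs[0].shape)

The unit test that follows exercises both the by-name (disorder) and by-order (inorder) paths against the same two-input model.
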
+ +import os +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle.inference import Config, create_predictor + + +class TestNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + + def forward(self, x1, x2): + y1 = self.fc1(x1) + y2 = self.fc2(x2) + return y1 + y2 + + +@unittest.skipIf( + not paddle.is_compiled_with_cuda(), 'should compile with cuda.' +) +class TestPredictorRunWithTensor(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + net = TestNet() + model = paddle.jit.to_static( + net, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input0' + ), + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input1' + ), + ], + ) + paddle.jit.save( + model, + os.path.join( + self.temp_dir.name, 'test_predictor_run_model/inference' + ), + ) + + def tearDown(self): + self.temp_dir.cleanup() + + def init_predictor(self): + config = Config( + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdmodel', + ), + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdiparams', + ), + ) + config.enable_use_gpu(256, 0) + config.enable_memory_optim() + predictor = create_predictor(config) + return predictor + + def get_inputs(self): + input0 = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]).astype(np.float32) + input1 = np.array([[0.1, 0.2, 0.3, 0.4], [1.2, 1.3, 1.4, 1.5]]).astype( + np.float32 + ) + + input0_tensor = paddle.to_tensor(input0) + input1_tensor = paddle.to_tensor(input1) + + return [input0_tensor, input1_tensor] + + def get_disorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + input_names = predictor.get_input_names() + input0_tensor.name = input_names[0] + input1_tensor.name = input_names[1] + + # disorder + inputs = [input1_tensor, input0_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def get_inorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + # inorder + inputs = [input0_tensor, input1_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def test_output(self): + inorder_output = self.get_inorder_output() + disorder_output = self.get_disorder_output() + + assert np.allclose( + inorder_output.numpy().flatten(), disorder_output.numpy().flatten() + ) + + +if __name__ == '__main__': + unittest.main() From 745425778aac65c71a9af45e7982227e4fa84add Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Tue, 11 Apr 2023 17:08:54 +0800 Subject: [PATCH 069/156] remove -Wimplicit-fallthrough (#52717) * delete [-Wno-error=terminate], test=develop * remove GPUps[-Wterminate],test=develop * remove some -Wno-, test=develop * modify ~MatmulDescriptor * mess * remove -Wimplicit-fallthrough, test=develop * remove -Wimplicit-fallthrough, test=develop * remove -Wimplicit-fallthrough, test=develop * remove -Wimplicit-fallthrough, test=develop * remove , test=develop --- paddle/fluid/imperative/layout_transformer.h | 6 ++++++ .../api/composite_backward/composite_backward_api.h | 4 ++++ paddle/utils/string/tinyformat/tinyformat.h | 13 +++++++++++++ 3 files changed, 23 insertions(+) diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 4dba2d16d598c1..93c924a095c9ee 100644 --- 
a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -402,10 +402,16 @@ class ArgmaxOpTransformer case paddle::framework::proto::AttrType::INT: { auto axis = PADDLE_GET_CONST(int, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); +#ifdef LINUX + __attribute__((fallthrough)); +#endif } case paddle::framework::proto::AttrType::LONG: { auto axis = PADDLE_GET_CONST(int64_t, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); +#ifdef LINUX + __attribute__((fallthrough)); +#endif } default: VLOG(4) << "The data_type of axis is Error, axis must be int or " diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 286d3cae8de5df..6697f1a614c381 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1485,7 +1485,11 @@ void batch_norm_grad(const Tensor& x, } break; } +#ifdef LINUX + __attribute__((fallthrough)); +#endif } + default: PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", data_layout)); diff --git a/paddle/utils/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h index f9c55fe1835fd2..bd8d47849db966 100644 --- a/paddle/utils/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -691,6 +691,9 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'X': out.setf(std::ios::uppercase); +#ifdef LINUX + __attribute__((fallthrough)); +#endif case 'x': case 'p': out.setf(std::ios::hex, std::ios::basefield); @@ -698,17 +701,27 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'E': out.setf(std::ios::uppercase); +#ifdef LINUX + __attribute__((fallthrough)); +#endif case 'e': out.setf(std::ios::scientific, std::ios::floatfield); out.setf(std::ios::dec, std::ios::basefield); break; case 'F': out.setf(std::ios::uppercase); +#ifdef LINUX + __attribute__((fallthrough)); +#endif case 'f': out.setf(std::ios::fixed, std::ios::floatfield); break; case 'G': out.setf(std::ios::uppercase); +#ifdef LINUX + __attribute__((fallthrough)); +#endif + case 'g': out.setf(std::ios::dec, std::ios::basefield); // As in boost::format, let stream decide float format. From 8e9bfa7f38e3de7ae28030216248894aa285476b Mon Sep 17 00:00:00 2001 From: lzydev Date: Tue, 11 Apr 2023 17:59:31 +0800 Subject: [PATCH 070/156] autogen unique (#52738) --- .../generator/get_expected_kernel_func.cc | 18 ++ .../generator/get_expected_kernel_func.h | 4 + paddle/fluid/operators/unique_op.cc | 168 ------------------ paddle/phi/api/yaml/op_compat.yaml | 9 + paddle/phi/api/yaml/static_ops.yaml | 10 ++ paddle/phi/ops/compat/unique_sig.cc | 11 ++ 6 files changed, 52 insertions(+), 168 deletions(-) delete mode 100644 paddle/fluid/operators/unique_op.cc diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.cc b/paddle/fluid/operators/generator/get_expected_kernel_func.cc index a4b0e637e12dc7..ce2cbb43deed0e 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.cc +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.cc @@ -158,5 +158,23 @@ phi::KernelKey GetMatrixNmsExpectedKernelType( platform::CPUPlace()); } +phi::KernelKey GetUniqueExpectedKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel* op_ptr) { + (void)ctx; + // Return CPUPlace when Attr("is_sorted") is false. 
Because it means + // that fluid.layers.unique is called, but there is no cuda kernel. + if (!ctx.Attr("is_sorted")) { + return phi::KernelKey( + op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"), + platform::CPUPlace()); + } else { + // new version paddle.unique is called. + return phi::KernelKey( + op_ptr->OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace()); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/generator/get_expected_kernel_func.h b/paddle/fluid/operators/generator/get_expected_kernel_func.h index a83f5865e34995..cbbb74e2312ed3 100644 --- a/paddle/fluid/operators/generator/get_expected_kernel_func.h +++ b/paddle/fluid/operators/generator/get_expected_kernel_func.h @@ -44,5 +44,9 @@ phi::KernelKey GetMatrixNmsExpectedKernelType( const framework::ExecutionContext& ctx, const framework::OperatorWithKernel* op_ptr); +phi::KernelKey GetUniqueExpectedKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel* op_ptr); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/unique_op.cc b/paddle/fluid/operators/unique_op.cc deleted file mode 100644 index 5484a16ca6bd4d..00000000000000 --- a/paddle/fluid/operators/unique_op.cc +++ /dev/null @@ -1,168 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/unique_op.h" - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class UniqueOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "unique"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "unique"); - - bool return_index = ctx->Attrs().Get("return_index"); - bool return_inverse = ctx->Attrs().Get("return_inverse"); - bool return_counts = ctx->Attrs().Get("return_counts"); - auto axis_vec = ctx->Attrs().Get>("axis"); - auto data_type = - static_cast(static_cast( - ctx->Attrs().Get("dtype"))); - - // Construct MetaTensor for InferMeta Func - using CompatMetaTensor = framework::CompatMetaTensor; - CompatMetaTensor x(ctx->GetInputVarPtrs("X")[0], ctx->IsRuntime()); - CompatMetaTensor out(ctx->GetOutputVarPtrs("Out")[0], ctx->IsRuntime()); - std::unique_ptr indices(nullptr); - std::unique_ptr index(nullptr); - std::unique_ptr counts(nullptr); - - if (return_index) { - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "unique"); - indices = - std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Indices")[0], ctx->IsRuntime()))); - } - if (return_inverse) { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - index = std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); - } - if (return_counts) { - OP_INOUT_CHECK(ctx->HasOutput("Counts"), "Output", "Counts", "unique"); - counts = std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Counts")[0], ctx->IsRuntime()))); - } - bool is_sorted = ctx->Attrs().Get("is_sorted"); - if (is_sorted) { - phi::UniqueInferMeta(x, - return_index, - return_inverse, - return_counts, - axis_vec, - data_type, - &out, - indices.get(), - index.get(), - counts.get()); - } else { - OP_INOUT_CHECK(ctx->HasOutput("Index"), "Output", "Index", "unique"); - if (index == nullptr) { - index = - std::move(std::unique_ptr(new CompatMetaTensor( - ctx->GetOutputVarPtrs("Index")[0], ctx->IsRuntime()))); - } - phi::UniqueRawInferMeta(x, - return_index, - return_inverse, - return_counts, - axis_vec, - data_type, - is_sorted, - &out, - indices.get(), - index.get(), - counts.get()); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - // Return CPUPlace when Attr("is_sorted") is false. Because it means - // that fluid.layers.unique is called, but there is no cuda kernel. - if (!ctx.Attr("is_sorted")) { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - platform::CPUPlace()); - } else { - // new version paddle.unique is called. - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } - } -}; - -class UniqueOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "Input tensor. 
-             "Input tensor. It should be a 1-D tensor when Attr(is_sorted)"
-             " is false or a N-D tensor when Attr(is_sorted) is true.");
-    AddAttr<int>("dtype", "data type for output index");
-    AddOutput("Out", "A unique subsequence for input tensor.");
-    AddOutput("Index",
-              "Equivalent to inverse in numpy.unique, "
-              "the indices for where elements in the original input ended up "
-              "in the returned unique tensor.");
-    AddOutput(
-        "Indices",
-        "The indices of the input tensor that result in the unique tensor.")
-        .AsDispensable();
-    AddOutput("Counts", "The counts for each unique element.").AsDispensable();
-    AddAttr<bool>("return_index",
-                  "If True, also return the indices of the input"
-                  " tensor that result in the unique Tensor.")
-        .SetDefault(false);
-    AddAttr<bool>(
-        "return_inverse",
-        "If True, also return the indices for where elements"
-        " in the original input ended up in the returned unique tensor.")
-        .SetDefault(false);
-    AddAttr<bool>("return_counts",
-                  "If True, also return the counts for each unique element.")
-        .SetDefault(false);
-    AddAttr<std::vector<int>>(
-        "axis",
-        "The axis to apply unique. If None, the input will be flattened.")
-        .SetDefault({});
-    AddAttr<bool>("is_sorted",
-                  "If True, the unique elements of X are in ascending order."
-                  "Otherwise, the unique elements are not sorted.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-    1. Return a unique subsequence for 1-D input tensor, and an index tensor
-    pointing to this unique subsequence when Attr(is_sorted) is false. This
-    means paddle.unique is called.
-
-    2. Returns the unique elements of X in ascending order when Attr(is_sorted)
-    is true. This means fluid.layers.unique is called.
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(unique, ops::UniqueOp, ops::UniqueOpMaker);

diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 90c75a8dcc6cd7..e53909aa3fdee9 100644
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -2241,6 +2241,15 @@
     support_tensor : true
   manual_signature : [uniform]

+- op : unique
+  inputs :
+    {x : X}
+  outputs :
+    {out : Out, indices : Indices, inverse : Index, counts : Counts}
+  get_expected_kernel_type :
+    unique : GetUniqueExpectedKernelType
+  manual_signature : [unique]
+
 - op : unique_consecutive
   inputs :
     x : X

diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml
index f0f26e27c1f2c4..802c6b1d46df54 100644
--- a/paddle/phi/api/yaml/static_ops.yaml
+++ b/paddle/phi/api/yaml/static_ops.yaml
@@ -342,3 +342,13 @@
     func : uniform
     param: [shape, dtype, min, max, seed]
     data_type : dtype
+
+- op : unique
+  args : (Tensor x, bool return_index=false, bool return_inverse=false, bool return_counts=false, int[] axis={}, DataType dtype=DataType::INT64, bool is_sorted=false)
+  output : Tensor(out), Tensor(indices), Tensor(inverse), Tensor(counts)
+  optional : indices, counts
+  infer_meta :
+    func : UniqueRawInferMeta
+  kernel :
+    func : unique
+    data_type : x
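For reference, the Python-level behavior behind this input/output mapping, sketched with a small made-up input (`Indices` corresponds to `return_index`, `Index` to `return_inverse`, and `Counts` to `return_counts`):

    import paddle

    x = paddle.to_tensor([2, 3, 3, 1, 5, 3])
    out, indices, inverse, counts = paddle.unique(
        x, return_index=True, return_inverse=True, return_counts=True
    )
    # out:     [1, 2, 3, 5]         unique values, in ascending order
    # indices: [3, 0, 1, 4]         first occurrence of each unique value in `x`
    # inverse: [1, 2, 2, 0, 3, 2]   where each element of `x` lands in `out`
    # counts:  [1, 1, 3, 1]         number of occurrences of each unique value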
diff --git a/paddle/phi/ops/compat/unique_sig.cc b/paddle/phi/ops/compat/unique_sig.cc
index 2a7ba543012f3e..8a38775bc60802 100644
--- a/paddle/phi/ops/compat/unique_sig.cc
+++ b/paddle/phi/ops/compat/unique_sig.cc
@@ -17,6 +17,17 @@ limitations under the License. */
 namespace phi {

 KernelSignature UniqueOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsForInferShape()) {
+    return KernelSignature("unique_raw",
+                           {"X"},
+                           {"return_index",
+                            "return_inverse",
+                            "return_counts",
+                            "axis",
+                            "dtype",
+                            "is_sorted"},
+                           {"Out", "Indices", "Index", "Counts"});
+  }
   bool is_sorted = paddle::any_cast<bool>(ctx.Attr("is_sorted"));
   if (is_sorted) {
     return KernelSignature(

From 2a24a6bb895c53677df88d5c1197b5e7f441be55 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Tue, 11 Apr 2023 20:51:15 +0800
Subject: [PATCH 071/156] [CustomOP Unittest] Polish unit test, unify
 check_output (#52737)

* [CustomOP Unittest] Polish unit test, unify check_output

* fix test_static_save_and_run_inference_predictor
---
 test/custom_op/test_custom_conj.py            |  56 +++----
 test/custom_op/test_custom_inplace.py         | 154 ++++++++----------
 test/custom_op/test_custom_linear.py          |  31 +---
 test/custom_op/test_custom_optional.py        | 124 +++++---------
 test/custom_op/test_custom_relu_op_setup.py   |  66 ++------
 .../test_custom_relu_op_xpu_setup.py          |  69 +-------
 test/custom_op/test_custom_simple_slice.py    |  10 +-
 .../custom_op/test_custom_tanh_double_grad.py |  34 +---
 test/custom_op/test_custom_tensor_operator.py |  55 ++++---
 test/custom_op/test_multi_out_jit.py          |  31 ++--
 test/custom_op/utils.py                       |  42 +++++
 11 files changed, 255 insertions(+), 417 deletions(-)

diff --git a/test/custom_op/test_custom_conj.py b/test/custom_op/test_custom_conj.py
index c30463bc348690..f51038ae1b34c8 100644
--- a/test/custom_op/test_custom_conj.py
+++ b/test/custom_op/test_custom_conj.py
@@ -16,7 +16,7 @@ import unittest

 import numpy as np
-from utils import extra_cc_args, extra_nvcc_args, paddle_includes
+from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes

 import paddle
 from paddle import static
@@ -100,42 +100,27 @@ def setUp(self):
         self.dtypes = ['float32', 'float64']
         self.shape = [2, 20, 2, 3]

-    def check_output(self, out, pd_out, name):
-        np.testing.assert_array_equal(
-            out,
-            pd_out,
-            err_msg='custom op {}: {},\n paddle api {}: {}'.format(
-                name, out, name, pd_out
-            ),
-        )
-
-    def run_dynamic(self, dtype, np_input):
-        out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input)
-        pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input)
-
-        self.check_output(out, pd_out, "out")
-        self.check_output(x_grad, pd_x_grad, "x's grad")
-
-    def run_static(self, dtype, np_input):
-        out, x_grad = conj_static(
-            custom_ops.custom_conj, self.shape, dtype, np_input
-        )
-        pd_out, pd_x_grad = conj_static(
-            paddle.conj, self.shape, dtype, np_input
-        )
-
-        self.check_output(out, pd_out, "out")
-        self.check_output(x_grad, pd_x_grad, "x's grad")
-
     def test_dynamic(self):
         for dtype in self.dtypes:
             np_input = np.random.random(self.shape).astype(dtype)
-            self.run_dynamic(dtype, np_input)
+            out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input)
+            pd_out, pd_x_grad = conj_dynamic(paddle.conj, dtype, np_input)
+
+            check_output(out, pd_out, "out")
+            check_output(x_grad, pd_x_grad, "x's grad")

     def test_static(self):
         for dtype in self.dtypes:
             np_input = np.random.random(self.shape).astype(dtype)
-            self.run_static(dtype, np_input)
+            out, x_grad = conj_static(
+                custom_ops.custom_conj, self.shape, dtype, np_input
+            )
+            pd_out, pd_x_grad = conj_static(
+                paddle.conj, self.shape, dtype, np_input
+            )
+
+            check_output(out, pd_out, "out")
+            check_output(x_grad, pd_x_grad, "x's grad")

     # complex only used in dynamic mode now
     def test_complex_dynamic(self):
@@ -143,7 +128,16 @@ def 
test_complex_dynamic(self): np_input = np.random.random(self.shape).astype( dtype ) + 1j * np.random.random(self.shape).astype(dtype) - self.run_dynamic(to_complex(dtype), np_input) + + out, x_grad = conj_dynamic( + custom_ops.custom_conj, to_complex(dtype), np_input + ) + pd_out, pd_x_grad = conj_dynamic( + paddle.conj, to_complex(dtype), np_input + ) + + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x's grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_inplace.py b/test/custom_op/test_custom_inplace.py index 2c0a5d4c513c18..f5eed712cdcf9c 100644 --- a/test/custom_op/test_custom_inplace.py +++ b/test/custom_op/test_custom_inplace.py @@ -16,7 +16,13 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import ( + check_output, + check_output_allclose, + extra_cc_args, + extra_nvcc_args, + paddle_includes, +) import paddle from paddle import static @@ -342,26 +348,6 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - - def check_output_allclose(self, out, pd_out, name): - np.testing.assert_allclose( - out, - pd_out, - rtol=5e-5, - atol=1e-2, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -391,15 +377,15 @@ def test_static_add(self): self.np_x, self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output( + check_output(custom_x, custom_out, "inplace_custom_x") + check_output( custom_x_grad, custom_out_grad, "inplace_custom_x_grad" ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_out_grad, pd_out_grad, "out_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add(self): for device in self.devices: @@ -431,14 +417,14 @@ def test_dynamic_add(self): self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output(pd_x, pd_out, "inplace_pd_x") + check_output(custom_x, custom_out, "inplace_custom_x") + check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_add_vector(self): for device in self.devices: @@ -468,10 +454,10 @@ def test_static_add_vector(self): self.np_y, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_out_grad, pd_out_grad, "out_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + 
check_output(custom_out_grad, pd_out_grad, "out_grad") def test_dynamic_add_vector(self): for device in self.devices: @@ -503,14 +489,14 @@ def test_dynamic_add_vector(self): self.np_y, ) - self.check_output(custom_x, custom_out, "inplace_custom_x") - self.check_output(pd_x, pd_out, "inplace_pd_x") + check_output(custom_x, custom_out, "inplace_custom_x") + check_output(pd_x, pd_out, "inplace_pd_x") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_relu_net(self): for device in self.devices: @@ -543,11 +529,11 @@ def test_static_relu_net(self): self.np_y, self.np_z, ) - self.check_output_allclose(custom_x, pd_x, "x") - self.check_output_allclose(custom_y, pd_y, "y") - self.check_output_allclose(custom_out, pd_out, "out") - self.check_output_allclose(custom_x_grad, pd_x_grad, "x_grad") - self.check_output_allclose(custom_y_grad, pd_y_grad, "y_grad") + check_output_allclose(custom_x, pd_x, "x") + check_output_allclose(custom_y, pd_y, "y") + check_output_allclose(custom_out, pd_out, "out") + check_output_allclose(custom_x_grad, pd_x_grad, "x_grad") + check_output_allclose(custom_y_grad, pd_y_grad, "y_grad") def test_dynamic_relu_net(self): for device in self.devices: @@ -581,11 +567,11 @@ def test_dynamic_relu_net(self): self.np_z, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_static_multi_inplace(self): for device in self.devices: @@ -630,27 +616,23 @@ def test_static_multi_inplace(self): self.np_a, self.np_b, ) - self.check_output(custom_x, pd_out_xy, "inplace_custom_x") - self.check_output( + check_output(custom_x, pd_out_xy, "inplace_custom_x") + check_output( custom_x_grad, custom_out_xy_grad, "inplace_custom_x_grad" ) - self.check_output(custom_a, pd_out_ab, "inplace_custom_a") - self.check_output( + check_output(custom_a, pd_out_ab, "inplace_custom_a") + check_output( custom_a_grad, custom_out_ab_grad, "inplace_custom_a_grad" ) - self.check_output(custom_out_xy, pd_out_xy, "outxy") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output( - custom_out_xy_grad, pd_out_xy_grad, "outxy_grad" - ) - self.check_output(custom_out_ab, pd_out_ab, "outab") - self.check_output(custom_a_grad, pd_a_grad, "a_grad") - self.check_output(custom_b_grad, pd_b_grad, "b_grad") - self.check_output( - custom_out_ab_grad, pd_out_ab_grad, "outab_grad" - ) + check_output(custom_out_xy, pd_out_xy, "outxy") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out_xy_grad, pd_out_xy_grad, "outxy_grad") + check_output(custom_out_ab, pd_out_ab, "outab") + check_output(custom_a_grad, pd_a_grad, "a_grad") + check_output(custom_b_grad, pd_b_grad, 
"b_grad") + check_output(custom_out_ab_grad, pd_out_ab_grad, "outab_grad") def test_dynamic_multi_inplace(self): for device in self.devices: @@ -696,21 +678,21 @@ def test_dynamic_multi_inplace(self): self.np_b, ) - self.check_output(custom_x, custom_out_xy, "inplace_custom_x") - self.check_output(pd_x, pd_out_xy, "inplace_pd_x") - self.check_output(custom_a, custom_out_ab, "inplace_custom_a") - self.check_output(pd_a, pd_out_ab, "inplace_pd_a") - - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_out_xy, pd_out_xy, "outxy") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") - self.check_output(custom_a, pd_a, "a") - self.check_output(custom_b, pd_b, "b") - self.check_output(custom_out_ab, pd_out_ab, "outab") - self.check_output(custom_a_grad, pd_a_grad, "a_grad") - self.check_output(custom_b_grad, pd_b_grad, "b_grad") + check_output(custom_x, custom_out_xy, "inplace_custom_x") + check_output(pd_x, pd_out_xy, "inplace_pd_x") + check_output(custom_a, custom_out_ab, "inplace_custom_a") + check_output(pd_a, pd_out_ab, "inplace_pd_a") + + check_output(custom_x, pd_x, "x") + check_output(custom_y, pd_y, "y") + check_output(custom_out_xy, pd_out_xy, "outxy") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_a, pd_a, "a") + check_output(custom_b, pd_b, "b") + check_output(custom_out_ab, pd_out_ab, "outab") + check_output(custom_a_grad, pd_a_grad, "a_grad") + check_output(custom_b_grad, pd_b_grad, "b_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_linear.py b/test/custom_op/test_custom_linear.py index 5cd4b5e14f7dd5..60a881bdb6a0cf 100644 --- a/test/custom_op/test_custom_linear.py +++ b/test/custom_op/test_custom_linear.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle import paddle.nn.functional as F @@ -99,15 +99,6 @@ def setUp(self): self.np_weight = np.full([2, 4], fill_value=0.5, dtype="float32") self.np_bias = np.ones([4], dtype="float32") - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_static(self): for device in self.devices: for dtype in self.dtypes: @@ -132,12 +123,10 @@ def test_static(self): self.np_weight, self.np_bias, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output( - custom_weight_grad, pd_weight_grad, "weight_grad" - ) - self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_weight_grad, pd_weight_grad, "weight_grad") + check_output(custom_bias_grad, pd_bias_grad, "bias_grad") def test_dynamic(self): for device in self.devices: @@ -168,12 +157,10 @@ def test_dynamic(self): self.np_weight, self.np_bias, ) - self.check_output(custom_out, pd_out, "custom_out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output( - custom_weight_grad, pd_weight_grad, "weight_grad" - ) - self.check_output(custom_bias_grad, pd_bias_grad, "bias_grad") + check_output(custom_out, pd_out, "custom_out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + 
check_output(custom_weight_grad, pd_weight_grad, "weight_grad") + check_output(custom_bias_grad, pd_bias_grad, "bias_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_optional.py b/test/custom_op/test_custom_optional.py index 53d4f159527407..1c1335b37bd98f 100644 --- a/test/custom_op/test_custom_optional.py +++ b/test/custom_op/test_custom_optional.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle from paddle import static @@ -465,44 +465,6 @@ def setUp(self): np.random.random((3, 2)).astype("float32"), ] - def check_output(self, out, pd_out, name): - if out is None and pd_out is None: - return - assert out is not None, "out value of " + name + " is None" - assert pd_out is not None, "pd_out value of " + name + " is None" - if isinstance(out, list) and isinstance(pd_out, list): - for idx in range(len(out)): - np.testing.assert_array_equal( - out[idx], - pd_out[idx], - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out[idx], name, pd_out[idx] - ), - ) - else: - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - - def check_output_allclose(self, out, pd_out, name): - if out is None and pd_out is None: - return - assert out is not None, "out value of " + name + " is None" - assert pd_out is not None, "pd_out value of " + name + " is None" - np.testing.assert_allclose( - out, - pd_out, - rtol=5e-5, - atol=1e-2, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def test_optional_static_add(self): for device in self.devices: for dtype in self.dtypes: @@ -526,9 +488,9 @@ def test_optional_static_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_dynamic_add(self): for device in self.devices: @@ -553,9 +515,9 @@ def test_optional_dynamic_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_inplace_static_add(self): for device in self.devices: @@ -576,13 +538,11 @@ def test_optional_inplace_static_add(self): np_y, ) - self.check_output(custom_tuple[0], pd_tuple[0], "x") - self.check_output(custom_tuple[1], pd_tuple[1], "out") - self.check_output(custom_tuple[2], pd_tuple[2], "x_grad") + check_output(custom_tuple[0], pd_tuple[0], "x") + check_output(custom_tuple[1], pd_tuple[1], "out") + check_output(custom_tuple[2], pd_tuple[2], "x_grad") if len(custom_tuple) > 3: - self.check_output( - custom_tuple[3], pd_tuple[3], "y_grad" - ) + check_output(custom_tuple[3], pd_tuple[3], "y_grad") def test_optional_inplace_dynamic_add(self): for device in self.devices: @@ -619,16 +579,16 @@ def test_optional_inplace_dynamic_add(self): np_y, ) - self.check_output(pd_y, pd_outy, "inplace_pd_y") - self.check_output(custom_y, custom_outy, "inplace_custom_y") + check_output(pd_y, pd_outy, "inplace_pd_y") + check_output(custom_y, custom_outy, "inplace_custom_y") - 
self.check_output(custom_x, pd_x, "x") - self.check_output(custom_outx, pd_outx, "outx") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_outy, pd_outy, "outy") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_outx, pd_outx, "outx") + check_output(custom_y, pd_y, "y") + check_output(custom_outy, pd_outy, "outy") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") def test_optional_vector_static_add(self): for device in self.devices: @@ -653,9 +613,9 @@ def test_optional_vector_static_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_vector_dynamic_add(self): for device in self.devices: @@ -680,9 +640,9 @@ def test_optional_vector_dynamic_add(self): np_y, ) - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") def test_optional_inplace_vector_static_add(self): for device in self.devices: @@ -703,16 +663,12 @@ def test_optional_inplace_vector_static_add(self): np_y, ) - self.check_output(custom_tuple[0], pd_tuple[0], "x") - self.check_output(custom_tuple[1], pd_tuple[1], "out") - self.check_output(custom_tuple[2], pd_tuple[2], "x_grad") + check_output(custom_tuple[0], pd_tuple[0], "x") + check_output(custom_tuple[1], pd_tuple[1], "out") + check_output(custom_tuple[2], pd_tuple[2], "x_grad") if len(custom_tuple) > 3: - self.check_output( - custom_tuple[3], pd_tuple[3], "y1_grad" - ) - self.check_output( - custom_tuple[4], pd_tuple[4], "y2_grad" - ) + check_output(custom_tuple[3], pd_tuple[3], "y1_grad") + check_output(custom_tuple[4], pd_tuple[4], "y2_grad") def test_optional_inplace_vector_dynamic_add(self): for device in self.devices: @@ -749,16 +705,16 @@ def test_optional_inplace_vector_dynamic_add(self): np_y, ) - self.check_output(pd_y, pd_outy, "inplace_pd_y") - self.check_output(custom_y, custom_outy, "inplace_custom_y") + check_output(pd_y, pd_outy, "inplace_pd_y") + check_output(custom_y, custom_outy, "inplace_custom_y") - self.check_output(custom_x, pd_x, "x") - self.check_output(custom_outx, pd_outx, "outx") - self.check_output(custom_y, pd_y, "y") - self.check_output(custom_outy, pd_outy, "outy") - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_x_grad, pd_x_grad, "x_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_x, pd_x, "x") + check_output(custom_outx, pd_outx, "outx") + check_output(custom_y, pd_y, "y") + check_output(custom_outy, pd_outy, "outy") + check_output(custom_out, pd_out, "out") + check_output(custom_x_grad, pd_x_grad, "x_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") if __name__ == "__main__": diff --git a/test/custom_op/test_custom_relu_op_setup.py b/test/custom_op/test_custom_relu_op_setup.py index 8a164b0472933e..8673a806313fef 100644 --- a/test/custom_op/test_custom_relu_op_setup.py +++ b/test/custom_op/test_custom_relu_op_setup.py @@ -18,6 +18,7 
@@ import unittest import numpy as np +from utils import check_output, check_output_allclose import paddle from paddle import static @@ -205,13 +206,7 @@ def test_static(self): pd_out = custom_relu_static( custom_op, device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") def test_dynamic(self): for device in self.devices: @@ -226,20 +221,8 @@ def test_dynamic(self): pd_out, pd_x_grad = custom_relu_dynamic( custom_op, device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x_grad") def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -263,13 +246,7 @@ def test_static_save_and_load_inference_model(self): feed={feed_target_names[0]: np_data}, fetch_list=fetch_targets, ) - np.testing.assert_array_equal( - predict, - predict_infer, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) + check_output(predict, predict_infer, "predict") paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -298,12 +275,9 @@ def test_static_save_and_run_inference_predictor(self): predictor.get_output_names()[0] ) predict_infer = output_tensor.copy_to_cpu() - self.assertTrue( - np.isclose(predict, predict_infer, rtol=5e-5).any(), - "custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer - ), - ) + predict = np.array(predict).flatten() + predict_infer = np.array(predict_infer).flatten() + check_output_allclose(predict, predict_infer, "predict") paddle.disable_static() def test_double_grad_dynamic(self): @@ -318,20 +292,8 @@ def test_double_grad_dynamic(self): pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( self.custom_ops[0], device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) + check_output(out, pd_out, "out") + check_output(dx_grad, pd_dx_grad, "dx_grad") def test_with_dataloader(self): for device in self.devices: @@ -355,13 +317,7 @@ def test_with_dataloader(self): image = paddle.to_tensor(image) out = self.custom_ops[0](image) pd_out = paddle.nn.functional.relu(image) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") if batch_id == 5: break diff --git a/test/custom_op/test_custom_relu_op_xpu_setup.py b/test/custom_op/test_custom_relu_op_xpu_setup.py index 3eed65668ebc8d..e054eadafd03ae 100644 --- a/test/custom_op/test_custom_relu_op_xpu_setup.py +++ b/test/custom_op/test_custom_relu_op_xpu_setup.py @@ -18,6 +18,7 @@ import unittest import numpy as np +from utils import check_output, check_output_allclose import paddle from paddle import static @@ -183,13 +184,7 @@ def test_static(self): pd_out = custom_relu_static( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n 
paddle api out: {}'.format( - out, pd_out - ), - ) + check_output(out, pd_out, "out") def test_dynamic(self): for dtype in self.dtypes: @@ -200,20 +195,8 @@ def test_dynamic(self): pd_out, pd_x_grad = custom_relu_dynamic( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - x_grad, - pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( - x_grad, pd_x_grad - ), - ) + check_output(out, pd_out, "out") + check_output(x_grad, pd_x_grad, "x_grad") def test_static_save_and_load_inference_model(self): paddle.enable_static() @@ -237,14 +220,7 @@ def test_static_save_and_load_inference_model(self): feed={feed_target_names[0]: np_data}, fetch_list=fetch_targets, ) - np.testing.assert_allclose( - predict, - predict_infer, - atol=1e-2, - err_msg='custom op predict: {},\n custom op infer predict: {}'.format( - predict, predict_infer - ), - ) + check_output(predict, predict_infer, "predict") paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -272,15 +248,7 @@ def test_static_save_and_run_inference_predictor(self): predict_infer = output_tensor.copy_to_cpu() predict = np.array(predict).flatten() predict_infer = np.array(predict_infer).flatten() - np.testing.assert_allclose( - predict, - predict_infer, - rtol=5e-5, - atol=1e-2, - err_msg="custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer - ), - ) + check_output_allclose(predict, predict_infer, "predict") paddle.disable_static() def test_func_double_grad_dynamic(self): @@ -292,20 +260,8 @@ def test_func_double_grad_dynamic(self): pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( self.custom_op, self.device, dtype, x, False ) - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_array_equal( - dx_grad, - pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) + check_output(out, pd_out, "out") + check_output(dx_grad, pd_dx_grad, "dx_grad") def test_with_dataloader(self): paddle.disable_static() @@ -328,14 +284,7 @@ def test_with_dataloader(self): for batch_id, (image, _) in enumerate(train_loader()): out = self.custom_op(image) pd_out = paddle.nn.functional.relu(image) - np.testing.assert_allclose( - out, - pd_out, - atol=1e-2, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) + check_output_allclose(out, pd_out, "out", atol=1e-2) if batch_id == 5: break diff --git a/test/custom_op/test_custom_simple_slice.py b/test/custom_op/test_custom_simple_slice.py index d69322103520c2..e2662e70f3bc6f 100644 --- a/test/custom_op/test_custom_simple_slice.py +++ b/test/custom_op/test_custom_simple_slice.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import check_output, extra_cc_args, extra_nvcc_args, paddle_includes import paddle from paddle.utils.cpp_extension import get_build_directory, load @@ -47,13 +47,7 @@ def test_slice_output(self): x = paddle.to_tensor(np_x) custom_op_out = custom_ops.custom_simple_slice(x, 2, 3) np_out = np_x[2:3] - np.testing.assert_array_equal( - custom_op_out, - np_out, - err_msg='custom op: {},\n numpy: {}'.format( - np_out, custom_op_out.numpy() - ), - ) + check_output(custom_op_out, np_out, "out") if __name__ == 
"__main__": diff --git a/test/custom_op/test_custom_tanh_double_grad.py b/test/custom_op/test_custom_tanh_double_grad.py index 08c57dac91fe17..a47ce712dd6a4c 100644 --- a/test/custom_op/test_custom_tanh_double_grad.py +++ b/test/custom_op/test_custom_tanh_double_grad.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from utils import extra_cc_args, extra_nvcc_args, paddle_includes +from utils import ( + check_output_allclose, + extra_cc_args, + extra_nvcc_args, + paddle_includes, +) import paddle from paddle.utils.cpp_extension import get_build_directory, load @@ -77,30 +82,9 @@ def test_double_grad_dynamic(self): pd_out, pd_dx_grad, pd_dout = custom_tanh_double_grad_dynamic( paddle.tanh, device, dtype, x ) - np.testing.assert_allclose( - out, - pd_out, - rtol=1e-05, - err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out - ), - ) - np.testing.assert_allclose( - dx_grad, - pd_dx_grad, - rtol=1e-05, - err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( - dx_grad, pd_dx_grad - ), - ) - np.testing.assert_allclose( - dout, - pd_dout, - rtol=1e-05, - err_msg='custom op out grad: {},\n paddle api out grad: {}'.format( - dout, pd_dout - ), - ) + check_output_allclose(out, pd_out, "out", rtol=1e-05) + check_output_allclose(dx_grad, pd_dx_grad, "out", rtol=1e-05) + check_output_allclose(dout, pd_dout, "dout", rtol=1e-05) if __name__ == "__main__": diff --git a/test/custom_op/test_custom_tensor_operator.py b/test/custom_op/test_custom_tensor_operator.py index 4e524b2f5b16bc..f6edbd934171d3 100644 --- a/test/custom_op/test_custom_tensor_operator.py +++ b/test/custom_op/test_custom_tensor_operator.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from utils import extra_cc_args, paddle_includes +from utils import ( + check_output, + check_output_allclose, + extra_cc_args, + paddle_includes, +) import paddle from paddle import static @@ -260,7 +265,7 @@ def _test_static(self): pd_out = test_custom_add_static( self.add, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_subtract_static( self.subtract, device, dtype, x @@ -268,7 +273,7 @@ def _test_static(self): pd_out = test_custom_subtract_static( self.subtract, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_multiply_static( self.multiply, device, dtype, x @@ -276,13 +281,13 @@ def _test_static(self): pd_out = test_custom_multiply_static( self.multiply, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) out = test_custom_divide_static(self.divide, device, dtype, x) pd_out = test_custom_divide_static( self.divide, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) def _test_dynamic(self): for device in self.devices: @@ -297,9 +302,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_add_dynamic( self.add, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) + check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = 
test_custom_subtract_dynamic( @@ -308,9 +313,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_subtract_dynamic( self.subtract, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) + check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = test_custom_multiply_dynamic( @@ -319,9 +324,9 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_multiply_dynamic( self.multiply, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) - np.testing.assert_allclose( - x_grad, pd_x_grad, rtol=1e-5, atol=1e-8 + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) + check_output_allclose( + x_grad, pd_x_grad, "x_grad", rtol=1e-5, atol=1e-8 ) out, x_grad = test_custom_divide_dynamic( @@ -330,7 +335,7 @@ def _test_dynamic(self): pd_out, pd_x_grad = test_custom_divide_dynamic( self.divide, device, dtype, x, False ) - np.testing.assert_allclose(out, pd_out, rtol=1e-5, atol=1e-8) + check_output_allclose(out, pd_out, "out", rtol=1e-5, atol=1e-8) def _test_logical_operants(self): for device in self.devices: @@ -342,19 +347,19 @@ def _test_logical_operants(self): out = self.custom_module.custom_logical_and(x, y) pd_out = paddle.bitwise_and(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_or(x, y) pd_out = paddle.bitwise_or(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_xor(x, y) pd_out = paddle.bitwise_xor(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_logical_not(x) pd_out = paddle.bitwise_not(x) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") def _test_compare_operants(self): for device in self.devices: @@ -366,27 +371,27 @@ def _test_compare_operants(self): out = self.custom_module.custom_less_than(x, y) pd_out = paddle.less_than(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_less_equal(x, y) pd_out = paddle.less_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_equal(x, y) pd_out = paddle.equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_not_equal(x, y) pd_out = paddle.not_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_greater_than(x, y) pd_out = paddle.greater_than(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") out = self.custom_module.custom_greater_equal(x, y) pd_out = paddle.greater_equal(x, y) - np.testing.assert_equal(out.numpy(), pd_out.numpy()) + check_output(out.numpy(), pd_out.numpy(), "out") if __name__ == '__main__': diff --git a/test/custom_op/test_multi_out_jit.py b/test/custom_op/test_multi_out_jit.py index f3e3a6ec8abc13..a191ab33e6a7cb 100644 --- a/test/custom_op/test_multi_out_jit.py +++ 
b/test/custom_op/test_multi_out_jit.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from utils import extra_cc_args, paddle_includes +from utils import check_output, extra_cc_args, paddle_includes import paddle from paddle import static @@ -105,15 +105,6 @@ def setUp(self): self.np_y = np.random.uniform(-1, 1, [4, 8]).astype("float32") self.np_z = np.random.uniform(-1, 1, [4, 8]).astype("float32") - def check_output(self, out, pd_out, name): - np.testing.assert_array_equal( - out, - pd_out, - err_msg='custom op {}: {},\n paddle api {}: {}'.format( - name, out, name, pd_out - ), - ) - def run_static(self, device, dtype): paddle.set_device(device) x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -140,14 +131,12 @@ def check_multi_outputs(self, outs, is_dynamic=False): one_int32 = one_int32.numpy() # Fake_float64 self.assertTrue('float64' in str(zero_float64.dtype)) - np.testing.assert_array_equal( - zero_float64, np.zeros([4, 8]).astype('float64') + check_output( + zero_float64, np.zeros([4, 8]).astype('float64'), "zero_float64" ) # ZFake_int32 self.assertTrue('int32' in str(one_int32.dtype)) - np.testing.assert_array_equal( - one_int32, np.ones([4, 8]).astype('int32') - ) + check_output(one_int32, np.ones([4, 8]).astype('int32'), "one_int32") def test_multi_out_static(self): paddle.enable_static() @@ -193,10 +182,10 @@ def test_discrete_out_static(self): self.np_y, self.np_z, ) - self.check_output(custom_out, pd_out, "out") + check_output(custom_out, pd_out, "out") # NOTE: In static mode, the output gradient of custom operator has been optimized to shape=[1]. However, native paddle op's output shape = [4, 8], hence we need to fetch pd_w_grad[0][0] (By the way, something wrong with native paddle's gradient, the outputs with other indexes instead of pd_w_grad[0][0] is undefined in this unittest.) 
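The two helpers factored into `utils.py` below mirror the two comparison modes used throughout these tests: exact equality for regular kernel outputs, and a tolerance-based check for inference-predictor outputs. A minimal NumPy illustration of the difference:

    import numpy as np

    a = np.float32([1.0, 2.0])
    b = a + np.float32(1e-6)  # tiny drift, e.g. from a different execution path

    # The tolerance-based check (defaults used below: rtol=5e-5, atol=1e-2) passes ...
    np.testing.assert_allclose(a, b, rtol=5e-5, atol=1e-2)
    # ... while the exact check would raise an AssertionError:
    # np.testing.assert_array_equal(a, b)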
- self.check_output(custom_w_grad, pd_w_grad[0][0], "w_grad") - self.check_output(custom_y_grad, pd_y_grad[0][0], "y_grad") + check_output(custom_w_grad, pd_w_grad[0][0], "w_grad") + check_output(custom_y_grad, pd_y_grad[0][0], "y_grad") def test_discrete_out_dynamic(self): for device in self.devices: @@ -223,9 +212,9 @@ def test_discrete_out_dynamic(self): self.np_y, self.np_z, ) - self.check_output(custom_out, pd_out, "out") - self.check_output(custom_w_grad, pd_w_grad, "w_grad") - self.check_output(custom_y_grad, pd_y_grad, "y_grad") + check_output(custom_out, pd_out, "out") + check_output(custom_w_grad, pd_w_grad, "w_grad") + check_output(custom_y_grad, pd_y_grad, "y_grad") if __name__ == '__main__': diff --git a/test/custom_op/utils.py b/test/custom_op/utils.py index 7e199f3a6114d1..d65a0f2175f6ee 100644 --- a/test/custom_op/utils.py +++ b/test/custom_op/utils.py @@ -16,6 +16,8 @@ import sys from site import getsitepackages +import numpy as np + from paddle.utils.cpp_extension.extension_utils import IS_WINDOWS IS_MAC = sys.platform.startswith('darwin') @@ -39,3 +41,43 @@ extra_cc_args = ['-w', '-g'] if not IS_WINDOWS else ['/w'] extra_nvcc_args = ['-O3'] extra_compile_args = {'cc': extra_cc_args, 'nvcc': extra_nvcc_args} + + +def check_output(out, pd_out, name): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not None, "pd_out value of " + name + " is None" + if isinstance(out, list) and isinstance(pd_out, list): + for idx in range(len(out)): + np.testing.assert_array_equal( + out[idx], + pd_out[idx], + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out[idx], name, pd_out[idx] + ), + ) + else: + np.testing.assert_array_equal( + out, + pd_out, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) + + +def check_output_allclose(out, pd_out, name, rtol=5e-5, atol=1e-2): + if out is None and pd_out is None: + return + assert out is not None, "out value of " + name + " is None" + assert pd_out is not None, "pd_out value of " + name + " is None" + np.testing.assert_allclose( + out, + pd_out, + rtol, + atol, + err_msg='custom op {}: {},\n paddle api {}: {}'.format( + name, out, name, pd_out + ), + ) From 7a78a57143b0f66a9a020f961cbb99b93059882a Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Wed, 12 Apr 2023 09:58:11 +0800 Subject: [PATCH 072/156] fix force sync bug in paddle.grad (#52779) --- paddle/fluid/eager/backward.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index a220fe18fb35d3..2216b6b01427ee 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -113,7 +113,6 @@ std::vector RunBackward( std::queue force_sequential_nodes_forward_queue = egr::Controller::Instance().GetForceSequentialNodes(); - egr::Controller::Instance().ClearForceSequentialNodes(); std::deque force_sequential_nodes_queue; std::set force_sequential_nodes_set; std::set ready_force_sequential_nodes; @@ -421,6 +420,7 @@ void Backward(const std::vector& tensors, // outputs VLOG(3) << "Run in Backward"; paddle::platform::RecordEvent backward_record_event( "backward", paddle::platform::TracerEventType::UserDefined, 1); + egr::Controller::Instance().ClearForceSequentialNodes(); RunBackward(tensors, grad_tensors, retain_graph); phi::autotune::AutoTuneStatus::Instance().Update(); } From f05c870b30ebb8c1470e05e64325fc29882db6b7 Mon Sep 17 00:00:00 2001 From: megemini Date: 
Wed, 12 Apr 2023 10:04:44 +0800
Subject: [PATCH 073/156] [Hackathon 4th No.13] Add the Bernoulli API for
 Paddle (#52244)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* [Hackathon 4th No.13] Add the Bernoulli API for Paddle

* [Change] change unittest_py scipy version

* [Change] Modify the type parameters of BernoulliNumpy; optimize the static-graph test workflow

* [Change] Optimize the class initialization and logic; add 0-D related test cases
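In short, this patch adds the `Bernoulli` distribution class, exports it from `paddle.distribution`, and registers a Bernoulli-to-Bernoulli case in the KL registry. A minimal usage sketch (values indicative only; the docstrings below carry the authoritative examples):

    import paddle
    from paddle.distribution import Bernoulli, kl_divergence

    rv_a = Bernoulli(paddle.to_tensor(0.3))
    rv_b = Bernoulli(paddle.to_tensor(0.7))

    samples = rv_a.sample([100])                    # tensor of 0.0 / 1.0 draws
    log_p = rv_a.log_prob(paddle.to_tensor([1.0]))  # log(0.3) ~= -1.204
    kl = kl_divergence(rv_a, rv_b)                  # dispatched to _kl_bernoulli_bernoulli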
---
 python/paddle/distribution/__init__.py        |   2 +
 python/paddle/distribution/bernoulli.py       | 485 ++++++++++++++
 python/paddle/distribution/kl.py              |   6 +
 .../test_distribution_bernoulli.py            | 596 ++++++++++++++++++
 .../test_distribution_bernoulli_static.py     | 468 ++++++++++++++
 5 files changed, 1557 insertions(+)
 create mode 100644 python/paddle/distribution/bernoulli.py
 create mode 100644 python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py
 create mode 100644 python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py

diff --git a/python/paddle/distribution/__init__.py b/python/paddle/distribution/__init__.py
index 77b83fa6a94c54..418ef478aaf139 100644
--- a/python/paddle/distribution/__init__.py
+++ b/python/paddle/distribution/__init__.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 from paddle.distribution import transform
+from paddle.distribution.bernoulli import Bernoulli
 from paddle.distribution.beta import Beta
 from paddle.distribution.categorical import Categorical
 from paddle.distribution.dirichlet import Dirichlet
@@ -30,6 +31,7 @@
 from paddle.distribution.laplace import Laplace

 __all__ = [  # noqa
+    'Bernoulli',
     'Beta',
     'Categorical',
     'Dirichlet',

diff --git a/python/paddle/distribution/bernoulli.py b/python/paddle/distribution/bernoulli.py
new file mode 100644
index 00000000000000..d6c6551b0c5ced
--- /dev/null
+++ b/python/paddle/distribution/bernoulli.py
@@ -0,0 +1,485 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+
+import paddle
+from paddle.distribution import exponential_family
+from paddle.fluid.data_feeder import check_type, convert_dtype
+from paddle.fluid.framework import _non_static_mode
+from paddle.fluid.layers import tensor
+from paddle.nn.functional import (
+    binary_cross_entropy_with_logits,
+    sigmoid,
+    softplus,
+)
+
+# Smallest representable number
+EPS = {
+    'float32': paddle.finfo(paddle.float32).eps,
+    'float64': paddle.finfo(paddle.float64).eps,
+}
+
+
+def _clip_probs(probs, dtype):
+    """Clip probs from [0, 1] to (0, 1) with ``eps``.
+
+    Args:
+        probs (Tensor): probs of Bernoulli.
+        dtype (str): data type.
+
+    Returns:
+        Tensor: Clipped probs.
+    """
+    eps = EPS.get(dtype)
+    return paddle.clip(probs, min=eps, max=1 - eps).astype(dtype)
+
+
+class Bernoulli(exponential_family.ExponentialFamily):
+    r"""Bernoulli distribution parameterized by ``probs``, which is the probability of value 1.
+
+    In probability theory and statistics, the Bernoulli distribution, named after Swiss
+    mathematician Jacob Bernoulli, is the discrete probability distribution of a random
+    variable which takes the value 1 with probability ``p`` and the value 0 with
+    probability ``q=1-p``.
+
+    The probability mass function of this distribution, over possible outcomes ``k``, is
+
+    .. math::
+
+        {\begin{cases}
+            q=1-p & \text{if }value=0 \\
+            p & \text{if }value=1
+        \end{cases}}
+
+    Args:
+        probs (float|Tensor): The ``probs`` input of Bernoulli distribution. The data type is float32 or float64. The range must be in [0, 1].
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            from paddle.distribution import Bernoulli
+
+            # init `probs` with a float
+            rv = Bernoulli(probs=0.3)
+
+            print(rv.mean)
+            # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+            #        [0.30000001])
+
+            print(rv.variance)
+            # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+            #        [0.21000001])
+
+            print(rv.entropy())
+            # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+            #        [0.61086434])
+    """
+
+    def __init__(self, probs, name=None):
+        self.name = name or 'Bernoulli'
+        if not _non_static_mode():
+            check_type(
+                probs,
+                'probs',
+                (float, tensor.Variable),
+                self.name,
+            )
+
+        # Get/convert probs to tensor.
+        if self._validate_args(probs):
+            self.probs = probs
+            self.dtype = convert_dtype(probs.dtype)
+        else:
+            [self.probs] = self._to_tensor(probs)
+            self.dtype = paddle.get_default_dtype()
+
+        # Check probs range [0, 1].
+        if _non_static_mode():
+            """Do not use `paddle.any` in static mode, where it always evaluates to `True`."""
+            if (
+                paddle.any(self.probs < 0)
+                or paddle.any(self.probs > 1)
+                or paddle.any(paddle.isnan(self.probs))
+            ):
+                raise ValueError("The arg of `probs` must be in range [0, 1].")
+
+        # Clip probs from [0, 1] to (0, 1) with smallest representable number `eps`.
+        self.probs = _clip_probs(self.probs, self.dtype)
+        self.logits = self._probs_to_logits(self.probs, is_binary=True)
+
+        super().__init__(batch_shape=self.probs.shape, event_shape=())
+
+    @property
+    def mean(self):
+        """Mean of Bernoulli distribution.
+
+        Returns:
+            Tensor: Mean value of distribution.
+        """
+        return self.probs
+
+    @property
+    def variance(self):
+        """Variance of Bernoulli distribution.
+
+        Returns:
+            Tensor: Variance value of distribution.
+        """
+        return paddle.multiply(self.probs, (1 - self.probs))
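+
+    # (For example, `Bernoulli(0.3)` has mean 0.3 and variance
+    # 0.3 * (1 - 0.3) = 0.21, matching `scipy.stats.bernoulli(0.3).stats(moments='mv')`.)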
+
+    def sample(self, shape):
+        """Sample from Bernoulli distribution.
+
+        Args:
+            shape (Sequence[int]): Sample shape.
+
+        Returns:
+            Tensor: Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`.
+
+        Examples:
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.distribution import Bernoulli
+
+                rv = Bernoulli(paddle.full((), 0.3))
+                print(rv.sample([100]).shape)
+                # [100]
+
+                rv = Bernoulli(paddle.to_tensor(0.3))
+                print(rv.sample([100]).shape)
+                # [100, 1]
+
+                rv = Bernoulli(paddle.to_tensor([0.3, 0.5]))
+                print(rv.sample([100]).shape)
+                # [100, 2]
+
+                rv = Bernoulli(paddle.to_tensor([0.3, 0.5]))
+                print(rv.sample([100, 2]).shape)
+                # [100, 2, 2]
+        """
+        name = self.name + '_sample'
+        if not _non_static_mode():
+            check_type(
+                shape,
+                'shape',
+                (np.ndarray, tensor.Variable, list, tuple),
+                name,
+            )
+
+        shape = shape if isinstance(shape, tuple) else tuple(shape)
+        shape = self._extend_shape(shape)
+
+        with paddle.no_grad():
+            return paddle.bernoulli(self.probs.expand(shape), name=name)
+
+    def rsample(self, shape, temperature=1.0):
+        """Sample from Bernoulli distribution (reparameterized).
+
+        `rsample` is a continuous approximation of the Bernoulli distribution's reparameterized sampling method.
+        [1] Chris J. Maddison, Andriy Mnih, and Yee Whye Teh. The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables. 2016.
+        [2] Eric Jang, Shixiang Gu, and Ben Poole. Categorical Reparameterization with Gumbel-Softmax. 2016.
+
+        Note:
+            `rsample` needs to be followed by a `sigmoid`, which converts the samples' values to the unit interval (0, 1).
+
+        Args:
+            shape (Sequence[int]): Sample shape.
+            temperature (float): Temperature for rsample; must be positive.
+
+        Returns:
+            Tensor: Sampled data with shape `sample_shape` + `batch_shape` + `event_shape`.
+
+        Examples:
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.distribution import Bernoulli
+
+                paddle.seed(2023)
+
+                rv = Bernoulli(paddle.full((), 0.3))
+                print(rv.sample([100]).shape)
+                # [100]
+
+                rv = Bernoulli(0.3)
+                print(rv.rsample([100]).shape)
+                # [100, 1]
+
+                rv = Bernoulli(paddle.to_tensor([0.3, 0.5]))
+                print(rv.rsample([100]).shape)
+                # [100, 2]
+
+                rv = Bernoulli(paddle.to_tensor([0.3, 0.5]))
+                print(rv.rsample([100, 2]).shape)
+                # [100, 2, 2]
+
+                # `rsample` has to be followed by a `sigmoid`
+                rv = Bernoulli(0.3)
+                rsample = rv.rsample([3, ])
+                rsample_sigmoid = paddle.nn.functional.sigmoid(rsample)
+                print(rsample, rsample_sigmoid)
+                # Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[-0.88315082],
+                #         [-0.62347704],
+                #         [-0.31513220]]) Tensor(shape=[3, 1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [[0.29252526],
+                #         [0.34899110],
+                #         [0.42186251]])
+
+                # The smaller the `temperature`, the closer the distribution of `rsample` is to `sample`, with `probs` of 0.3.
+                print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=1.0)).sum())
+                # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [361.06829834])
+
+                print(paddle.nn.functional.sigmoid(rv.rsample([1000, ], temperature=0.1)).sum())
+                # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [288.66418457])
+        """
+        name = self.name + '_rsample'
+        if not _non_static_mode():
+            check_type(
+                shape,
+                'shape',
+                (np.ndarray, tensor.Variable, list, tuple),
+                name,
+            )
+            check_type(
+                temperature,
+                'temperature',
+                (float,),
+                name,
+            )
+
+        shape = shape if isinstance(shape, tuple) else tuple(shape)
+        shape = self._extend_shape(shape)
+
+        temperature = paddle.full(
+            shape=(), fill_value=temperature, dtype=self.dtype
+        )
+
+        probs = self.probs.expand(shape)
+        uniforms = paddle.rand(shape, dtype=self.dtype)
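+        # A sketch of the relaxation computed below, with u ~ Uniform(0, 1):
+        #     rsample = (logit(u) + logit(probs)) / temperature
+        # where logit(z) = log(z) - log1p(-z); `sigmoid` then maps the result
+        # back into the unit interval (0, 1).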
+        return paddle.divide(
+            paddle.add(
+                paddle.subtract(uniforms.log(), (-uniforms).log1p()),
+                paddle.subtract(probs.log(), (-probs).log1p()),
+            ),
+            temperature,
+        )
+
+    def cdf(self, value):
+        r"""Cumulative distribution function (CDF) evaluated at value.
+
+        .. math::
+
+            { \begin{cases}
+                0 & \text{if } value \lt  0 \\
+                1 - p & \text{if } 0 \leq value \lt  1 \\
+                1 & \text{if } value \geq 1
+            \end{cases}
+            }
+
+        Args:
+            value (Tensor): Value to be evaluated.
+
+        Returns:
+            Tensor: CDF evaluated at value.
+
+        Examples:
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.distribution import Bernoulli
+
+                rv = Bernoulli(0.3)
+                print(rv.cdf(paddle.to_tensor([1.0])))
+                # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [1.])
+        """
+        name = self.name + '_cdf'
+        if not _non_static_mode():
+            check_type(value, 'value', tensor.Variable, name)
+
+        value = self._check_values_dtype_in_probs(self.probs, value)
+        probs, value = paddle.broadcast_tensors([self.probs, value])
+
+        zeros = paddle.zeros_like(probs)
+        ones = paddle.ones_like(probs)
+
+        return paddle.where(
+            value < 0,
+            zeros,
+            paddle.where(value < 1, paddle.subtract(ones, probs), ones),
+            name=name,
+        )
+
+    def log_prob(self, value):
+        """Log of probability density function.
+
+        Args:
+            value (Tensor): Value to be evaluated.
+
+        Returns:
+            Tensor: Log of probability density evaluated at value.
+
+        Examples:
+
+            .. code-block:: python
+
+                import paddle
+                from paddle.distribution import Bernoulli
+
+                rv = Bernoulli(0.3)
+                print(rv.log_prob(paddle.to_tensor([1.0])))
+                # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True,
+                #        [-1.20397282])
+        """
+        name = self.name + '_log_prob'
+        if not _non_static_mode():
+            check_type(value, 'value', tensor.Variable, name)
+
+        value = self._check_values_dtype_in_probs(self.probs, value)
+        logits, value = paddle.broadcast_tensors([self.logits, value])
+        return -binary_cross_entropy_with_logits(
+            logits, value, reduction='none', name=name
+        )
+
+    def prob(self, value):
+        r"""Probability density function (PDF) evaluated at value.
+
+        .. math::
+
+            { \begin{cases}
+                q=1-p & \text{if }value=0 \\
+                p & \text{if }value=1
+            \end{cases}
+            }
+
+        Args:
+            value (Tensor): Value to be evaluated.
+
+        Returns:
+            Tensor: PDF evaluated at value.
+
+        Examples:
+
+            .. 
code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.prob(paddle.to_tensor([1.0]))) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.29999998]) + """ + name = self.name + '_prob' + if not _non_static_mode(): + check_type(value, 'value', tensor.Variable, name) + + return self.log_prob(value).exp(name=name) + + def entropy(self): + r"""Entropy of Bernoulli distribution. + + .. math:: + + { + entropy = -(q \log q + p \log p) + } + + Returns: + Tensor: Entropy of distribution. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + print(rv.entropy()) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.61086434]) + """ + name = self.name + '_entropy' + + return binary_cross_entropy_with_logits( + self.logits, self.probs, reduction='none', name=name + ) + + def kl_divergence(self, other): + r"""The KL-divergence between two Bernoulli distributions. + + .. math:: + + { + KL(a || b) = p_a \log(p_a / p_b) + (1 - p_a) \log((1 - p_a) / (1 - p_b)) + } + + Args: + other (Bernoulli): instance of Bernoulli. + + Returns: + Tensor: kl-divergence between two Bernoulli distributions. + + Examples: + + .. code-block:: python + + import paddle + from paddle.distribution import Bernoulli + + rv = Bernoulli(0.3) + rv_other = Bernoulli(0.7) + + print(rv.kl_divergence(rv_other)) + # Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.33891910]) + """ + name = self.name + '_kl_divergence' + if not _non_static_mode(): + check_type(other, 'other', Bernoulli, name) + + a_logits = self.logits + b_logits = other.logits + + log_pa = -softplus(-a_logits) + log_pb = -softplus(-b_logits) + + pa = sigmoid(a_logits) + one_minus_pa = sigmoid(-a_logits) + + log_one_minus_pa = -softplus(a_logits) + log_one_minus_pb = -softplus(b_logits) + + return paddle.add( + paddle.subtract( + paddle.multiply(log_pa, pa), paddle.multiply(log_pb, pa) + ), + paddle.subtract( + paddle.multiply(log_one_minus_pa, one_minus_pa), + paddle.multiply(log_one_minus_pb, one_minus_pa), + ), + ) diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index ac3b94d4ebd666..3d630b5802b722 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -15,6 +15,7 @@ import warnings import paddle +from paddle.distribution.bernoulli import Bernoulli from paddle.distribution.beta import Beta from paddle.distribution.categorical import Categorical from paddle.distribution.dirichlet import Dirichlet @@ -143,6 +144,11 @@ def __le__(self, other): return True +@register_kl(Bernoulli, Bernoulli) +def _kl_bernoulli_bernoulli(p, q): + return p.kl_divergence(q) + + @register_kl(Beta, Beta) def _kl_beta_beta(p, q): return ( diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py new file mode 100644 index 00000000000000..2229880b7a6bfa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli.py @@ -0,0 +1,596 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import scipy.special +import scipy.stats +from config import ATOL, DEVICES, RTOL +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, + place, +) +from test_distribution import DistributionNumpy + +import paddle +from paddle.distribution import Bernoulli +from paddle.distribution.kl import kl_divergence +from paddle.fluid.data_feeder import convert_dtype + +np.random.seed(2023) +paddle.seed(2023) + +# Smallest representable number. +EPS = { + 'float32': np.finfo('float32').eps, + 'float64': np.finfo('float64').eps, +} + + +def _clip_probs_ndarray(probs, dtype): + """Clip probs from [0, 1] to (0, 1) with ``eps``""" + eps = EPS.get(dtype) + return np.clip(probs, a_min=eps, a_max=1 - eps).astype(dtype) + + +def _sigmoid(z): + return scipy.special.expit(z) + + +def _kstest(samples_a, samples_b, temperature=1): + """Uses the Kolmogorov-Smirnov test for goodness of fit.""" + _, p_value = scipy.stats.ks_2samp(samples_a, samples_b) + return not (p_value < 0.02 * (min(1, temperature))) + + +class BernoulliNumpy(DistributionNumpy): + def __init__(self, probs): + probs = np.array(probs) + if str(probs.dtype) not in ['float32', 'float64']: + self.dtype = 'float32' + else: + self.dtype = probs.dtype + + self.batch_shape = np.shape(probs) + + self.probs = _clip_probs_ndarray( + np.array(probs, dtype=self.dtype), str(self.dtype) + ) + self.logits = self._probs_to_logits(self.probs, is_binary=True) + + self.rv = scipy.stats.bernoulli(self.probs.astype('float64')) + + @property + def mean(self): + return self.rv.mean().astype(self.dtype) + + @property + def variance(self): + return self.rv.var().astype(self.dtype) + + def sample(self, shape): + shape = np.array(shape, dtype='int') + if shape.ndim: + shape = shape.tolist() + else: + shape = [shape.tolist()] + return self.rv.rvs(size=shape + list(self.batch_shape)).astype( + self.dtype + ) + + def log_prob(self, value): + return self.rv.logpmf(value).astype(self.dtype) + + def prob(self, value): + return self.rv.pmf(value).astype(self.dtype) + + def cdf(self, value): + return self.rv.cdf(value).astype(self.dtype) + + def entropy(self): + return ( + np.maximum( + self.logits, + 0, + ) + - self.logits * self.probs + + np.log(1 + np.exp(-np.abs(self.logits))) + ).astype(self.dtype) + + def kl_divergence(self, other): + """ + .. math:: + + KL[a || b] = Pa * Log[Pa / Pb] + (1 - Pa) * Log[(1 - Pa) / (1 - Pb)] + """ + p_a = self.probs + p_b = other.probs + return ( + p_a * np.log(p_a / p_b) + (1 - p_a) * np.log((1 - p_a) / (1 - p_b)) + ).astype(self.dtype) + + def _probs_to_logits(self, probs, is_binary=False): + return ( + (np.log(probs) - np.log1p(-probs)) if is_binary else np.log(probs) + ).astype(self.dtype) + + +class BernoulliTest(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + with paddle.fluid.dygraph.guard(self.place): + # just for convenience + self.dtype = self.expected_dtype + + # init numpy with `dtype` + self.init_numpy_data(self.probs, self.dtype) + + # init paddle and check dtype convert. 
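+            # A plain python float `probs` is converted to float32, while a
+            # tensor `probs` keeps its own dtype (see the parameterized cases
+            # below); init_dynamic_data asserts exactly that conversion.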
+ self.init_dynamic_data(self.probs, self.default_dtype, self.dtype) + + def init_numpy_data(self, probs, dtype): + probs = np.array(probs).astype(dtype) + self.rv_np = BernoulliNumpy(probs) + + def init_dynamic_data(self, probs, default_dtype, dtype): + self.rv_paddle = Bernoulli(probs) + self.assertTrue( + dtype == convert_dtype(self.rv_paddle.probs.dtype), + (dtype, self.rv_paddle.probs.dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'probs', 'default_dtype', 'expected_dtype'), + [ + # 0-D probs + ('probs_00_32', paddle.full((), 0.0), 'float32', 'float32'), + ('probs_03_32', paddle.full((), 0.3), 'float32', 'float32'), + ('probs_10_32', paddle.full((), 1.0), 'float32', 'float32'), + ( + 'probs_00_64', + paddle.full((), 0.0, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_03_64', + paddle.full((), 0.3, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_10_64', + paddle.full((), 1.0, dtype='float64'), + 'float64', + 'float64', + ), + # 1-D probs + ('probs_00', 0.0, 'float64', 'float32'), + ('probs_03', 0.3, 'float64', 'float32'), + ('probs_10', 1.0, 'float64', 'float32'), + ('probs_tensor_03_32', paddle.to_tensor(0.3), 'float32', 'float32'), + ( + 'probs_tensor_03_64', + paddle.to_tensor(0.3, dtype='float64'), + 'float64', + 'float64', + ), + ( + 'probs_tensor_03_list_32', + paddle.to_tensor( + [ + 0.3, + ] + ), + 'float32', + 'float32', + ), + ( + 'probs_tensor_03_list_64', + paddle.to_tensor( + [ + 0.3, + ], + dtype='float64', + ), + 'float64', + 'float64', + ), + # N-D probs + ( + 'probs_tensor_0305', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + ), + ( + 'probs_tensor_03050104', + paddle.to_tensor(((0.3, 0.5), (0.1, 0.4))), + 'float32', + 'float32', + ), + ], +) +class BernoulliTestFeature(BernoulliTest): + def test_mean(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.mean, + self.rv_np.mean, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_variance(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.variance, + self.rv_np.variance, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + ( + paddle.to_tensor( + 0.0, + ), + ), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_log_prob(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + log_prob = self.rv_paddle.log_prob(value) + np.testing.assert_allclose( + log_prob, + self.rv_np.log_prob(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(log_prob.dtype)) + + else: + with self.assertWarns(UserWarning): + self.rv_paddle.log_prob(value) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + (paddle.to_tensor(0.0),), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_prob(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + prob = self.rv_paddle.prob(value) + np.testing.assert_allclose( + prob, + self.rv_np.prob(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(prob.dtype)) + + else: + with self.assertWarns(UserWarning): + 
self.rv_paddle.prob(value) + + @parameterize_func( + [ + ( + paddle.to_tensor( + [ + 0.0, + ] + ), + ), + (paddle.to_tensor(0.0),), + (paddle.to_tensor(0.3),), + (paddle.to_tensor(0.7),), + (paddle.to_tensor(1.0),), + (paddle.to_tensor(0.0, dtype='float64'),), + ] + ) + def test_cdf(self, value): + with paddle.fluid.dygraph.guard(self.place): + if convert_dtype(value.dtype) == convert_dtype( + self.rv_paddle.probs.dtype + ): + cdf = self.rv_paddle.cdf(value) + np.testing.assert_allclose( + cdf, + self.rv_np.cdf(value), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + self.assertTrue(self.dtype == convert_dtype(cdf.dtype)) + + else: + with self.assertWarns(UserWarning): + self.rv_paddle.cdf(value) + + def test_entropy(self): + with paddle.fluid.dygraph.guard(self.place): + np.testing.assert_allclose( + self.rv_paddle.entropy(), + self.rv_np.entropy(), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_kl_divergence(self): + with paddle.fluid.dygraph.guard(self.place): + other_probs = paddle.to_tensor(0.9, dtype=self.dtype) + + rv_paddle_other = Bernoulli(other_probs) + rv_np_other = BernoulliNumpy(other_probs) + + np.testing.assert_allclose( + self.rv_paddle.kl_divergence(rv_paddle_other), + self.rv_np.kl_divergence(rv_np_other), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + np.testing.assert_allclose( + kl_divergence(self.rv_paddle, rv_paddle_other), + self.rv_np.kl_divergence(rv_np_other), + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + ( + TEST_CASE_NAME, + 'probs', + 'default_dtype', + 'expected_dtype', + 'shape', + 'expected_shape', + ), + [ + # 0-D probs + ( + 'probs_0d_1d', + paddle.full((), 0.3), + 'float32', + 'float32', + [ + 100, + ], + [ + 100, + ], + ), + ( + 'probs_0d_2d', + paddle.full((), 0.3), + 'float32', + 'float32', + [100, 1], + [100, 1], + ), + ( + 'probs_0d_3d', + paddle.full((), 0.3), + 'float32', + 'float32', + [100, 2, 3], + [100, 2, 3], + ), + # 1-D probs + ( + 'probs_1d_1d_32', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [ + 100, + ], + [100, 1], + ), + ( + 'probs_1d_1d_64', + paddle.to_tensor(0.3, dtype='float64'), + 'float64', + 'float64', + paddle.to_tensor( + [ + 100, + ] + ), + [100, 1], + ), + ( + 'probs_1d_2d', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [100, 2], + [100, 2, 1], + ), + ( + 'probs_1d_3d', + paddle.to_tensor(0.3), + 'float32', + 'float32', + [100, 2, 3], + [100, 2, 3, 1], + ), + # N-D probs + ( + 'probs_2d_1d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [ + 100, + ], + [100, 2], + ), + ( + 'probs_2d_2d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [100, 3], + [100, 3, 2], + ), + ( + 'probs_2d_3d', + paddle.to_tensor((0.3, 0.5)), + 'float32', + 'float32', + [100, 4, 3], + [100, 4, 3, 2], + ), + ], +) +class BernoulliTestSample(BernoulliTest): + def test_sample(self): + with paddle.fluid.dygraph.guard(self.place): + sample_np = self.rv_np.sample(self.shape) + sample_paddle = self.rv_paddle.sample(self.shape) + + self.assertEqual(list(sample_paddle.shape), self.expected_shape) + self.assertEqual(sample_paddle.dtype, self.rv_paddle.probs.dtype) + + if self.probs.ndim: + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + sample_np[..., i].reshape(-1), + sample_paddle.numpy()[..., i].reshape(-1), + ) + ) + else: + self.assertTrue( + _kstest( + sample_np.reshape(-1), + sample_paddle.numpy().reshape(-1), + ) + ) + + @parameterize_func( + [ + (1.0,), + (0.1,), + ] + ) + 
def test_rsample(self, temperature): + """Compare two samples from `rsample` method, one from scipy `sample` and another from paddle `rsample`.""" + with paddle.fluid.dygraph.guard(self.place): + sample_np = self.rv_np.sample(self.shape) + rsample_paddle = self.rv_paddle.rsample(self.shape, temperature) + + self.assertEqual(list(rsample_paddle.shape), self.expected_shape) + self.assertEqual(rsample_paddle.dtype, self.rv_paddle.probs.dtype) + + if self.probs.ndim: + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + sample_np[..., i].reshape(-1), + ( + _sigmoid(rsample_paddle.numpy()[..., i]) > 0.5 + ).reshape(-1), + temperature, + ) + ) + else: + self.assertTrue( + _kstest( + sample_np.reshape(-1), + (_sigmoid(rsample_paddle.numpy()) > 0.5).reshape(-1), + temperature, + ) + ) + + def test_rsample_backpropagation(self): + with paddle.fluid.dygraph.guard(self.place): + self.rv_paddle.probs.stop_gradient = False + rsample_paddle = self.rv_paddle.rsample(self.shape) + rsample_paddle = paddle.nn.functional.sigmoid(rsample_paddle) + grads = paddle.grad([rsample_paddle], [self.rv_paddle.probs]) + self.assertEqual(len(grads), 1) + self.assertEqual(grads[0].dtype, self.rv_paddle.probs.dtype) + self.assertEqual(grads[0].shape, self.rv_paddle.probs.shape) + + +@place(DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['BernoulliTestError']) +class BernoulliTestError(unittest.TestCase): + def setUp(self): + paddle.disable_static(self.place) + + @parameterize_func( + [ + (-0.1, ValueError), + (1.1, ValueError), + (np.nan, ValueError), + (-1j + 1, TypeError), + ] + ) + def test_bad_init(self, probs, error): + with paddle.fluid.dygraph.guard(self.place): + self.assertRaises(error, Bernoulli, probs) + + @parameterize_func( + [ + ( + [0.3, 0.5], + paddle.to_tensor([0.1, 0.2, 0.3]), + ), + ] + ) + def test_bad_broadcast(self, probs, value): + with paddle.fluid.dygraph.guard(self.place): + rv = Bernoulli(probs) + self.assertRaises(ValueError, rv.cdf, value) + self.assertRaises(ValueError, rv.log_prob, value) + self.assertRaises(ValueError, rv.prob, value) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py new file mode 100644 index 00000000000000..3390262792668e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distribution/test_distribution_bernoulli_static.py @@ -0,0 +1,468 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
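+
+# The tests below mirror test_distribution_bernoulli.py under the static
+# graph. A minimal sketch of the harness pattern they rely on (hypothetical
+# values, not part of the suite): build the program inside
+# `paddle.static.program_guard`, then fetch results through an executor.
+#
+#     paddle.enable_static()
+#     main, startup = paddle.static.Program(), paddle.static.Program()
+#     with paddle.static.program_guard(main, startup):
+#         rv = Bernoulli(probs=paddle.to_tensor(0.3))
+#         fetches = [rv.mean, rv.variance, rv.entropy()]
+#     exe = paddle.static.Executor()
+#     exe.run(startup)
+#     mean, variance, entropy = exe.run(main, feed={}, fetch_list=fetches)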
+ +import unittest + +import numpy as np +from config import ATOL, DEVICES, RTOL +from parameterize import ( + TEST_CASE_NAME, + parameterize_cls, + parameterize_func, + place, +) +from test_distribution_bernoulli import BernoulliNumpy, _kstest, _sigmoid + +import paddle +from paddle.distribution import Bernoulli +from paddle.distribution.kl import kl_divergence + +np.random.seed(2023) +paddle.seed(2023) +paddle.enable_static() +default_dtype = paddle.get_default_dtype() + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'params'), # params: name, probs, probs_other, value + [ + ( + 'params', + ( + # 1-D probs + ( + 'probs_not_iterable', + 0.3, + 0.7, + 1.0, + ), + ( + 'probs_not_iterable_and_broadcast_for_value', + 0.3, + 0.7, + np.array([[0.0, 1.0], [1.0, 0.0]], dtype=default_dtype), + ), + # N-D probs + ( + 'probs_tuple_0305', + (0.3, 0.5), + 0.7, + 1.0, + ), + ( + 'probs_tuple_03050104', + ((0.3, 0.5), (0.1, 0.4)), + 0.7, + 1.0, + ), + ), + ) + ], +) +class BernoulliTestFeature(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + self.params_len = len(self.params) + + with paddle.static.program_guard(self.program): + self.init_numpy_data(self.params) + self.init_static_data(self.params) + + def init_numpy_data(self, params): + self.mean_np = [] + self.variance_np = [] + self.log_prob_np = [] + self.prob_np = [] + self.cdf_np = [] + self.entropy_np = [] + self.kl_np = [] + + for _, probs, probs_other, value in params: + rv_np = BernoulliNumpy(probs) + rv_np_other = BernoulliNumpy(probs_other) + + self.mean_np.append(rv_np.mean) + self.variance_np.append(rv_np.variance) + self.log_prob_np.append(rv_np.log_prob(value)) + self.prob_np.append(rv_np.prob(value)) + self.cdf_np.append(rv_np.cdf(value)) + self.entropy_np.append(rv_np.entropy()) + self.kl_np.append(rv_np.kl_divergence(rv_np_other)) + + def init_static_data(self, params): + with paddle.static.program_guard(self.program): + rv_paddles = [] + rv_paddles_other = [] + values = [] + for _, probs, probs_other, value in params: + if not isinstance(value, np.ndarray): + value = paddle.full([1], value, dtype=default_dtype) + else: + value = paddle.to_tensor(value, place=self.place) + + rv_paddles.append(Bernoulli(probs=paddle.to_tensor(probs))) + rv_paddles_other.append( + Bernoulli(probs=paddle.to_tensor(probs_other)) + ) + values.append(value) + + results = self.executor.run( + self.program, + feed={}, + fetch_list=[ + [ + rv_paddles[i].mean, + rv_paddles[i].variance, + rv_paddles[i].log_prob(values[i]), + rv_paddles[i].prob(values[i]), + rv_paddles[i].cdf(values[i]), + rv_paddles[i].entropy(), + rv_paddles[i].kl_divergence(rv_paddles_other[i]), + kl_divergence(rv_paddles[i], rv_paddles_other[i]), + ] + for i in range(self.params_len) + ], + ) + + self.mean_paddle = [] + self.variance_paddle = [] + self.log_prob_paddle = [] + self.prob_paddle = [] + self.cdf_paddle = [] + self.entropy_paddle = [] + self.kl_paddle = [] + self.kl_func_paddle = [] + for i in range(self.params_len): + ( + _mean, + _variance, + _log_prob, + _prob, + _cdf, + _entropy, + _kl, + _kl_func, + ) = results[i * 8 : (i + 1) * 8] + self.mean_paddle.append(_mean) + self.variance_paddle.append(_variance) + self.log_prob_paddle.append(_log_prob) + self.prob_paddle.append(_prob) + self.cdf_paddle.append(_cdf) + self.entropy_paddle.append(_entropy) + self.kl_paddle.append(_kl) + self.kl_func_paddle.append(_kl_func) + + def test_all(self): + for i in range(self.params_len): + 
self._test_mean(i) + self._test_variance(i) + self._test_log_prob(i) + self._test_prob(i) + self._test_cdf(i) + self._test_entropy(i) + self._test_kl_divergence(i) + + def _test_mean(self, i): + np.testing.assert_allclose( + self.mean_np[i], + self.mean_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_variance(self, i): + np.testing.assert_allclose( + self.variance_np[i], + self.variance_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_log_prob(self, i): + np.testing.assert_allclose( + self.log_prob_np[i], + self.log_prob_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_prob(self, i): + np.testing.assert_allclose( + self.prob_np[i], + self.prob_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_cdf(self, i): + np.testing.assert_allclose( + self.cdf_np[i], + self.cdf_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_entropy(self, i): + np.testing.assert_allclose( + self.entropy_np[i], + self.entropy_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + def _test_kl_divergence(self, i): + np.testing.assert_allclose( + self.kl_np[i], + self.kl_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + np.testing.assert_allclose( + self.kl_np[i], + self.kl_func_paddle[i], + rtol=RTOL.get(default_dtype), + atol=ATOL.get(default_dtype), + ) + + +@place(DEVICES) +@parameterize_cls( + (TEST_CASE_NAME, 'probs', 'shape', 'temperature', 'expected_shape'), + [ + # 1-D probs + ( + 'probs_03', + (0.3,), + [ + 100, + ], + 0.1, + [100, 1], + ), + # N-D probs + ( + 'probs_0305', + (0.3, 0.5), + [ + 100, + ], + 0.1, + [100, 2], + ), + ], +) +class BernoulliTestSample(unittest.TestCase): + def setUp(self): + self.program = paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + with paddle.static.program_guard(self.program): + self.init_numpy_data(self.probs, self.shape) + self.init_static_data(self.probs, self.shape, self.temperature) + + def init_numpy_data(self, probs, shape): + self.rv_np = BernoulliNumpy(probs) + self.sample_np = self.rv_np.sample(shape) + + def init_static_data(self, probs, shape, temperature): + with paddle.static.program_guard(self.program): + self.rv_paddle = Bernoulli(probs=paddle.to_tensor(probs)) + + [self.sample_paddle, self.rsample_paddle] = self.executor.run( + self.program, + feed={}, + fetch_list=[ + self.rv_paddle.sample(shape), + self.rv_paddle.rsample(shape, temperature), + ], + ) + + def test_sample(self): + with paddle.static.program_guard(self.program): + self.assertEqual( + list(self.sample_paddle.shape), self.expected_shape + ) + + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + self.sample_np[..., i].reshape(-1), + self.sample_paddle[..., i].reshape(-1), + ) + ) + + def test_rsample(self): + """Compare two samples from `rsample` method, one from scipy and another from paddle.""" + with paddle.static.program_guard(self.program): + self.assertEqual( + list(self.rsample_paddle.shape), self.expected_shape + ) + + for i in range(len(self.probs)): + self.assertTrue( + _kstest( + self.sample_np[..., i].reshape(-1), + (_sigmoid(self.rsample_paddle[..., i]) > 0.5).reshape( + -1 + ), + self.temperature, + ) + ) + + +@place(DEVICES) +@parameterize_cls([TEST_CASE_NAME], ['BernoulliTestError']) +class BernoulliTestError(unittest.TestCase): + def setUp(self): + self.program 
= paddle.static.Program() + self.executor = paddle.static.Executor(self.place) + + @parameterize_func( + [ + (0,), # int + ((0.3,),), # tuple + ( + [ + 0.3, + ], + ), # list + ( + np.array( + [ + 0.3, + ] + ), + ), # ndarray + (-1j + 1,), # complex + ('0',), # str + ] + ) + def test_bad_init_type(self, probs): + with paddle.static.program_guard(self.program): + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[Bernoulli(probs=probs)] + ) + + @parameterize_func( + [ + (100,), # int + (100.0,), # float + ] + ) + def test_bad_sample_shape_type(self, shape): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.sample(shape)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.rsample(shape)] + ) + + @parameterize_func( + [ + (1,), # int + ] + ) + def test_bad_rsample_temperature_type(self, temperature): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, + feed={}, + fetch_list=[rv.rsample([100], temperature)], + ) + + @parameterize_func( + [ + (1,), # int + (1.0,), # float + ([1.0],), # list + ((1.0),), # tuple + (np.array(1.0),), # ndarray + ] + ) + def test_bad_value_type(self, value): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.log_prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.cdf(value)] + ) + + @parameterize_func( + [ + (np.array(1.0),), # ndarray or other distribution + ] + ) + def test_bad_kl_other_type(self, other): + with paddle.static.program_guard(self.program): + rv = Bernoulli(0.3) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.kl_divergence(other)] + ) + + @parameterize_func( + [ + (paddle.to_tensor([0.1, 0.2, 0.3]),), + ] + ) + def test_bad_broadcast(self, value): + with paddle.static.program_guard(self.program): + rv = Bernoulli(paddle.to_tensor([0.3, 0.5])) + + # `logits, value = paddle.broadcast_tensors([self.logits, value])` + # raise ValueError in dygraph, raise TypeError in static. 
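+            # (Presumably the eager kernel validates the shapes at call time
+            # and raises ValueError, while the static graph builder's input
+            # check fires first and reports the mismatch as a TypeError.)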
+ with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.cdf(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.log_prob(value)] + ) + + with self.assertRaises(TypeError): + [_] = self.executor.run( + self.program, feed={}, fetch_list=[rv.prob(value)] + ) + + +if __name__ == '__main__': + unittest.main() From b0f17d05b952321cf1011e90036cfbc39381a9c3 Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 12 Apr 2023 10:20:25 +0800 Subject: [PATCH 074/156] [Prim] Add instance_norm composite rule (#52203) * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * fix * isamp * gpu * cpu * noamp * fix instance_norm * fix * fix unit test * fix unit test * add unit test * fix * add big data tests * fix * fix * fix * fix * fix * fix * fix * add test case * fix * fix * fix * fix * fix * remove amp test --------- Co-authored-by: heyanru01 <429520051@qq.com> --- .../tests/unittests/test_instance_norm_op.py | 639 +++++++++++++++++- .../incubate/autograd/composite_rules.py | 30 + 2 files changed, 667 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py index 9e4445b7575cdb..bab904db6eef05 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py @@ -15,9 +15,11 @@ import unittest import numpy as np +import parameterized as param +from eager_op_test import OpTest import paddle -from paddle import fluid +from paddle import fluid, nn from paddle.fluid import Program, core, program_guard from paddle.fluid.dygraph import to_variable @@ -33,7 +35,7 @@ def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var): var_tile = np.reshape(var, (n, c, 1, 1)) var_tile = np.tile(var_tile, (1, 1, h, w)) - x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon).astype('float32') + x_norm = (x - mean_tile) / np.sqrt(var_tile + epsilon) scale_tile = np.reshape(scale, (1, c, 1, 1)) scale_tile = np.tile(scale_tile, (n, 1, h, w)) bias_tile = np.reshape(bias, (1, c, 1, 1)) @@ -84,6 +86,633 @@ def _cal_mean_variance(x, epsilon, mean_shape): return mean, var +def instance_norm_wrapper(x, weight=None, bias=None, esp=1e-05): + return paddle.nn.functional.instance_norm( + x, None, None, weight, bias, True, 0.9, esp + ) + + +class TestInstanceNormOp(OpTest): + def setUp(self): + self.op_type = "instance_norm" + self.prim_op_type = "comp" + self.python_api = instance_norm_wrapper + self.public_python_api = instance_norm_wrapper + self.python_out_sig = ['Y'] + self.fw_comp_rtol = 1e-6 + self.fw_comp_atol = 1e-6 + self.rev_comp_rtol = 1e-4 + self.rev_comp_atol = 1e-4 + self.init_test_case() + ref_y_np, ref_mean_np, ref_var_np_tmp = _reference_instance_norm_naive( + self.x_np, + self.scale_np, + self.bias_np, + self.epsilon, + self.mean_np, + self.var_np, + ) + + ref_var_np = 1 / np.sqrt(ref_var_np_tmp + self.epsilon) + self.inputs = { + 'X': self.x_np, + 'Scale': self.scale_np, + 'Bias': self.bias_np, + } + self.attrs = {'epsilon': self.epsilon} + self.outputs = { + 'Y': ref_y_np, + 'SavedMean': ref_mean_np, + 'SavedVariance': ref_var_np, + } + self.enable_cinn = False + + def test_check_output(self): + self.check_output(check_prim=True) + + def test_check_grad(self): + self.check_grad(['X', 'Scale', 'Bias'], 'Y', check_prim=True) + + def init_test_case(self): + x_shape = [2, 100, 4, 5] + n, c, 
h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3]
+        self.epsilon = 1e-05
+        dtype = np.float32
+        scale_shape = [c]
+        mean_shape = [n * c]
+        np.random.seed()
+        self.x_np = np.random.random_sample(x_shape).astype(dtype)
+        self.scale_np = np.random.random_sample(scale_shape).astype(dtype)
+        self.bias_np = np.random.random_sample(scale_shape).astype(dtype)
+        self.mean_np, self.var_np = _cal_mean_variance(
+            self.x_np, self.epsilon, mean_shape
+        )
+        self.dtype = dtype
+
+
+class TestInstanceNormFP64(TestInstanceNormOp):
+    def init_test_case(self):
+        x_shape = [2, 100, 4, 5]
+        n, c, h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3]
+        self.epsilon = 1e-5
+        dtype = np.float64
+        scale_shape = [c]
+        mean_shape = [n * c]
+        np.random.seed()
+        self.x_np = np.random.random_sample(x_shape).astype(dtype)
+        self.scale_np = np.ones(scale_shape).astype(dtype)
+        self.bias_np = np.zeros(scale_shape).astype(dtype)
+        self.mean_np, self.var_np = _cal_mean_variance(
+            self.x_np, self.epsilon, mean_shape
+        )
+        self.fw_comp_rtol = 1e-14
+        self.fw_comp_atol = 1e-14
+        self.rev_comp_rtol = 1e-13
+        self.rev_comp_atol = 1e-13
+        self.dtype = dtype
+
+
+class PrimGroupNorm(paddle.nn.Layer):
+    # Note: despite the GroupNorm-style name, this wrapper holds an
+    # InstanceNorm2D layer; it is the module under test below.
+    def __init__(self, num_channels, scale, bias):
+        super().__init__()
+        self.func = nn.InstanceNorm2D(num_channels)
+        paddle.assign(scale, self.func.scale)
+        paddle.assign(bias, self.func.bias)
+
+    def forward(self, x):
+        out = self.func(x)
+        return out
+
+
+def apply_to_static(net, use_cinn):
+    build_strategy = paddle.static.BuildStrategy()
+    build_strategy.build_cinn_pass = use_cinn
+    # Pass the configured strategy through; a literal `False` here would
+    # silently discard the `use_cinn` flag set just above.
+    return paddle.jit.to_static(net, build_strategy=build_strategy)
+
+
+places = [paddle.CPUPlace()]
+if paddle.is_compiled_with_cuda():
+    places.append(paddle.CUDAPlace(0))
+
+
+@param.parameterized_class(
+    (
+        'name',
+        'shape',
+        'epsilon',
+        'data_format',
+        'places',
+        'dtype',
+        'threshold_list',
+        'special_threshold',
+    ),
+    (
+        (
+            'test0',
+            (2, 100, 3, 5),
+            1e-5,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [1e-5, 1e-5, 1e-5],  # cpu thresholds for static
+                [1e-5, 1e-5, 1e-5],  # gpu thresholds for static
+            ],
+            None,
+        ),
+        (
+            'test1',
+            (2, 100, 3, 5),
+            1e-5,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [1e-5, 1e-5, 1e-5],  # cpu thresholds for static
+                [1e-5, 1e-5, 1e-5],  # gpu thresholds for static
+            ],
+            None,
+        ),
+        (
+            'testbigdata_fp32',
+            (8, 32, 32, 64),
+            1e-5,
+            'NCHW',
+            places,
+            'float32',
+            [
+                [1e-5, 1e-5, 1e-5],  # cpu thresholds for static
+                [1e-5, 1e-5, 1e-5],  # gpu thresholds for static
+            ],  # gpu thresholds
+            [2e-2, 2e-2, 2e-2],  # special grad threshold for scale
+        ),
+        (
+            'test0_fp64',
+            (2, 100, 3, 5),
+            1e-5,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [1e-14, 1e-14, 1e-14],  # cpu thresholds for static
+                [1e-14, 1e-14, 1e-14],  # gpu thresholds for static
+            ],
+            [1e-13, 1e-13, 1e-13],
+        ),
+        (
+            'test1_fp64',
+            (2, 100, 3, 5),
+            1e-5,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [1e-14, 1e-14, 1e-14],  # cpu thresholds for static
+                [1e-14, 1e-14, 1e-14],  # gpu thresholds for static
+            ],
+            [1e-13, 1e-13, 1e-13],
+        ),
+        (
+            'testbigdata_fp64',
+            (8, 32, 32, 64),
+            1e-5,
+            'NCHW',
+            places,
+            'float64',
+            [
+                [1e-14, 1e-14, 1e-14],  # cpu thresholds
+                [1e-14, 1e-14, 1e-14],
+            ],  # gpu thresholds
+            [5e-11, 5e-11, 5e-11],  # for X_grad
+        ),
+    ),
+)
+class TestCompositeInstanceNormNorm(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        core._set_prim_all_enabled(True)
+
+    @classmethod
+    def tearDownClass(cls):
+        core._set_prim_all_enabled(False)
+
+    def setUp(self):
+        np.random.seed(1234)
+        self.fwd_desire = []
+        self.rev_desire = []
+        self.x = 
np.random.random(self.shape).astype(self.dtype) + self.scale = np.random.random([self.shape[1]]).astype(self.dtype) + self.bias = np.random.random([self.shape[1]]).astype(self.dtype) + self.num_channels = self.shape[1] + + self.static_fwd_desire = [] + self.static_rev_desire = [] + for place in self.places: + fwd_desire, rev_desire = self.get_eager_desire(place) + self.fwd_desire.append(fwd_desire.numpy()) + self.rev_desire.append(rev_desire.numpy()) + self.static_fwd_desire.append([]) + self.static_rev_desire.append([]) + fwd, rev = self.get_static_desire(place) + self.static_fwd_desire[-1].append(fwd[0]) + self.static_fwd_desire[-1].append(fwd[1]) + self.static_fwd_desire[-1].append(fwd[2]) + self.static_rev_desire[-1].append(rev[0]) + self.static_rev_desire[-1].append(rev[1]) + self.static_rev_desire[-1].append(rev[2]) + + def get_eager_desire(self, place): + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") + core.set_prim_eager_enabled(False) + paddle.disable_static() + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, dtype=self.dtype, place=place, stop_gradient=False + ) + bias_ = paddle.to_tensor( + data=self.bias, dtype=self.dtype, place=place, stop_gradient=False + ) + output = paddle.nn.functional.instance_norm( + input_, None, None, scale_, bias_, True, 0.9, self.epsilon + ) + grad = paddle.grad(output, input_) + + return output, grad[0] + + def get_static_desire(self, place): + core._set_prim_all_enabled(False) + paddle.enable_static() + if isinstance(place, fluid.CPUPlace): + paddle.set_device("cpu") + if isinstance(place, fluid.CUDAPlace): + paddle.set_device("gpu") + + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input_ = paddle.static.data( + 'x', shape=self.x.shape, dtype=self.x.dtype + ) + input_.stop_gradient = False + + scale_ = paddle.static.data( + 'scale_', shape=self.scale.shape, dtype=self.scale.dtype + ) + scale_.stop_gradient = False + + bias_ = paddle.static.data( + 'bias_', shape=self.bias.shape, dtype=self.bias.dtype + ) + bias_.stop_gradient = False + + output = paddle.nn.functional.instance_norm( + input_, None, None, scale_, bias_, True, 0.9, self.epsilon + ) + + blocks = mp.blocks + names = dict( + zip( + blocks[0].ops[0].output_names, + blocks[0].ops[0].output_arg_names, + ) + ) + vars_list = [ + names[key] + for key in [ + "Y", + "SavedMean", + "SavedVariance", + ] + ] + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that instance_norm in original block + assert 'instance_norm' in fwd_ops + + if core._is_fwd_prim_enabled(): + paddle.incubate.autograd.primapi.to_prim(mp.blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that instance_norm is splitted into small ops + assert 'instance_norm' not in fwd_ops_new + + grads = paddle.static.gradients([output], [input_, scale_, bias_]) + + exe = paddle.static.Executor(place) + exe.run(sp) + out_list = exe.run( + mp, + feed={ + input_.name: self.x, + scale_.name: self.scale, + bias_.name: self.bias, + }, + fetch_list=vars_list + [grads], + ) + paddle.disable_static() + core._set_prim_all_enabled(True) + + return out_list[:3], out_list[3:] + + def test_static_comp(self): + paddle.enable_static() + mps = [] + fwd_actual = [] + rev_actual = [] + if len(self.places) < 1: + return + + with paddle.fluid.framework._static_guard(): + for place in self.places: + 
fwd_actual.append([]) + rev_actual.append([]) + mp, sp = paddle.static.Program(), paddle.static.Program() + with paddle.static.program_guard(mp, sp): + input_ = paddle.static.data( + 'x', shape=self.x.shape, dtype=self.x.dtype + ) + input_.stop_gradient = False + + scale_ = paddle.static.data( + 'scale_', shape=self.scale.shape, dtype=self.scale.dtype + ) + scale_.stop_gradient = False + + bias_ = paddle.static.data( + 'bias_', shape=self.bias.shape, dtype=self.bias.dtype + ) + bias_.stop_gradient = False + + output = paddle.nn.functional.instance_norm( + input_, + None, + None, + scale_, + bias_, + True, + 0.9, + self.epsilon, + ) + + blocks = mp.blocks + names = dict( + zip( + blocks[0].ops[0].output_names, + blocks[0].ops[0].output_arg_names, + ) + ) + vars_list = [ + names[key] + for key in [ + "Y", + "SavedMean", + "SavedVariance", + ] + ] + + fwd_ops = [op.type for op in blocks[0].ops] + # Ensure that instance_norm in original block + assert 'instance_norm' in fwd_ops + + if core._is_fwd_prim_enabled(): + paddle.incubate.autograd.primapi.to_prim(mp.blocks) + fwd_ops_new = [op.type for op in blocks[0].ops] + # Ensure that instance_norm is splitted into small ops + assert 'instance_norm' not in fwd_ops_new + + grads = paddle.static.gradients( + output, [input_, scale_, bias_] + ) + exe = paddle.static.Executor(place) + exe.run(sp) + out_list = exe.run( + mp, + feed={ + input_.name: self.x, + scale_.name: self.scale, + bias_.name: self.bias, + }, + fetch_list=vars_list + [grads], + ) + fwd_actual[-1].append(out_list[0]) + fwd_actual[-1].append(out_list[1]) + fwd_actual[-1].append(out_list[2]) + rev_actual[-1].append(out_list[3]) + rev_actual[-1].append(out_list[4]) + rev_actual[-1].append(out_list[5]) + mps.append(mp) + + vars_name = [ + "Y", + "SavedMean", + "SavedVariance", + "X_grad", + "Scale_grad", + "Bias_grad", + ] + + for i in range(len(self.places)): + self.assertTrue( + 'instance_norm' not in [op.type for op in mps[i].block(0).ops] + ) + atol = self.threshold_list[i][0] + rtol = self.threshold_list[i][0] + for j in range(len(self.static_fwd_desire[i])): + # in float16 type, Y is float16, mean and var are float16 + # so check mean and var with float32 gpu threshold + if self.dtype == 'float16' and j > 0: + atol = 1e-5 + rtol = 1e-5 + + np.testing.assert_allclose( + self.static_fwd_desire[i][j], + fwd_actual[i][j], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j]}", + ) + max_abs_diff = np.max( + np.abs(self.static_fwd_desire[i][j] - fwd_actual[i][j]) + ) + print( + self.shape, + self.dtype, + self.places[i], + vars_name[j], + max_abs_diff, + ) + # compare with eager_desire + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i][0], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed with fwd_eager:{self.places[i]}", + ) + + for j in range(len(self.static_rev_desire[i])): + if self.special_threshold is not None and j <= 1: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + else: + atol = self.threshold_list[i][0] + rtol = self.threshold_list[i][0] + + max_abs_diff = np.max( + np.abs(self.static_rev_desire[i][j] - rev_actual[i][j]) + ) + + print( + self.shape, + self.dtype, + self.places[i], + vars_name[j + 3], + max_abs_diff, + ) + + np.testing.assert_allclose( + self.static_rev_desire[i][j], + rev_actual[i][j], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed of place:{self.places[i]}, output: {vars_name[j + 3]}", + ) + + # now use larger threshold when testing cpu grads to 
bypass cpu grad test + if self.special_threshold is not None and i == 0: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + # compare with eager_desire + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i][0], + rtol=rtol, + atol=atol, + err_msg=f"Check diff failed with rev_eager:{self.places[i]}", + ) + + paddle.disable_static() + + def test_jit_comp(self): + fwd_actual = [] + rev_actual = [] + for place in self.places: + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + bias_ = paddle.to_tensor( + data=self.bias, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + net = PrimGroupNorm(self.num_channels, scale_, bias_) + net = apply_to_static(net, False) + output = net(input_) + + grad = paddle.grad(output, input_) + fwd_actual.append(output.numpy()) + rev_actual.append(grad[0].numpy()) + + for i in range(len(self.places)): + atol = self.threshold_list[i][1] + rtol = self.threshold_list[i][1] + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i], + rtol=rtol, + atol=atol, + err_msg='%s jit fwd' % self.places[i], + ) + + # now use larger threshold when testing cpu grads to bypass cpu grad test + if self.special_threshold is not None: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i], + rtol=rtol, + atol=atol, + err_msg='%s jit rev' % self.places[i], + ) + + def test_jit_comp_with_cinn(self): + fwd_actual = [] + rev_actual = [] + for place in self.places: + input_ = paddle.to_tensor( + data=self.x, dtype=self.dtype, place=place, stop_gradient=False + ) + scale_ = paddle.to_tensor( + data=self.scale, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + bias_ = paddle.to_tensor( + data=self.bias, + dtype=self.dtype, + place=place, + stop_gradient=False, + ) + net = PrimGroupNorm(self.num_channels, scale_, bias_) + net = apply_to_static(net, False) + output = net(input_) + grad = paddle.grad(output, input_) + fwd_actual.append(output.numpy()) + rev_actual.append(grad[0].numpy()) + + for i in range(len(self.places)): + atol = self.threshold_list[i][2] + rtol = self.threshold_list[i][2] + np.testing.assert_allclose( + self.fwd_desire[i], + fwd_actual[i], + rtol=rtol, # mean of uniform distribution, scale for avoid random failed + atol=atol, + err_msg='%s jit_cinn fwd' % self.places[i], + ) + # now use larger threshold when testing cpu grads to bypass cpu grad test + if self.special_threshold is not None: + atol = self.special_threshold[i] + rtol = self.special_threshold[i] + np.testing.assert_allclose( + self.rev_desire[i], + rev_actual[i], + rtol=rtol, # mean of uniform distribution, scale for avoid random failed + atol=atol, + err_msg='%s jit_cinn rev' % self.places[i], + ) + + +class TestInstanceNormCase1(TestInstanceNormOp): + def init_test_case(self): + x_shape = [2, 100, 4, 5] + n, c, h, w = x_shape[0], x_shape[1], x_shape[2], x_shape[3] + self.epsilon = 1e-05 + dtype = np.float32 + scale_shape = [c] + mean_shape = [n * c] + np.random.seed() + self.x_np = np.random.random_sample(x_shape).astype(dtype) + self.scale_np = np.ones(scale_shape).astype(dtype) + self.bias_np = np.zeros(scale_shape).astype(dtype) + self.mean_np, self.var_np = _cal_mean_variance( + self.x_np, self.epsilon, mean_shape + ) + + class TestInstanceNormOpTraining(unittest.TestCase): def setUp(self): 
self.epsilon = 1e-5
@@ -112,6 +741,7 @@ def set_global_mean_var(self, mean_shape, x):
 
     def test_forward_backward(self):
         def test_with_place(place, shape):
+            paddle.enable_static()
             epsilon = self.epsilon
             n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             scale_shape = [c]
@@ -207,6 +837,7 @@ def test_with_place(place, shape):
             for id, name in enumerate(self.fetch_list):
                 self.__assert_close(var_dict[name], out[id], name)
             print("op test forward passes: ", str(place))
+            paddle.disable_static()
 
         places = [core.CPUPlace()]
 
@@ -234,6 +865,7 @@ def init_test_case(self):
 
 class TestInstanceNormOpError(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
        with program_guard(Program(), Program()):
             # the input of instance_norm must be Variable.
             x1 = fluid.create_lod_tensor(
@@ -246,14 +878,17 @@ def test_errors(self):
                 name='x2', shape=[-1, 3, 4, 5, 6], dtype="int32"
             )
             self.assertRaises(TypeError, paddle.static.nn.instance_norm, x2)
+        paddle.disable_static()
 
 
 class TestInstanceNormOpErrorCase1(unittest.TestCase):
     def test_errors(self):
+        paddle.enable_static()
         with program_guard(Program(), Program()):
             # the first dimension of input for instance_norm must between [2d, 5d]
             x = paddle.static.data(name='x', shape=[3], dtype="float32")
             self.assertRaises(ValueError, paddle.static.nn.instance_norm, x)
+        paddle.disable_static()
 
 
 class TestElasticNormOp(unittest.TestCase):
diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py
index 84b7d415638b2c..9d0229627728d7 100644
--- a/python/paddle/incubate/autograd/composite_rules.py
+++ b/python/paddle/incubate/autograd/composite_rules.py
@@ -178,6 +178,36 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis):
     return out, mean_, variance
 
 
+@REGISTER_COMPOSITE('instance_norm')
+def instancenorm_composite(x, scale, bias, epsilon):
+    """
+    define composite rule of op instance_norm
+    out = (x - mean(x)) / sqrt(var + epsilon)
+    var = mean((x-mean(x))^2)
+    """
+    n, c, h, w = x.shape
+    axis = tuple(range(2, len(x.shape)))
+    mean_ = mean(x, axis=axis, keepdim=True)
+    difference = x - mean_
+    var_tmp1 = difference * difference
+    variance = mean(var_tmp1, axis=axis, keepdim=True)
+    var_tmp3 = variance + epsilon
+    sqrt_var = pow(var_tmp3, full([], 0.5, dtype=var_tmp3.dtype))
+    out = difference / sqrt_var
+
+    if scale is not None:
+        scale_tile = reshape(scale, [1, c, 1, 1])
+        out = out * scale_tile
+    if bias is not None:
+        bias_tile = reshape(bias, [1, c, 1, 1])
+        out = out + bias_tile
+
+    mean_ = reshape(mean_, [-1])
+    saved_variance = 1 / sqrt_var
+    saved_variance = reshape(saved_variance, [-1])
+    return out, mean_, saved_variance
+
+
 @REGISTER_COMPOSITE('gelu')
 def gelu_composite(x, approximate):
     """define composite rule of op gelu"""

From a20605682ea1fab07c861123676719f9cc97527a Mon Sep 17 00:00:00 2001
From: Huihuang Zheng
Date: Wed, 12 Apr 2023 10:20:50 +0800
Subject: [PATCH 075/156] Modify LayerNorm Composite Rule (#52712)

* [Do NOT merge] Expr PR on Composite

* Expr PR on Composite

* Revert some composite experiment

* Remove unnecessary composite code

* Add rsqrt as sub primitives

---
 python/paddle/incubate/autograd/composite_rules.py | 4 ++--
 python/paddle/incubate/autograd/primitives.py      | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/paddle/incubate/autograd/composite_rules.py b/python/paddle/incubate/autograd/composite_rules.py
index 9d0229627728d7..ba92c5dba718db 100644
--- a/python/paddle/incubate/autograd/composite_rules.py
+++ 
b/python/paddle/incubate/autograd/composite_rules.py @@ -160,8 +160,8 @@ def layernorm_composite(x, scale, bias, epsilon, begin_norm_axis): var_tmp1 = difference * difference variance = mean(var_tmp1, axis=axis, keepdim=True) var_tmp3 = variance + epsilon - sqrt_var = sqrt(var_tmp3) - out = difference / sqrt_var + rsqrt_var = rsqrt(var_tmp3) + out = difference * rsqrt_var if scale is not None: scale = reshape(scale, x.shape[begin_norm_axis:]) diff --git a/python/paddle/incubate/autograd/primitives.py b/python/paddle/incubate/autograd/primitives.py index cc8ba89423d7c8..9f52d9d69ac233 100644 --- a/python/paddle/incubate/autograd/primitives.py +++ b/python/paddle/incubate/autograd/primitives.py @@ -50,6 +50,7 @@ from paddle.tensor import pow # noqa: F401 from paddle.tensor import prod # noqa: F401 from paddle.tensor import reshape # noqa: F401 +from paddle.tensor import rsqrt # noqa: F401 from paddle.tensor import sign # noqa: F401 from paddle.tensor import sin # noqa: F401 from paddle.tensor import sinh # noqa: F401 @@ -117,6 +118,7 @@ 'ones', 'zeros', 'sqrt', + 'rsqrt', ] others = [ From 523f8a266f8930213343fb3179f83b032874544b Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Wed, 12 Apr 2023 10:21:25 +0800 Subject: [PATCH 076/156] [AMP OP&Test] support bf16 for batch norm (#52407) * [AMP OP&Test] support bf16 for batchnorm * codestyle * Update batch_norm_grad_kernel.cu * Update batch_norm_kernel.cu * fix codestyle * fix * fix * fix * fix * fix * Update batch_norm_kernel.cc --- paddle/phi/kernels/batch_norm_kernel.cc | 18 ++++++ .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 36 +++++++++++- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 24 ++++++++ .../tests/unittests/test_batch_norm_op.py | 56 +++++++++++++++++-- 4 files changed, 127 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index eddd65184fe934..570ba8dae06cfc 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -66,6 +67,22 @@ PD_REGISTER_KERNEL(batch_norm_infer, float, double) {} #ifdef PADDLE_WITH_CUDA +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(batch_norm_infer, + GPU, + ALL_LAYOUT, + phi::BatchNormInferKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} +#else PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, @@ -79,6 +96,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } } #endif +#endif #ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(batch_norm_infer, GPU, diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index ede24587449026..db7f3c3224a03e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -1314,14 +1314,18 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, float, phi::dtype::float16) {} #else +#if CUDNN_VERSION_MIN(8, 1, 0) + PD_REGISTER_KERNEL(batch_norm_grad, GPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double, + phi::dtype::bfloat16, phi::dtype::float16) { - if (kernel_key.dtype() == phi::DataType::FLOAT16) { + if 
(kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad @@ -1334,6 +1338,22 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, phi::BatchNormGradRawKernel, float, double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#else +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + double, phi::dtype::float16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad @@ -1342,6 +1362,20 @@ PD_REGISTER_KERNEL(batch_norm_grad_raw, } } +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#endif #endif #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 63276e4d53024b..fb1bca3daba86d 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -1221,6 +1221,7 @@ PD_REGISTER_KERNEL(batch_norm, ALL_LAYOUT, phi::BatchNormKernel, float, + phi::dtype::bfloat16, phi::dtype::float16) { kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); @@ -1232,6 +1233,28 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } #else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} +#else PD_REGISTER_KERNEL(batch_norm, GPU, ALL_LAYOUT, @@ -1250,5 +1273,6 @@ PD_REGISTER_KERNEL(batch_norm, kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); } } +#endif #endif diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 86ffea08a22542..bbe322ae0175b7 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -16,7 +16,12 @@ import unittest import numpy as np -from eager_op_test import OpTest, _set_use_system_allocator +from 
eager_op_test import ( + OpTest, + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +) from op import Operator import paddle @@ -239,7 +244,10 @@ def check_with_place(self, place, data_layout, dtype, shape): raise ValueError("Unknown data layout.") scale_shape = [c] - x_val = np.random.random_sample(x_shape).astype(dtype) + if dtype == np.uint16: + x_val = np.random.random_sample(x_shape).astype(np.float32) + else: + x_val = np.random.random_sample(x_shape).astype(dtype) # generate some negative values to test case with relu fused x_val = x_val - 0.5 scale_val = np.random.random_sample(scale_shape).astype(np.float32) @@ -248,12 +256,20 @@ def check_with_place(self, place, data_layout, dtype, shape): mean = np.zeros(scale_shape).astype(np.float32) variance = np.ones(scale_shape).astype(np.float32) - y_out = _reference_testing( - x_val, scale_val, bias_val, mean, variance, epsilon, data_layout - ).astype(dtype) + if dtype == np.uint16: + y_out = _reference_testing( + x_val, scale_val, bias_val, mean, variance, epsilon, data_layout + ).astype(np.float32) + y_out = convert_float_to_uint16(y_out) + else: + y_out = _reference_testing( + x_val, scale_val, bias_val, mean, variance, epsilon, data_layout + ).astype(dtype) if self.fuse_with_relu: y_out = np.maximum(y_out, 0) + if dtype == np.uint16: + x_val = convert_float_to_uint16(x_val) scope = core.Scope() # create input @@ -324,6 +340,11 @@ def check_with_place(self, place, data_layout, dtype, shape): y_tensor._set_dims(dims) # check inference result + atol = 1e-3 + if dtype == np.uint16: + y_tensor = convert_uint16_to_float(y_tensor) + y_out = convert_uint16_to_float(y_out) + atol = 1e-2 self.__assert_close( y_tensor, y_out, @@ -335,7 +356,7 @@ def check_with_place(self, place, data_layout, dtype, shape): + str(np.dtype(dtype)) + str(np.array(y_tensor)) + str(y_out), - atol=1e-3, + atol=atol, ) def test_check_output(self): @@ -376,6 +397,29 @@ def test_check_output(self): self.check_with_place(place, data_format, self.dtype, [2, 3]) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestBF16BatchNormOpInference(TestBatchNormOpInference): + def setUp(self): + self.dtype = np.uint16 + self.use_mkldnn = False + self.fuse_with_relu = False + self.init_kernel_type() + + def test_check_output(self): + places = [core.CUDAPlace(0)] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: + self.check_with_place( + place, data_format, self.dtype, [2, 3, 4, 5] + ) + self.check_with_place(place, data_format, self.dtype, [2, 3]) + + class TestBatchNormOpTraining(unittest.TestCase): def setUp(self): self.use_mkldnn = False From 9a7c83bdefb7bad3d476ea4eb524074c0a890229 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 12 Apr 2023 10:22:23 +0800 Subject: [PATCH 077/156] [Move Test] xpu (#52661) * move python/paddle/fluid/tests/unittests/xpu to test/xpu * update CMakeLists.txt * remove xpu in fluid/tests/unittests/ * add path to op_test_xpu * fix incorrect path * update test script * fix test_adadelta_op_xpu error --- paddle/scripts/paddle_build.sh | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 4 -- .../fluid/tests/unittests/xpu/CMakeLists.txt | 36 ------------------ test/CMakeLists.txt | 4 +- test/xpu/CMakeLists.txt | 37 +++++++++++++++++++ .../xpu/collective_allgather_op_xpu.py | 0 .../xpu/collective_allreduce_op_xpu.py | 0 
.../xpu/collective_broadcast_op_xpu.py | 0 .../xpu/collective_concat_op.py | 0 .../xpu/collective_identity_op_xpu.py | 0 ...ctive_softmax_with_cross_entropy_op_xpu.py | 0 .../xpu/collective_split_op.py | 0 .../xpu/get_test_cover_info.py | 0 .../unittests => test/xpu}/op_test_xpu.py | 12 ++++-- ...allel_dygraph_dataparallel_with_pylayer.py | 0 .../xpu/parallel_dygraph_gradient_check.py | 0 ...el_dygraph_gradient_check_in_eager_mode.py | 0 .../xpu/process_group_bkcl.py | 0 .../xpu/test_accuracy_op_xpu.py | 7 +--- .../xpu/test_activation_op_xpu.py | 9 ++--- .../xpu/test_adadelta_op_xpu.py | 12 +++--- .../xpu/test_adagrad_op_xpu.py | 14 +++---- .../xpu/test_adam_op_xpu.py | 7 +--- .../xpu/test_adamw_op_xpu.py | 8 +--- .../xpu/test_affine_channel_op_xpu.py | 4 -- .../test_amp_check_finite_and_scale_op_xpu.py | 7 +--- .../xpu/test_arg_max_op_xpu.py | 8 +--- .../xpu/test_argsort_op_xpu.py | 8 +--- .../xpu/test_assign_op_xpu.py | 8 +--- .../xpu/test_assign_value_op_xpu.py | 7 +--- .../xpu/test_atan_op_xpu.py | 6 +-- .../xpu/test_batch_norm_op_xpu.py | 5 +-- .../xpu/test_bce_loss_op_xpu.py | 7 +--- .../xpu/test_bilinear_interp_op_xpu.py | 3 -- .../xpu/test_bilinear_interp_v2_op_xpu.py | 8 +--- .../xpu/test_bitwise_op_xpu.py | 9 ++--- .../unittests => test}/xpu/test_bmm_op_xpu.py | 8 +--- .../unittests => test}/xpu/test_c_concat.py | 14 +++---- .../xpu/test_c_embedding_op_xpu.py | 2 - .../unittests => test}/xpu/test_c_split.py | 14 +++---- .../xpu/test_cast_op_xpu.py | 7 +--- .../xpu/test_clip_by_norm_op_xpu.py | 7 +--- .../xpu/test_clip_op_xpu.py | 7 +--- .../xpu/test_coalesce_tensor_op_xpu.py | 7 +--- .../xpu/test_collective_allgather_xpu.py | 14 +++---- .../xpu/test_collective_allreduce_xpu.py | 14 +++---- .../xpu/test_collective_base_xpu.py | 0 .../xpu/test_collective_broadcast_xpu.py | 6 +-- .../xpu/test_collective_identity_xpu.py | 14 +++---- .../xpu/test_collective_process_group.py | 0 ...llective_softmax_with_cross_entropy_xpu.py | 14 +++---- .../xpu/test_compare_op_xpu.py | 7 +--- .../xpu/test_concat_op_xpu.py | 8 ++-- .../xpu/test_conv2d_op_xpu.py | 7 +--- .../xpu/test_conv2d_transpose_op_xpu.py | 7 +--- .../xpu/test_conv3d_op_xpu.py | 5 +-- .../xpu/test_cumprod_op_xpu.py | 8 +--- .../xpu/test_cumsum_op_xpu.py | 8 +--- .../xpu/test_deformable_conv_op_xpu.py | 7 +--- .../xpu/test_depthwise_conv2d_op_xpu.py | 7 +--- .../xpu/test_device_guard_xpu.py | 4 -- .../xpu/test_diag_v2_op_xpu.py | 7 +--- .../xpu/test_diagonal_op_xpu.py | 12 +++--- .../test_distribute_fpn_proposals_op_xpu.py | 5 +-- .../xpu/test_dropout_op_xpu.py | 5 +-- .../xpu/test_einsum_op_xpu.py | 11 ++---- .../xpu/test_elementwise_add_op_xpu.py | 11 +++--- .../xpu/test_elementwise_add_op_xpu_kp.py | 7 ++-- .../xpu/test_elementwise_div_op_xpu.py | 8 ++-- .../xpu/test_elementwise_floordiv_op_xpu.py | 8 ++-- .../xpu/test_elementwise_max_op_xpu.py | 8 ++-- .../xpu/test_elementwise_min_op_xpu.py | 8 ++-- .../xpu/test_elementwise_mod_op_xpu.py | 8 ++-- .../xpu/test_elementwise_mul_op_xpu.py | 8 ++-- .../xpu/test_elementwise_pow_op_xpu.py | 8 ++-- .../xpu/test_elementwise_sub_op_xpu.py | 11 +++--- .../xpu/test_empty_op_xpu.py | 8 +--- .../xpu/test_expand_as_v2_op_xpu.py | 7 +--- .../xpu/test_expand_v2_op_xpu.py | 7 +--- .../xpu/test_fill_any_like_op_xpu.py | 8 +--- .../xpu/test_fill_any_op_xpu.py | 7 +--- .../xpu/test_fill_constant_op_xpu.py | 8 ++-- .../xpu/test_fill_diagonal_tensor_op_xpu.py | 12 +++--- .../xpu/test_fill_op_xpu.py | 7 +--- .../xpu/test_flatten2_op_xpu.py | 6 +-- .../test_flatten_contiguous_range_op_xpu.py | 10 +---- 
.../xpu/test_flatten_op_xpu.py | 6 +-- .../xpu/test_fleet_exe_dist_model_run_xpu.py | 0 .../xpu/test_fused_attention_op_xpu.py | 11 ++---- .../xpu/test_fused_feedforward_op_xpu.py | 9 +---- .../test_fused_gemm_epilogue_grad_op_xpu.py | 8 +--- .../xpu/test_fused_gemm_epilogue_op_xpu.py | 7 +--- .../test_fused_resnet_basic_block_op_xpu.py | 6 +-- .../xpu/test_gather_nd_op_xpu.py | 8 +--- .../xpu/test_gather_op_xpu.py | 7 +--- .../xpu/test_gaussian_random_op_xpu.py | 7 +--- .../xpu/test_gen_bkcl_id_op.py | 3 -- .../xpu/test_generate_proposals_v2_op_xpu.py | 13 ++----- .../xpu/test_grid_sampler_op_xpu.py | 8 +--- .../xpu/test_group_norm_op_xpu.py | 9 ++--- .../xpu/test_huber_loss_op_xpu.py | 9 ++--- .../xpu/test_increment_op_xpu.py | 8 +--- .../xpu/test_index_sample_op_xpu.py | 8 +--- .../xpu/test_index_select_op_xpu.py | 12 ++---- .../xpu/test_instance_norm_op_xpu.py | 15 +++----- .../xpu/test_iou_similarity_op_xpu.py | 9 ++--- .../xpu/test_isfinite_op_xpu.py | 7 +--- .../xpu/test_kldiv_loss_op_xpu.py | 7 +--- .../xpu/test_label_smooth_op_xpu.py | 11 ++---- .../xpu/test_lamb_op_xpu.py | 7 +--- .../xpu/test_layer_norm_op_xpu.py | 14 +++---- .../xpu/test_linspace_op_xpu.py | 8 +--- .../xpu/test_log_loss_op_xpu.py | 4 +- .../xpu/test_log_softmax_op_xpu.py | 8 +--- .../xpu/test_logical_op_xpu.py | 9 ++--- .../xpu/test_logsumexp_op_xpu.py | 6 +-- .../xpu/test_lookup_table_v2_op_xpu.py | 8 +--- .../xpu/test_masked_select_op_xpu.py | 8 +--- .../xpu/test_matmul_op_xpu.py | 7 +--- .../xpu/test_matmul_v2_op_xpu.py | 7 +--- .../xpu/test_mean_op_xpu.py | 8 +--- .../xpu/test_merged_momentum_op_xpu.py | 7 +--- .../xpu/test_merged_momentum_op_xpu_base.py | 0 .../xpu/test_meshgrid_op_xpu.py | 7 +--- .../xpu/test_momentum_op_xpu.py | 8 +--- .../unittests => test}/xpu/test_mul_op_xpu.py | 7 +--- .../xpu/test_nearest_interp_op_xpu.py | 3 -- .../xpu/test_nearest_interp_v2_op_xpu.py | 8 +--- .../xpu/test_one_hot_op_xpu.py | 7 +--- .../xpu/test_one_hot_v2_op_xpu.py | 15 +++----- .../xpu/test_p_norm_op_xpu.py | 11 ++---- .../xpu/test_pad3d_op_xpu.py | 8 +--- .../xpu/test_parallel_dygraph_dataparallel.py | 0 .../xpu/test_pixel_shuffle_op_xpu.py | 8 +--- .../xpu/test_pool2d_op_xpu.py | 9 ++--- .../xpu/test_pool3d_op_xpu.py | 11 ++---- .../xpu/test_pool_max_op_xpu.py | 7 +--- ...st_pow2_decay_with_linear_warmup_op_xpu.py | 8 ++-- .../xpu/test_prelu_op_xpu.py | 8 +--- .../xpu/test_prior_box_op_xpu.py | 8 +--- .../xpu/test_prod_op_xpu.py | 4 +- .../xpu/test_randint_op_xpu.py | 8 +--- .../xpu/test_randperm_op_xpu.py | 15 +++----- .../unittests => test}/xpu/test_range_xpu.py | 11 ++---- .../xpu/test_recompute_op_xpu.py | 0 .../xpu/test_reduce_all_op_xpu.py | 8 +--- .../xpu/test_reduce_amax_op_xpu.py | 8 +--- .../xpu/test_reduce_amin_op_xpu.py | 8 +--- .../xpu/test_reduce_any_op_xpu.py | 8 +--- .../xpu/test_reduce_max_op_xpu.py | 8 +--- .../xpu/test_reduce_mean_op_xpu.py | 7 +--- .../xpu/test_reduce_min_op_xpu.py | 8 +--- .../xpu/test_reduce_prod_op_xpu.py | 8 +--- .../xpu/test_reduce_sum_op_xpu.py | 8 +--- .../xpu/test_refactor_op_xpu.py | 9 ++--- .../xpu/test_reshape2_op_xpu.py | 8 +--- .../xpu/test_rmsprop_op_xpu.py | 8 +--- .../unittests => test}/xpu/test_rnn_op_xpu.py | 12 +++--- .../xpu/test_roi_align_op_xpu.py | 7 +--- .../xpu/test_roll_op_xpu.py | 11 ++---- .../xpu/test_scale_op_xpu.py | 8 +--- .../xpu/test_scatter_nd_add_op_xpu.py | 8 +--- .../xpu/test_scatter_op_xpu.py | 8 +--- .../xpu/test_sequence_conv_op_xpu.py | 4 +- .../xpu/test_sequence_unpad_op_xpu.py | 8 +--- .../xpu/test_set_value_op_xpu.py | 4 +- 
.../unittests => test}/xpu/test_sgd_op_xpu.py | 7 +--- .../xpu/test_shape_op_xpu.py | 7 +--- ...igmoid_cross_entropy_with_logits_op_xpu.py | 9 ++--- .../xpu/test_sign_op_xpu.py | 8 +--- .../xpu/test_slice_op_xpu.py | 11 ++---- .../xpu/test_softmax_op_xpu.py | 11 ++---- .../test_softmax_with_cross_entropy_op_xpu.py | 10 ++--- .../xpu/test_split_op_xpu.py | 7 +--- .../xpu/test_squeeze2_op_xpu.py | 7 +--- .../xpu/test_squeeze_op_xpu.py | 7 +--- .../xpu/test_stack_op_xpu.py | 8 ++-- .../xpu/test_strided_slice_op_xpu.py | 11 ++---- .../unittests => test}/xpu/test_sum_op_xpu.py | 7 +--- .../xpu/test_temporal_shift_op_xpu.py | 8 +--- .../xpu/test_tile_op_xpu.py | 7 +--- .../xpu/test_top_k_op_xpu.py | 7 +--- .../xpu/test_top_k_v2_op_xpu.py | 7 +--- .../xpu/test_transpose_op_xpu.py | 7 +--- .../xpu/test_tril_triu_op_xpu.py | 8 +--- .../test_truncated_gaussian_random_op_xpu.py | 7 +--- .../xpu/test_unbind_op_xpu.py | 6 +-- .../xpu/test_unfold_op_xpu.py | 13 +++---- .../xpu/test_uniform_random_op_xpu.py | 4 +- .../xpu/test_unsqueeze2_op_xpu.py | 7 +--- .../xpu/test_unsqueeze_op_xpu.py | 7 +--- .../xpu/test_unstack_op_xpu.py | 7 +--- .../xpu/test_update_loss_scaling_op_xpu.py | 6 +-- .../xpu/test_warpctc_op_xpu.py | 8 ++-- .../xpu/test_where_index_xpu.py | 8 +--- .../xpu/test_where_op_xpu.py | 8 +--- .../xpu/test_while_op_xpu.py | 0 .../unittests => test}/xpu/test_xpu_place.py | 0 .../xpu/test_xpu_stream_event.py | 0 .../xpu/test_zero_dim_tensor_xpu.py | 0 tools/get_pr_ut.py | 2 +- 201 files changed, 503 insertions(+), 993 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_allgather_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_allreduce_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_broadcast_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_concat_op.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_identity_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_softmax_with_cross_entropy_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/collective_split_op.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/get_test_cover_info.py (100%) rename {python/paddle/fluid/tests/unittests => test/xpu}/op_test_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/parallel_dygraph_dataparallel_with_pylayer.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/parallel_dygraph_gradient_check.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/parallel_dygraph_gradient_check_in_eager_mode.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/process_group_bkcl.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_accuracy_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_activation_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_adadelta_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_adagrad_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_adam_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_adamw_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_affine_channel_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => 
test}/xpu/test_amp_check_finite_and_scale_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_arg_max_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_argsort_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_assign_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_assign_value_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_atan_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_batch_norm_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_bce_loss_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_bilinear_interp_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_bilinear_interp_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_bitwise_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_bmm_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_c_concat.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_c_embedding_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_c_split.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_cast_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_clip_by_norm_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_clip_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_coalesce_tensor_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_allgather_xpu.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_allreduce_xpu.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_base_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_broadcast_xpu.py (92%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_identity_xpu.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_process_group.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_collective_softmax_with_cross_entropy_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_compare_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_concat_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_conv2d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_conv2d_transpose_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_conv3d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_cumprod_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_cumsum_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_deformable_conv_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_depthwise_conv2d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_device_guard_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_diag_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_diagonal_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_distribute_fpn_proposals_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_dropout_op_xpu.py (99%) 
rename {python/paddle/fluid/tests/unittests => test}/xpu/test_einsum_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_add_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_add_op_xpu_kp.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_div_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_floordiv_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_max_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_min_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_mod_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_mul_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_pow_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_elementwise_sub_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_empty_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_expand_as_v2_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_expand_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fill_any_like_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fill_any_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fill_constant_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fill_diagonal_tensor_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fill_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_flatten2_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_flatten_contiguous_range_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_flatten_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fleet_exe_dist_model_run_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fused_attention_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fused_feedforward_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fused_gemm_epilogue_grad_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fused_gemm_epilogue_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_fused_resnet_basic_block_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_gather_nd_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_gather_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_gaussian_random_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_gen_bkcl_id_op.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_generate_proposals_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_grid_sampler_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_group_norm_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_huber_loss_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_increment_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_index_sample_op_xpu.py (98%) rename 
{python/paddle/fluid/tests/unittests => test}/xpu/test_index_select_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_instance_norm_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_iou_similarity_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_isfinite_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_kldiv_loss_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_label_smooth_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_lamb_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_layer_norm_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_linspace_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_log_loss_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_log_softmax_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_logical_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_logsumexp_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_lookup_table_v2_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_masked_select_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_matmul_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_matmul_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_mean_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_merged_momentum_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_merged_momentum_op_xpu_base.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_meshgrid_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_momentum_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_mul_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_nearest_interp_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_nearest_interp_v2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_one_hot_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_one_hot_v2_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_p_norm_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pad3d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_parallel_dygraph_dataparallel.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pixel_shuffle_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pool2d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pool3d_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pool_max_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_prelu_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_prior_box_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_prod_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_randint_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_randperm_op_xpu.py 
(98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_range_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_recompute_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_all_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_amax_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_amin_op_xpu.py (96%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_any_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_max_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_mean_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_min_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_prod_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reduce_sum_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_refactor_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_reshape2_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_rmsprop_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_rnn_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_roi_align_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_roll_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_scale_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_scatter_nd_add_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_scatter_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sequence_conv_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sequence_unpad_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_set_value_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sgd_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_shape_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sign_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_slice_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_softmax_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_softmax_with_cross_entropy_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_split_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_squeeze2_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_squeeze_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_stack_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_strided_slice_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_sum_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_temporal_shift_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_tile_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_top_k_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_top_k_v2_op_xpu.py (99%) rename 
{python/paddle/fluid/tests/unittests => test}/xpu/test_transpose_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_tril_triu_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_truncated_gaussian_random_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_unbind_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_unfold_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_uniform_random_op_xpu.py (95%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_unsqueeze2_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_unsqueeze_op_xpu.py (97%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_unstack_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_update_loss_scaling_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_warpctc_op_xpu.py (99%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_where_index_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_where_op_xpu.py (98%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_while_op_xpu.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_xpu_place.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_xpu_stream_event.py (100%) rename {python/paddle/fluid/tests/unittests => test}/xpu/test_zero_dim_tensor_xpu.py (100%) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2db59b7b61ce7e..4693d78e2dc327 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2237,7 +2237,7 @@ set +x set -x ut_endTime_s=`date +%s` echo "XPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - python ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py + python ${PADDLE_ROOT}/build/test/xpu/get_test_cover_info.py unset XPU_OP_LIST_DIR if [[ "$EXIT_CODE" != "0" ]]; then exit 8; diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 63279cffc3e51b..909b658c0983ca 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -772,10 +772,6 @@ add_subdirectory(sequence) add_subdirectory(rnn) add_subdirectory(distribution) -if(WITH_XPU) - add_subdirectory(xpu) -endif() - # dist xpu tests: if(WITH_XPU_BKCL) py_test(test_collective_allreduce_api_xpu diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt deleted file mode 100644 index cc46e42f8ca644..00000000000000 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -file( - GLOB TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -if(WITH_XPU_BKCL) - list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op") -endif() - -file( - GLOB DIST_TEST_OPS - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "test_dist_*.py") -if(WITH_XPU_BKCL) - list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op) -endif() - -foreach(TEST_OP ${TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach() - -foreach(TEST_OP ${DIST_TEST_OPS}) - py_test_modules(${TEST_OP} MODULES ${TEST_OP}) -endforeach() - -set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) -set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) 
-set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900)
-set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300)
-set_tests_properties(test_collective_identity_xpu
-                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
-set_tests_properties(test_collective_allgather_xpu
-                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
-set_tests_properties(test_collective_allreduce_xpu
-                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4d50fe16b9b051..8bbd59a7176fff 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -128,7 +128,9 @@ if(WITH_TESTING)
   add_subdirectory(standalone_executor)
   add_subdirectory(tokenizer)
   # add_subdirectory(white_list)
-  add_subdirectory(xpu)
+  if(WITH_XPU)
+    add_subdirectory(xpu)
+  endif()
 endif()
 
 get_property(test_srcs GLOBAL PROPERTY TEST_SRCS)
diff --git a/test/xpu/CMakeLists.txt b/test/xpu/CMakeLists.txt
index e0543ef9e50f58..4ecde12f008af6 100644
--- a/test/xpu/CMakeLists.txt
+++ b/test/xpu/CMakeLists.txt
@@ -1,3 +1,40 @@
 if(WITH_XPU)
   add_subdirectory(cpp)
 endif()
+
+file(
+  GLOB TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+if(WITH_XPU_BKCL)
+  list(REMOVE_ITEM TEST_OPS "test_gen_bkcl_id_op")
+endif()
+
+file(
+  GLOB DIST_TEST_OPS
+  RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "test_dist_*.py")
+if(WITH_XPU_BKCL)
+  list(APPEND DIST_TEST_OPS test_gen_bkcl_id_op)
+endif()
+
+foreach(TEST_OP ${TEST_OPS})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach()
+
+foreach(TEST_OP ${DIST_TEST_OPS})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+endforeach()
+
+set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120)
+set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120)
+set_tests_properties(test_matmul_v2_op_xpu PROPERTIES TIMEOUT 900)
+set_tests_properties(test_matmul_op_xpu PROPERTIES TIMEOUT 300)
+set_tests_properties(test_collective_identity_xpu
+                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
+set_tests_properties(test_collective_allgather_xpu
+                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
+set_tests_properties(test_collective_allreduce_xpu
+                     PROPERTIES LABELS "RUN_TYPE=DIST_KUNLUN")
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_allgather_op_xpu.py b/test/xpu/collective_allgather_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_allgather_op_xpu.py
rename to test/xpu/collective_allgather_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_allreduce_op_xpu.py b/test/xpu/collective_allreduce_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_allreduce_op_xpu.py
rename to test/xpu/collective_allreduce_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_broadcast_op_xpu.py b/test/xpu/collective_broadcast_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_broadcast_op_xpu.py
rename to test/xpu/collective_broadcast_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_concat_op.py b/test/xpu/collective_concat_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_concat_op.py
rename to test/xpu/collective_concat_op.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_identity_op_xpu.py b/test/xpu/collective_identity_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_identity_op_xpu.py
rename to test/xpu/collective_identity_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_softmax_with_cross_entropy_op_xpu.py b/test/xpu/collective_softmax_with_cross_entropy_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_softmax_with_cross_entropy_op_xpu.py
rename to test/xpu/collective_softmax_with_cross_entropy_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/collective_split_op.py b/test/xpu/collective_split_op.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/collective_split_op.py
rename to test/xpu/collective_split_op.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/test/xpu/get_test_cover_info.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
rename to test/xpu/get_test_cover_info.py
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/test/xpu/op_test_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/op_test_xpu.py
rename to test/xpu/op_test_xpu.py
index af92704a57216b..02e68b3c3ed936 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/test/xpu/op_test_xpu.py
@@ -12,15 +12,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
+
 import numpy as np
+
+sys.path.append('..')
+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 from eager_op_test import OpTest
-from testsuite import append_loss_ops, create_op, set_input
-from white_list import no_grad_set_white_list, op_threshold_white_list
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     get_xpu_op_support_types,
     is_empty_grad_op_type,
     type_dict_str_to_numpy,
 )
+from testsuite import append_loss_ops, create_op, set_input
+from white_list import no_grad_set_white_list, op_threshold_white_list
 
 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_dataparallel_with_pylayer.py b/test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_dataparallel_with_pylayer.py
rename to test/xpu/parallel_dygraph_dataparallel_with_pylayer.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check.py b/test/xpu/parallel_dygraph_gradient_check.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check.py
rename to test/xpu/parallel_dygraph_gradient_check.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py b/test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
rename to test/xpu/parallel_dygraph_gradient_check_in_eager_mode.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py b/test/xpu/process_group_bkcl.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/process_group_bkcl.py
rename to test/xpu/process_group_bkcl.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/test/xpu/test_accuracy_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py
rename to test/xpu/test_accuracy_op_xpu.py
index 082e883ded741d..a87f6c084351ce 100755
---
a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ b/test/xpu/test_accuracy_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/test/xpu/test_activation_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py rename to test/xpu/test_activation_op_xpu.py index b071db95b40bc9..a877b09bbc9572 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/test/xpu/test_activation_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py b/test/xpu/test_adadelta_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py rename to test/xpu/test_adadelta_op_xpu.py index 71b691a6f27438..b6ef0fbdf8ec8f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adadelta_op_xpu.py +++ b/test/xpu/test_adadelta_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid @@ -52,11 +48,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } self.attrs = {'rho': rho, 'epsilon': epsilon} @@ -107,11 +105,13 @@ def setUp(self): rho = 0.95 epsilon = 1e-6 + learning_rate = 1.0 self.inputs = { 'Param': param, 'Grad': grad, 'AvgSquaredGrad': avg_squared_grad, 'AvgSquaredUpdate': avg_squared_update, + 'LearningRate': np.array([learning_rate]).astype("float32"), } avg_squared_grad_out = rho * avg_squared_grad + ( diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py b/test/xpu/test_adagrad_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py rename to test/xpu/test_adagrad_op_xpu.py index 942ffd26a4c091..34040ebd3f77b5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adagrad_op_xpu.py +++ b/test/xpu/test_adagrad_op_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -import numpy as np - -import paddle - -sys.path.append("..") import unittest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +import numpy as np +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest + +import paddle paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py b/test/xpu/test_adam_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py rename to test/xpu/test_adam_op_xpu.py index 6d44d355e4cdc9..990136c57170e1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adam_op_xpu.py +++ b/test/xpu/test_adam_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle.fluid import core diff --git a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py b/test/xpu/test_adamw_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py rename to test/xpu/test_adamw_op_xpu.py index d30fdbed09db87..768cbe8151da3e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_adamw_op_xpu.py +++ b/test/xpu/test_adamw_op_xpu.py @@ -12,20 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") - import unittest from functools import partial import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py b/test/xpu/test_affine_channel_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py rename to test/xpu/test_affine_channel_op_xpu.py index 6f85dc47488ab9..c200235ff879c5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_affine_channel_op_xpu.py +++ b/test/xpu/test_affine_channel_op_xpu.py @@ -15,10 +15,6 @@ Unit testing for affine_channel_op """ -import sys - -sys.path.append("..") - import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py b/test/xpu/test_amp_check_finite_and_scale_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py rename to test/xpu/test_amp_check_finite_and_scale_op_xpu.py index e171625dd4367d..6abcf53707a33a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_amp_check_finite_and_scale_op_xpu.py +++ b/test/xpu/test_amp_check_finite_and_scale_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py b/test/xpu/test_arg_max_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py rename to test/xpu/test_arg_max_op_xpu.py index d9a69216351a4a..4a8e0dc28fad18 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_arg_max_op_xpu.py +++ b/test/xpu/test_arg_max_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py b/test/xpu/test_argsort_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py rename to test/xpu/test_argsort_op_xpu.py index 39f554f9ac1767..f3a8a69ee5ded0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_argsort_op_xpu.py +++ b/test/xpu/test_argsort_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/test/xpu/test_assign_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py rename to test/xpu/test_assign_op_xpu.py index 97460b54aa310d..d3102dd448a49e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/test/xpu/test_assign_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py b/test/xpu/test_assign_value_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py rename to test/xpu/test_assign_value_op_xpu.py index d98e6375da52d2..a0e3a57dc8ac5b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_value_op_xpu.py +++ b/test/xpu/test_assign_value_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py b/test/xpu/test_atan_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py rename to test/xpu/test_atan_op_xpu.py index bb02e1320da15c..4ab5b14e9b44ed 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_atan_op_xpu.py +++ b/test/xpu/test_atan_op_xpu.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np @@ -21,14 +20,13 @@ paddle.enable_static() -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest class XPUTestAtanOp(XPUOpTestWrapper): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/test/xpu/test_batch_norm_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py rename to test/xpu/test_batch_norm_op_xpu.py index 446d49717af819..6cf666c8094c90 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/test/xpu/test_batch_norm_op_xpu.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py b/test/xpu/test_bce_loss_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py rename to test/xpu/test_bce_loss_op_xpu.py index 883063969ff6ae..acc3bd06e61033 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_bce_loss_op_xpu.py +++ b/test/xpu/test_bce_loss_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py b/test/xpu/test_bilinear_interp_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py rename to test/xpu/test_bilinear_interp_op_xpu.py index dc8e996e093827..a5a849f080e6a5 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_op_xpu.py +++ b/test/xpu/test_bilinear_interp_op_xpu.py @@ -12,13 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest import paddle -sys.path.append("..") - paddle.enable_static() ''' def bilinear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py b/test/xpu/test_bilinear_interp_v2_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py rename to test/xpu/test_bilinear_interp_v2_op_xpu.py index ebd48f55d57f12..dd0a6049221fd4 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_bilinear_interp_v2_op_xpu.py +++ b/test/xpu/test_bilinear_interp_v2_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py b/test/xpu/test_bitwise_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py rename to test/xpu/test_bitwise_op_xpu.py index 8fcf5a7af7811a..1d21108bf8cd5f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_bitwise_op_xpu.py +++ b/test/xpu/test_bitwise_op_xpu.py @@ -15,17 +15,16 @@ import sys import unittest -import numpy as np - -sys.path.append("..") +sys.path.append('../../python/paddle/fluid/tests/unittests') +import numpy as np from eager_op_test import OpTest -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py b/test/xpu/test_bmm_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py rename to test/xpu/test_bmm_op_xpu.py index d0d43dd94b0aa8..48bd4ea692cf82 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_bmm_op_xpu.py +++ b/test/xpu/test_bmm_op_xpu.py @@ -10,19 +10,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") - import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_concat.py b/test/xpu/test_c_concat.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_c_concat.py rename to test/xpu/test_c_concat.py index 313ae27a5b617e..d2490aa3772dcd 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_c_concat.py +++ b/test/xpu/test_c_concat.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py b/test/xpu/test_c_embedding_op_xpu.py similarity index 96% rename from python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py rename to test/xpu/test_c_embedding_op_xpu.py index b685458a3eed6f..4d0989c322e545 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_c_embedding_op_xpu.py +++ b/test/xpu/test_c_embedding_op_xpu.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") import paddle from paddle.fluid.tests.unittests.c_embedding_op_base import ( TestCEmbeddingCPU, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_c_split.py b/test/xpu/test_c_split.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_c_split.py rename to test/xpu/test_c_split.py index c5b0f236935aff..67e2f1a6cc5f62 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_c_split.py +++ b/test/xpu/test_c_split.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/test/xpu/test_cast_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py rename to test/xpu/test_cast_op_xpu.py index e013432d13b976..baf814e08de8a1 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py +++ b/test/xpu/test_cast_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py b/test/xpu/test_clip_by_norm_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py rename to test/xpu/test_clip_by_norm_op_xpu.py index 206f65c10afcd4..4bec31b80d85cb 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_by_norm_op_xpu.py +++ b/test/xpu/test_clip_by_norm_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py b/test/xpu/test_clip_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py rename to test/xpu/test_clip_op_xpu.py index 994153a8dd7253..79d4e3e7798694 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py +++ b/test/xpu/test_clip_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py b/test/xpu/test_coalesce_tensor_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py rename to test/xpu/test_coalesce_tensor_op_xpu.py index 2324d09857dcf2..f0f053137949f0 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_coalesce_tensor_op_xpu.py +++ b/test/xpu/test_coalesce_tensor_op_xpu.py @@ -12,22 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np from paddle.fluid import core -sys.path.append("..") - alignment = 256 -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py b/test/xpu/test_collective_allgather_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py rename to test/xpu/test_collective_allgather_xpu.py index be1326d176456e..3651ed2062957e 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_allgather_xpu.py +++ b/test/xpu/test_collective_allgather_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py b/test/xpu/test_collective_allreduce_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py rename to test/xpu/test_collective_allreduce_xpu.py index 187494f50154e7..05539aeaae4328 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_allreduce_xpu.py +++ b/test/xpu/test_collective_allreduce_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -from test_collective_base_xpu import TestDistBase - -import paddle -from paddle.fluid import core - -sys.path.append("..") - -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from test_collective_base_xpu import TestDistBase + +import paddle +from paddle.fluid import core paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_base_xpu.py b/test/xpu/test_collective_base_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_base_xpu.py rename to test/xpu/test_collective_base_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py b/test/xpu/test_collective_broadcast_xpu.py similarity index 92% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py rename to test/xpu/test_collective_broadcast_xpu.py index e015d0f92b1147..5ddb451e7e4fa9 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_broadcast_xpu.py +++ b/test/xpu/test_collective_broadcast_xpu.py @@ -12,18 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest +from get_test_cover_info import XPUOpTestWrapper, create_test_class from test_collective_base_xpu import TestDistBase import paddle from paddle.fluid import core -sys.path.append("..") - -from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class - paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py b/test/xpu/test_collective_identity_xpu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py rename to test/xpu/test_collective_identity_xpu.py index 3b5a2fa767a973..421f9168a28d3f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_collective_identity_xpu.py +++ b/test/xpu/test_collective_identity_xpu.py @@ -12,21 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import sys
 import unittest

-from test_collective_base_xpu import TestDistBase
-
-import paddle
-from paddle.fluid import core
-
-sys.path.append("..")
-
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from test_collective_base_xpu import TestDistBase
+
+import paddle
+from paddle.fluid import core

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_process_group.py b/test/xpu/test_collective_process_group.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_collective_process_group.py
rename to test/xpu/test_collective_process_group.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py b/test/xpu/test_collective_softmax_with_cross_entropy_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py
rename to test/xpu/test_collective_softmax_with_cross_entropy_xpu.py
index 703194eb58d5e6..0bc75c7a4930b5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_collective_softmax_with_cross_entropy_xpu.py
+++ b/test/xpu/test_collective_softmax_with_cross_entropy_xpu.py
@@ -13,22 +13,18 @@
 # limitations under the License.

 import os
-import sys
 import unittest

 import numpy as np
-from test_collective_base_xpu import DataTypeCast, TestDistBase
-
-import paddle
-from paddle.framework import core
-
-sys.path.append("..")
-
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from test_collective_base_xpu import DataTypeCast, TestDistBase
+
+import paddle
+from paddle.framework import core

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py b/test/xpu/test_compare_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
rename to test/xpu/test_compare_op_xpu.py
index e16b9032f2ea48..4793122a81753e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_compare_op_xpu.py
+++ b/test/xpu/test_compare_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py b/test/xpu/test_concat_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py
rename to test/xpu/test_concat_op_xpu.py
index 5867858a97b4d9..4f722ef6d9853e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_concat_op_xpu.py
+++ b/test/xpu/test_concat_op_xpu.py
@@ -13,18 +13,18 @@
 # limitations under the License.
 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/test/xpu/test_conv2d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
rename to test/xpu/test_conv2d_op_xpu.py
index a3eb2a1f3a77e0..d09402f934c699 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py
+++ b/test/xpu/test_conv2d_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py b/test/xpu/test_conv2d_transpose_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py
rename to test/xpu/test_conv2d_transpose_op_xpu.py
index a5be198089e861..7bf01d23fb56f5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_transpose_op_xpu.py
+++ b/test/xpu/test_conv2d_transpose_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py b/test/xpu/test_conv3d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py
rename to test/xpu/test_conv3d_op_xpu.py
index f6578371b97adf..f9904148f9b38f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_conv3d_op_xpu.py
+++ b/test/xpu/test_conv3d_op_xpu.py
@@ -12,14 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
+from get_test_cover_info import XPUOpTestWrapper, create_test_class
 from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py b/test/xpu/test_cumprod_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py
rename to test/xpu/test_cumprod_op_xpu.py
index 3ea12d2bf9f416..fb3763ac5e8f7c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_cumprod_op_xpu.py
+++ b/test/xpu/test_cumprod_op_xpu.py
@@ -13,19 +13,15 @@
 # limitations under the License.
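A second variant, first visible in test_concat_op_xpu.py above, covers suites that still import helpers left behind in the old tree (eager_op_test and shared test bases): a single sys.path hook now points from test/xpu/ back to the legacy unittests directory. A sketch of that preamble, assuming the relative path resolves from the new test location at run time:

    import sys
    import unittest

    # One hook back to the legacy directory for helpers that did not move.
    sys.path.append('../../python/paddle/fluid/tests/unittests')

    from eager_op_test import skip_check_grad_ci
    from get_test_cover_info import (
        XPUOpTestWrapper,
        create_test_class,
        get_xpu_op_support_types,
    )
    from op_test_xpu import XPUOpTest

    import paddle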
 import random
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py b/test/xpu/test_cumsum_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py
rename to test/xpu/test_cumsum_op_xpu.py
index 8ba052171fc2a7..2e3555b7025762 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_cumsum_op_xpu.py
+++ b/test/xpu/test_cumsum_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py b/test/xpu/test_deformable_conv_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py
rename to test/xpu/test_deformable_conv_op_xpu.py
index 84afb9cbd03ea1..8577cb24977048 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_deformable_conv_op_xpu.py
+++ b/test/xpu/test_deformable_conv_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import OpTest, XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import OpTest, XPUOpTest

 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py b/test/xpu/test_depthwise_conv2d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py
rename to test/xpu/test_depthwise_conv2d_op_xpu.py
index 7ccf79170ddf2e..a0b01c921280f0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_depthwise_conv2d_op_xpu.py
+++ b/test/xpu/test_depthwise_conv2d_op_xpu.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
@@ -22,12 +19,12 @@
 import paddle

 paddle.enable_static()
-from test_conv2d_op_xpu import XPUTestConv2DOp, XPUTestConv2DOp_v2
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from test_conv2d_op_xpu import XPUTestConv2DOp, XPUTestConv2DOp_v2


 class XPUTestDepthwiseConv2DOp(XPUOpTestWrapper):
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py b/test/xpu/test_device_guard_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py
rename to test/xpu/test_device_guard_xpu.py
index 01581c9ac61cc2..cc9fb142279ac1 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_device_guard_xpu.py
+++ b/test/xpu/test_device_guard_xpu.py
@@ -12,11 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest
-
-sys.path.append("..")
-
 import warnings

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py b/test/xpu/test_diag_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py
rename to test/xpu/test_diag_v2_op_xpu.py
index 0a2eac8720ef1c..51f42d00507fe6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_diag_v2_op_xpu.py
+++ b/test/xpu/test_diag_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py b/test/xpu/test_diagonal_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py
rename to test/xpu/test_diagonal_op_xpu.py
index 001cd727a081c6..bbf289ce4c9fd0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_diagonal_op_xpu.py
+++ b/test/xpu/test_diagonal_op_xpu.py
@@ -15,18 +15,18 @@
 import sys
 import unittest

-import numpy as np
-
-import paddle
+sys.path.append('../../python/paddle/fluid/tests/unittests')

-sys.path.append("..")
+import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py b/test/xpu/test_distribute_fpn_proposals_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py
rename to test/xpu/test_distribute_fpn_proposals_op_xpu.py
index c11899d4c7ed7e..230b9647f6ef14 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_distribute_fpn_proposals_op_xpu.py
+++ b/test/xpu/test_distribute_fpn_proposals_op_xpu.py
@@ -11,9 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
@@ -23,7 +20,7 @@

 paddle.enable_static()

-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py b/test/xpu/test_dropout_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
rename to test/xpu/test_dropout_op_xpu.py
index 562b968b5f698a..1a3459736c2682 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_dropout_op_xpu.py
+++ b/test/xpu/test_dropout_op_xpu.py
@@ -12,9 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
@@ -26,7 +23,7 @@

 paddle.enable_static()

-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py b/test/xpu/test_einsum_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py
rename to test/xpu/test_einsum_op_xpu.py
index cb73f85671a839..57a82009834fa7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_einsum_op_xpu.py
+++ b/test/xpu/test_einsum_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py b/test/xpu/test_elementwise_add_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
rename to test/xpu/test_elementwise_add_op_xpu.py
index 06db6b54a6740c..8d894a7b8828c4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu.py
+++ b/test/xpu/test_elementwise_add_op_xpu.py
@@ -13,19 +13,18 @@
 # limitations under the License.

 import sys
-
-import numpy as np
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
+import numpy as np
 from eager_op_test import OpTest, skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py b/test/xpu/test_elementwise_add_op_xpu_kp.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py
rename to test/xpu/test_elementwise_add_op_xpu_kp.py
index d9ef90fb2363f6..267ba2ec2b9596 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op_xpu_kp.py
+++ b/test/xpu/test_elementwise_add_op_xpu_kp.py
@@ -13,12 +13,11 @@
 # limitations under the License.
 import sys
-
-import numpy as np
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
+import numpy as np
 from eager_op_test import OpTest, skip_check_grad_ci
 from op_test_xpu import XPUOpTest
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py b/test/xpu/test_elementwise_div_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py
rename to test/xpu/test_elementwise_div_op_xpu.py
index fc1bf1d834aeb1..ca7693d0ab8e74 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_div_op_xpu.py
+++ b/test/xpu/test_elementwise_div_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py b/test/xpu/test_elementwise_floordiv_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py
rename to test/xpu/test_elementwise_floordiv_op_xpu.py
index 83c476a213ac0d..3aa7a7f2c138a2 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_floordiv_op_xpu.py
+++ b/test/xpu/test_elementwise_floordiv_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py b/test/xpu/test_elementwise_max_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
rename to test/xpu/test_elementwise_max_op_xpu.py
index 66982e9a2c5e53..d9e96ec1fcb2c6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_max_op_xpu.py
+++ b/test/xpu/test_elementwise_max_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py b/test/xpu/test_elementwise_min_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py
rename to test/xpu/test_elementwise_min_op_xpu.py
index c79cc9b8e130c1..34223b52780f7e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_min_op_xpu.py
+++ b/test/xpu/test_elementwise_min_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py b/test/xpu/test_elementwise_mod_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py
rename to test/xpu/test_elementwise_mod_op_xpu.py
index c00ea8db5c859f..f909a12cc5e219 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mod_op_xpu.py
+++ b/test/xpu/test_elementwise_mod_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py b/test/xpu/test_elementwise_mul_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
rename to test/xpu/test_elementwise_mul_op_xpu.py
index 6dea1d6b991993..a6c1319b5f19dd 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_mul_op_xpu.py
+++ b/test/xpu/test_elementwise_mul_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest, skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py b/test/xpu/test_elementwise_pow_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py
rename to test/xpu/test_elementwise_pow_op_xpu.py
index 431ca838c1ab79..5864bfa00c793b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_pow_op_xpu.py
+++ b/test/xpu/test_elementwise_pow_op_xpu.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest, skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py b/test/xpu/test_elementwise_sub_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py
rename to test/xpu/test_elementwise_sub_op_xpu.py
index 5b731ef32bb0d0..aeddf4641d726e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_sub_op_xpu.py
+++ b/test/xpu/test_elementwise_sub_op_xpu.py
@@ -13,19 +13,18 @@
 # limitations under the License.

 import sys
-
-import numpy as np
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
+import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py b/test/xpu/test_empty_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py
rename to test/xpu/test_empty_op_xpu.py
index 87241881275222..71c25f335b1ba4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_empty_op_xpu.py
+++ b/test/xpu/test_empty_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid.framework import convert_np_dtype_to_dtype_
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py b/test/xpu/test_expand_as_v2_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
rename to test/xpu/test_expand_as_v2_op_xpu.py
index ac5e06c2682c86..586761c9aeac42 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_as_v2_op_xpu.py
+++ b/test/xpu/test_expand_as_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py b/test/xpu/test_expand_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
rename to test/xpu/test_expand_v2_op_xpu.py
index f7098282a62a38..9d869d14b32e2b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_expand_v2_op_xpu.py
+++ b/test/xpu/test_expand_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py b/test/xpu/test_fill_any_like_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py
rename to test/xpu/test_fill_any_like_op_xpu.py
index af8f9518b5483d..079a86b07c44a9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_like_op_xpu.py
+++ b/test/xpu/test_fill_any_like_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py b/test/xpu/test_fill_any_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py
rename to test/xpu/test_fill_any_op_xpu.py
index 95d514d94cecdd..e351d9dacd1a35 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_any_op_xpu.py
+++ b/test/xpu/test_fill_any_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py b/test/xpu/test_fill_constant_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py
rename to test/xpu/test_fill_constant_op_xpu.py
index 4bd9abae9a5b90..d2a01a1e6377be 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_constant_op_xpu.py
+++ b/test/xpu/test_fill_constant_op_xpu.py
@@ -13,18 +13,18 @@
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import convert_float_to_uint16
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py b/test/xpu/test_fill_diagonal_tensor_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py
rename to test/xpu/test_fill_diagonal_tensor_op_xpu.py
index 3fbdf7abe6d141..de5025e8c4c05e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_diagonal_tensor_op_xpu.py
+++ b/test/xpu/test_fill_diagonal_tensor_op_xpu.py
@@ -15,18 +15,18 @@
 import sys
 import unittest

-import numpy as np
-
-import paddle
+sys.path.append('../../python/paddle/fluid/tests/unittests')

-sys.path.append("..")
+import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py b/test/xpu/test_fill_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py
rename to test/xpu/test_fill_op_xpu.py
index 4cb43a2ba430c5..99ca677ce42007 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fill_op_xpu.py
+++ b/test/xpu/test_fill_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py b/test/xpu/test_flatten2_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
rename to test/xpu/test_flatten2_op_xpu.py
index 380da7b62d0b66..9595b9877bc5a9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten2_op_xpu.py
+++ b/test/xpu/test_flatten2_op_xpu.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

-sys.path.append("..")
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py b/test/xpu/test_flatten_contiguous_range_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
rename to test/xpu/test_flatten_contiguous_range_op_xpu.py
index af6f2095fc97d4..05ad91958374b9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_contiguous_range_op_xpu.py
+++ b/test/xpu/test_flatten_contiguous_range_op_xpu.py
@@ -12,21 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py b/test/xpu/test_flatten_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
rename to test/xpu/test_flatten_op_xpu.py
index 9876b6c3815402..7673ec9ba3d6d2 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_flatten_op_xpu.py
+++ b/test/xpu/test_flatten_op_xpu.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py b/test/xpu/test_fleet_exe_dist_model_run_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_fleet_exe_dist_model_run_xpu.py
rename to test/xpu/test_fleet_exe_dist_model_run_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py b/test/xpu/test_fused_attention_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py
rename to test/xpu/test_fused_attention_op_xpu.py
index 3cdb5094f21d4a..9db584f278e7f8 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_attention_op_xpu.py
+++ b/test/xpu/test_fused_attention_op_xpu.py
@@ -12,20 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-import numpy as np
-
-sys.path.append("..")
-
 import unittest

-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+import numpy as np
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 import paddle.incubate.nn.functional as incubate_f
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py b/test/xpu/test_fused_feedforward_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py
rename to test/xpu/test_fused_feedforward_op_xpu.py
index feb7549a33e500..11f7148e188d09 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_feedforward_op_xpu.py
+++ b/test/xpu/test_fused_feedforward_op_xpu.py
@@ -11,16 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-import numpy as np
-
-sys.path.append("..")
-
 import unittest

+import numpy as np
+from get_test_cover_info import XPUOpTestWrapper, create_test_class
 from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import XPUOpTestWrapper, create_test_class

 import paddle
 import paddle.incubate.nn.functional as incubate_f
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py b/test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py
rename to test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py
index 35b943a3f4c777..394fe515554f33 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py
+++ b/test/xpu/test_fused_gemm_epilogue_grad_op_xpu.py
@@ -13,19 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py b/test/xpu/test_fused_gemm_epilogue_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py
rename to test/xpu/test_fused_gemm_epilogue_op_xpu.py
index 590276f58e4dc3..37b1271963faf1 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py
+++ b/test/xpu/test_fused_gemm_epilogue_op_xpu.py
@@ -13,18 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import _legacy_C_ops
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py b/test/xpu/test_fused_resnet_basic_block_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py
rename to test/xpu/test_fused_resnet_basic_block_op_xpu.py
index 9c3156997035a8..060217a6a1082b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_fused_resnet_basic_block_op_xpu.py
+++ b/test/xpu/test_fused_resnet_basic_block_op_xpu.py
@@ -13,13 +13,13 @@
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py b/test/xpu/test_gather_nd_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
rename to test/xpu/test_gather_nd_op_xpu.py
index a22c10e0fec56e..e642afffb44cff 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_nd_op_xpu.py
+++ b/test/xpu/test_gather_nd_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py b/test/xpu/test_gather_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
rename to test/xpu/test_gather_op_xpu.py
index a57af602f97125..0d132e7185e643 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gather_op_xpu.py
+++ b/test/xpu/test_gather_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py b/test/xpu/test_gaussian_random_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py
rename to test/xpu/test_gaussian_random_op_xpu.py
index 9d5b5e747f4458..f30b994dcd18be 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gaussian_random_op_xpu.py
+++ b/test/xpu/test_gaussian_random_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py b/test/xpu/test_gen_bkcl_id_op.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py
rename to test/xpu/test_gen_bkcl_id_op.py
index e13efff36e4845..7c7ae3511a2525 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_gen_bkcl_id_op.py
+++ b/test/xpu/test_gen_bkcl_id_op.py
@@ -13,10 +13,7 @@
 # limitations under the License.

 import os
-import sys
 import unittest
-
-sys.path.append("..")
 from multiprocessing import Process

 from launch_function_helper import _find_free_port, wait
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py b/test/xpu/test_generate_proposals_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py
rename to test/xpu/test_generate_proposals_v2_op_xpu.py
index 3a97b28267d926..b7e22032f5f0cf 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_generate_proposals_v2_op_xpu.py
+++ b/test/xpu/test_generate_proposals_v2_op_xpu.py
@@ -12,22 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-import unittest
-
-import numpy as np
-
-sys.path.append("..")
-
 import copy
 import math
+import unittest

-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+import numpy as np
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py b/test/xpu/test_grid_sampler_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py
rename to test/xpu/test_grid_sampler_op_xpu.py
index c92ddc9531b212..1e171f23493920 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_grid_sampler_op_xpu.py
+++ b/test/xpu/test_grid_sampler_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py b/test/xpu/test_group_norm_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py
rename to test/xpu/test_group_norm_op_xpu.py
index 67161776f81c28..16cec44287df84 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_group_norm_op_xpu.py
+++ b/test/xpu/test_group_norm_op_xpu.py
@@ -15,17 +15,16 @@
 import sys
 import unittest

-import numpy as np
-
-sys.path.append("..")
+sys.path.append('../../python/paddle/fluid/tests/unittests')

+import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py b/test/xpu/test_huber_loss_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py
rename to test/xpu/test_huber_loss_op_xpu.py
index 2a51e6ea950142..fa1e0b4b2ce879 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_huber_loss_op_xpu.py
+++ b/test/xpu/test_huber_loss_op_xpu.py
@@ -15,17 +15,16 @@
 import sys
 import unittest

-import numpy as np
-
-sys.path.append("..")
+sys.path.append('../../python/paddle/fluid/tests/unittests')

+import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py b/test/xpu/test_increment_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py
rename to test/xpu/test_increment_op_xpu.py
index 8ebbeae9654a69..5ef28f30b44a9c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_increment_op_xpu.py
+++ b/test/xpu/test_increment_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py b/test/xpu/test_index_sample_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py
rename to test/xpu/test_index_sample_op_xpu.py
index c9701af3e5786e..e5204a1247f462 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_index_sample_op_xpu.py
+++ b/test/xpu/test_index_sample_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py b/test/xpu/test_index_select_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py
rename to test/xpu/test_index_select_op_xpu.py
index 03e7debb59acf1..62b9dd54c2e8bc 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_index_select_op_xpu.py
+++ b/test/xpu/test_index_select_op_xpu.py
@@ -12,23 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-from paddle import fluid
-from paddle.fluid import Program, program_guard
-
-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
+from paddle import fluid
+from paddle.fluid import Program, program_guard

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py b/test/xpu/test_instance_norm_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py
rename to test/xpu/test_instance_norm_op_xpu.py
index 8e0b777ea852ca..5eb3e955deddf5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_instance_norm_op_xpu.py
+++ b/test/xpu/test_instance_norm_op_xpu.py
@@ -12,22 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import Program, program_guard
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle import fluid
+from paddle.fluid import Program, program_guard

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py b/test/xpu/test_iou_similarity_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py
rename to test/xpu/test_iou_similarity_op_xpu.py
index 9d8873666a3abd..301d5fb07b99af 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_iou_similarity_op_xpu.py
+++ b/test/xpu/test_iou_similarity_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

-from numpy import random
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from numpy import random
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py b/test/xpu/test_isfinite_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py
rename to test/xpu/test_isfinite_op_xpu.py
index c5253bb90cbaaa..93e6cf3533ecad 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_isfinite_op_xpu.py
+++ b/test/xpu/test_isfinite_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py b/test/xpu/test_kldiv_loss_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py
rename to test/xpu/test_kldiv_loss_op_xpu.py
index b3e3d7e5a058dd..d3dd09b6c3c28f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_kldiv_loss_op_xpu.py
+++ b/test/xpu/test_kldiv_loss_op_xpu.py
@@ -11,18 +11,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.nn.functional import kl_div
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py b/test/xpu/test_label_smooth_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py
rename to test/xpu/test_label_smooth_op_xpu.py
index b83a32a313ad4d..4ad7b3dc6a871b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py
+++ b/test/xpu/test_label_smooth_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py b/test/xpu/test_lamb_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py
rename to test/xpu/test_lamb_op_xpu.py
index 70794de507f5e5..c82bb5cd4e166b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_lamb_op_xpu.py
+++ b/test/xpu/test_lamb_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py b/test/xpu/test_layer_norm_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py
rename to test/xpu/test_layer_norm_op_xpu.py
index 12e6b49424093e..1b98c4fe081b47 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_layer_norm_op_xpu.py
+++ b/test/xpu/test_layer_norm_op_xpu.py
@@ -12,23 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest
 from functools import reduce
-
-import numpy as np
-
-import paddle
-
-sys.path.append("..")
 from operator import mul

-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+import numpy as np
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py b/test/xpu/test_linspace_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py
rename to test/xpu/test_linspace_op_xpu.py
index 65247c5bec50d0..70fdb01b92159b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_linspace_op_xpu.py
+++ b/test/xpu/test_linspace_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest, convert_np_dtype_to_dtype_
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest, convert_np_dtype_to_dtype_

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py b/test/xpu/test_log_loss_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py
rename to test/xpu/test_log_loss_op_xpu.py
index 42a59da6d0ddea..920c6c1f46931e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_log_loss_op_xpu.py
+++ b/test/xpu/test_log_loss_op_xpu.py
@@ -13,10 +13,10 @@
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from eager_op_test import OpTest
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py b/test/xpu/test_log_softmax_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py
rename to test/xpu/test_log_softmax_op_xpu.py
index fdaaadcae81e19..269d3e76bca4ac 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_log_softmax_op_xpu.py
+++ b/test/xpu/test_log_softmax_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 import paddle.nn.functional as F
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py b/test/xpu/test_logical_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py
rename to test/xpu/test_logical_op_xpu.py
index b07327283746d0..44f891d2e3f653 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_logical_op_xpu.py
+++ b/test/xpu/test_logical_op_xpu.py
@@ -15,17 +15,16 @@
 import sys
 import unittest

-import numpy as np
-
-sys.path.append("..")
+sys.path.append('../../python/paddle/fluid/tests/unittests')

+import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py b/test/xpu/test_logsumexp_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py
rename to test/xpu/test_logsumexp_op_xpu.py
index 46515eb6b1cfda..1d871797bb60c7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_logsumexp_op_xpu.py
+++ b/test/xpu/test_logsumexp_op_xpu.py
@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-import paddle
-
-sys.path.append("..")
 import numpy as np
 from op_test_xpu import XPUOpTest

+import paddle
+
 paddle.enable_static()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py b/test/xpu/test_lookup_table_v2_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py
rename to test/xpu/test_lookup_table_v2_op_xpu.py
index 8cb36afb2e4904..7af995692a7a3a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_lookup_table_v2_op_xpu.py
+++ b/test/xpu/test_lookup_table_v2_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py b/test/xpu/test_masked_select_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py
rename to test/xpu/test_masked_select_op_xpu.py
index d526dae396ddea..4ed6cd0a06e376 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_masked_select_op_xpu.py
+++ b/test/xpu/test_masked_select_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/test/xpu/test_matmul_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
rename to test/xpu/test_matmul_op_xpu.py
index 3484264cff6dd9..07cea1b943c91c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py
+++ b/test/xpu/test_matmul_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/test/xpu/test_matmul_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
rename to test/xpu/test_matmul_v2_op_xpu.py
index 4149af12268521..eb10d1462e4661 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py
+++ b/test/xpu/test_matmul_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/test/xpu/test_mean_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
rename to test/xpu/test_mean_op_xpu.py
index a13bea88b6a806..66ed8d7edbce3d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
+++ b/test/xpu/test_mean_op_xpu.py
@@ -12,21 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
 from op_test_xpu import XPUOpTest

+import paddle
 from paddle.fluid import Program, program_guard

 np.random.seed(10)

-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py b/test/xpu/test_merged_momentum_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py
rename to test/xpu/test_merged_momentum_op_xpu.py
index 1a6455a2a712e4..8f3afc5a32697b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu.py
+++ b/test/xpu/test_merged_momentum_op_xpu.py
@@ -12,17 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
-
-from test_merged_momentum_op_xpu_base import TestMergedMomentumBase
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from test_merged_momentum_op_xpu_base import TestMergedMomentumBase

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu_base.py b/test/xpu/test_merged_momentum_op_xpu_base.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_merged_momentum_op_xpu_base.py
rename to test/xpu/test_merged_momentum_op_xpu_base.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py b/test/xpu/test_meshgrid_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py
rename to test/xpu/test_meshgrid_op_xpu.py
index dfb70604d65e12..6c00fa39d71bf9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_meshgrid_op_xpu.py
+++ b/test/xpu/test_meshgrid_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py b/test/xpu/test_momentum_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
rename to test/xpu/test_momentum_op_xpu.py
index 73d39c17ed0727..50854cdeb9faeb 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_momentum_op_xpu.py
+++ b/test/xpu/test_momentum_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py b/test/xpu/test_mul_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py
rename to test/xpu/test_mul_op_xpu.py
index 760f88bea0f25d..a924cf42e84a3a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mul_op_xpu.py
+++ b/test/xpu/test_mul_op_xpu.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
+from op_test_xpu import XPUOpTest

 import paddle

-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-
 paddle.enable_static()

-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py b/test/xpu/test_nearest_interp_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py
rename to test/xpu/test_nearest_interp_op_xpu.py
index 441439838cbcd0..235ccbdd2de9ca 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_op_xpu.py
+++ b/test/xpu/test_nearest_interp_op_xpu.py
@@ -12,13 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import paddle

-sys.path.append("..")
-
 paddle.enable_static()
 '''
 def nearest_neighbor_interp_np(X,

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/test/xpu/test_nearest_interp_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
rename to test/xpu/test_nearest_interp_v2_op_xpu.py
index 35c362b9a9aba1..9caac459a9451b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py
+++ b/test/xpu/test_nearest_interp_v2_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py b/test/xpu/test_one_hot_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
rename to test/xpu/test_one_hot_op_xpu.py
index 4a24e3e2028e5b..941387b3eb1fbe 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_op_xpu.py
+++ b/test/xpu/test_one_hot_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/test/xpu/test_one_hot_v2_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py
rename to test/xpu/test_one_hot_v2_op_xpu.py
index 7fca3ab6827abd..80a60eed539c02 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py
+++ b/test/xpu/test_one_hot_v2_op_xpu.py
@@ -12,22 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-from paddle import fluid
-from paddle.fluid import core
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle import fluid
+from paddle.fluid import core

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py b/test/xpu/test_p_norm_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py
rename to test/xpu/test_p_norm_op_xpu.py
index 959ae77ca0117d..3f09c8eeda7723 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_p_norm_op_xpu.py
+++ b/test/xpu/test_p_norm_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py b/test/xpu/test_pad3d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py
rename to test/xpu/test_pad3d_op_xpu.py
index 7c4db207c8ef93..2757ed1e3e70a5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pad3d_op_xpu.py
+++ b/test/xpu/test_pad3d_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 import paddle.nn.functional as F

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_parallel_dygraph_dataparallel.py b/test/xpu/test_parallel_dygraph_dataparallel.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_parallel_dygraph_dataparallel.py
rename to test/xpu/test_parallel_dygraph_dataparallel.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py b/test/xpu/test_pixel_shuffle_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py
rename to test/xpu/test_pixel_shuffle_op_xpu.py
index 6674cf33ebb04a..444066ffbc5485 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pixel_shuffle_op_xpu.py
+++ b/test/xpu/test_pixel_shuffle_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/test/xpu/test_pool2d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
rename to test/xpu/test_pool2d_op_xpu.py
index f5a7bb398d63b2..5c4233ee36e781 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py
+++ b/test/xpu/test_pool2d_op_xpu.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from test_pool2d_op import adaptive_end_index, adaptive_start_index
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from test_pool2d_op import adaptive_end_index, adaptive_start_index

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py b/test/xpu/test_pool3d_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py
rename to test/xpu/test_pool3d_op_xpu.py
index 06161a14054c8a..43b3675563e64e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool3d_op_xpu.py
+++ b/test/xpu/test_pool3d_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py b/test/xpu/test_pool_max_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py
rename to test/xpu/test_pool_max_op_xpu.py
index 57f09ab1f74107..0eb11bb83b70db 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pool_max_op_xpu.py
+++ b/test/xpu/test_pool_max_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py b/test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py
rename to test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py
index 78ca6933181aae..71da7768cc12f6 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py
+++ b/test/xpu/test_pow2_decay_with_linear_warmup_op_xpu.py
@@ -15,14 +15,14 @@
 import sys
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
+from get_test_cover_info import record_op_test
+
 import paddle
 from paddle.fluid.contrib.layers.nn import pow2_decay_with_linear_warmup
 from paddle.optimizer.lr import LinearWarmup, PolynomialDecay

-sys.path.append("..")
-
-from xpu.get_test_cover_info import record_op_test
-

 def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place):
     main = paddle.static.Program()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py b/test/xpu/test_prelu_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py
rename to test/xpu/test_prelu_op_xpu.py
index 6bd4fcf8d5c563..0a0ea282697220 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_prelu_op_xpu.py
+++ b/test/xpu/test_prelu_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/test/xpu/test_prior_box_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py
rename to test/xpu/test_prior_box_op_xpu.py
index 52d3ca875efdff..3b69cbaba341e5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py
+++ b/test/xpu/test_prior_box_op_xpu.py
@@ -13,19 +13,15 @@
 # limitations under the License.

 import math
-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py b/test/xpu/test_prod_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py
rename to test/xpu/test_prod_op_xpu.py
index 1fb907f9f09244..a873fa8ecaf31a 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_prod_op_xpu.py
+++ b/test/xpu/test_prod_op_xpu.py
@@ -15,9 +15,9 @@
 import sys
 import unittest

-import numpy as np
+sys.path.append('../../python/paddle/fluid/tests/unittests')

-sys.path.append("..")
+import numpy as np
 from test_sum_op import TestReduceOPTensorAxisBase

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py b/test/xpu/test_randint_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py
rename to test/xpu/test_randint_op_xpu.py
index baeff8a10a6401..e697109a1baeac 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_randint_op_xpu.py
+++ b/test/xpu/test_randint_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py b/test/xpu/test_randperm_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py
rename to test/xpu/test_randperm_op_xpu.py
index 0e285f6b03c0e4..f28944e0009a21 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_randperm_op_xpu.py
+++ b/test/xpu/test_randperm_op_xpu.py
@@ -12,22 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-from paddle.fluid import core
-from paddle.static import Program, program_guard
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle.fluid import core
+from paddle.static import Program, program_guard

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py b/test/xpu/test_range_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py
rename to test/xpu/test_range_xpu.py
index 2870cbb7a7cc6e..f202a08c0f3646 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py
+++ b/test/xpu/test_range_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py b/test/xpu/test_recompute_op_xpu.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/xpu/test_recompute_op_xpu.py
rename to test/xpu/test_recompute_op_xpu.py
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py b/test/xpu/test_reduce_all_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py
rename to test/xpu/test_reduce_all_op_xpu.py
index 987b968b0a691d..313d8297a17054 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_all_op_xpu.py
+++ b/test/xpu/test_reduce_all_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py b/test/xpu/test_reduce_amax_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py
rename to test/xpu/test_reduce_amax_op_xpu.py
index 49ffef884d3db5..0de9b6c6e73069 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py
+++ b/test/xpu/test_reduce_amax_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py b/test/xpu/test_reduce_amin_op_xpu.py
similarity index 96%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py
rename to test/xpu/test_reduce_amin_op_xpu.py
index 4f2ca6fea3ff87..ad1d643bb9703d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py
+++ b/test/xpu/test_reduce_amin_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py b/test/xpu/test_reduce_any_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py
rename to test/xpu/test_reduce_any_op_xpu.py
index a255dc390bcc00..5b4e0740cfacc4 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py
+++ b/test/xpu/test_reduce_any_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py b/test/xpu/test_reduce_max_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
rename to test/xpu/test_reduce_max_op_xpu.py
index dd00a711f85ac0..1b76f78d09ac7d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_max_op_xpu.py
+++ b/test/xpu/test_reduce_max_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py b/test/xpu/test_reduce_mean_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py
rename to test/xpu/test_reduce_mean_op_xpu.py
index ed3d51ff4fd625..d2447debaa479e 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_mean_op_xpu.py
+++ b/test/xpu/test_reduce_mean_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py b/test/xpu/test_reduce_min_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
rename to test/xpu/test_reduce_min_op_xpu.py
index 87ab399863596f..692d06df6a6d22 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_min_op_xpu.py
+++ b/test/xpu/test_reduce_min_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py b/test/xpu/test_reduce_prod_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py
rename to test/xpu/test_reduce_prod_op_xpu.py
index 1e9c259f0e580c..ab44b1be351e99 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_prod_op_xpu.py
+++ b/test/xpu/test_reduce_prod_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py b/test/xpu/test_reduce_sum_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
rename to test/xpu/test_reduce_sum_op_xpu.py
index 4137b2b18cbc23..e6ed19365c65ea 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reduce_sum_op_xpu.py
+++ b/test/xpu/test_reduce_sum_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/test/xpu/test_refactor_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
rename to test/xpu/test_refactor_op_xpu.py
index 55f32c876c74a1..976a33244209af 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
+++ b/test/xpu/test_refactor_op_xpu.py
@@ -15,17 +15,16 @@
 import sys
 import unittest

-import numpy as np
-
-sys.path.append("..")
+sys.path.append('../../python/paddle/fluid/tests/unittests')

+import numpy as np
 from eager_op_test import OpTest
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py b/test/xpu/test_reshape2_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
rename to test/xpu/test_reshape2_op_xpu.py
index 41415a574b5895..ce825d89c0957f 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_reshape2_op_xpu.py
+++ b/test/xpu/test_reshape2_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py b/test/xpu/test_rmsprop_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
rename to test/xpu/test_rmsprop_op_xpu.py
index c905eb9a579744..604f9e4bb00bd0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_rmsprop_op_xpu.py
+++ b/test/xpu/test_rmsprop_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/test/xpu/test_rnn_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
rename to test/xpu/test_rnn_op_xpu.py
index e28a7ff9c10d59..2b6100247e3798 100755
--- a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
+++ b/test/xpu/test_rnn_op_xpu.py
@@ -10,26 +10,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import random
+import sys
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

-sys.path.append("../rnn")
+sys.path.append('../../python/paddle/fluid/tests/unittests/rnn')
 from convert import get_params_for_net
-from rnn_numpy import LSTM
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from rnn_numpy import LSTM

 random.seed(2)
 np.set_printoptions(threshold=np.inf)

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/test/xpu/test_roi_align_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
rename to test/xpu/test_roi_align_op_xpu.py
index 1c3ad0af30c9cd..d65f78be1a4887 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py
+++ b/test/xpu/test_roi_align_op_xpu.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import math
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py b/test/xpu/test_roll_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py
rename to test/xpu/test_roll_op_xpu.py
index 25b156a280a64e..8c3a9c6fcb1641 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_roll_op_xpu.py
+++ b/test/xpu/test_roll_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py b/test/xpu/test_scale_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
rename to test/xpu/test_scale_op_xpu.py
index 1dd41e90a17000..fbc3b7f8208569 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scale_op_xpu.py
+++ b/test/xpu/test_scale_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import Program, program_guard

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py b/test/xpu/test_scatter_nd_add_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py
rename to test/xpu/test_scatter_nd_add_op_xpu.py
index 12e159706ea1ef..f303cd9ce51503 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_nd_add_op_xpu.py
+++ b/test/xpu/test_scatter_nd_add_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py b/test/xpu/test_scatter_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py
rename to test/xpu/test_scatter_op_xpu.py
index 565549f0f16bbb..50c860bdd86732 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py
+++ b/test/xpu/test_scatter_op_xpu.py
@@ -12,20 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
     type_dict_str_to_numpy,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/test/xpu/test_sequence_conv_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
rename to test/xpu/test_sequence_conv_op_xpu.py
index a4f960fc9e31bc..4a52ea54f4aff7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
+++ b/test/xpu/test_sequence_conv_op_xpu.py
@@ -21,12 +21,12 @@
 import paddle

 sys.path.append("../")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 paddle.enable_static()
 np.set_printoptions(threshold=np.inf)

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py b/test/xpu/test_sequence_unpad_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py
rename to test/xpu/test_sequence_unpad_op_xpu.py
index 65f52bcfc0b1da..15215fcb0c614d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sequence_unpad_op_xpu.py
+++ b/test/xpu/test_sequence_unpad_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py b/test/xpu/test_set_value_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py
rename to test/xpu/test_set_value_op_xpu.py
index 90277c7f484d6a..e749eb8bc1b116 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_set_value_op_xpu.py
+++ b/test/xpu/test_set_value_op_xpu.py
@@ -21,12 +21,12 @@
 import numpy as np

 sys.path.append("../")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid.layer_helper import LayerHelper

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/test/xpu/test_sgd_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
rename to test/xpu/test_sgd_op_xpu.py
index 42cdfd0c82d2aa..6c57c19438ad65 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py
+++ b/test/xpu/test_sgd_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py b/test/xpu/test_shape_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
rename to test/xpu/test_shape_op_xpu.py
index 2f8d7ec8300772..a812369ea526ef 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_shape_op_xpu.py
+++ b/test/xpu/test_shape_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle.fluid import core

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py b/test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
rename to test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
index 8c0b3e4c733847..30369e9f22d859 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
+++ b/test/xpu/test_sigmoid_cross_entropy_with_logits_op_xpu.py
@@ -12,19 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from scipy.special import expit, logit
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from scipy.special import expit, logit

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py b/test/xpu/test_sign_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
rename to test/xpu/test_sign_op_xpu.py
index 8743310a9c6974..e6b2334f9b7f34 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sign_op_xpu.py
+++ b/test/xpu/test_sign_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/test/xpu/test_slice_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
rename to test/xpu/test_slice_op_xpu.py
index 09368723a1f487..f19c3d37e283ed 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/test/xpu/test_slice_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py b/test/xpu/test_softmax_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
rename to test/xpu/test_softmax_op_xpu.py
index 24c25bbe1a88ea..9b849832bd984c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_op_xpu.py
+++ b/test/xpu/test_softmax_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()
 np.random.seed(10)

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py b/test/xpu/test_softmax_with_cross_entropy_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
rename to test/xpu/test_softmax_with_cross_entropy_op_xpu.py
index 1ecc1eb4934ca2..cb623e900d42b7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_softmax_with_cross_entropy_op_xpu.py
+++ b/test/xpu/test_softmax_with_cross_entropy_op_xpu.py
@@ -12,20 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from test_softmax_op import stable_softmax
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+from test_softmax_op import stable_softmax

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py b/test/xpu/test_split_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
rename to test/xpu/test_split_op_xpu.py
index dca61b4b129a1d..8bc7ee9af1b045 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_split_op_xpu.py
+++ b/test/xpu/test_split_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py b/test/xpu/test_squeeze2_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
rename to test/xpu/test_squeeze2_op_xpu.py
index b9598bc3ca08ab..4e26152551c57c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze2_op_xpu.py
+++ b/test/xpu/test_squeeze2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py b/test/xpu/test_squeeze_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
rename to test/xpu/test_squeeze_op_xpu.py
index 85339b9eb8b8ab..5aae366c856359 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_squeeze_op_xpu.py
+++ b/test/xpu/test_squeeze_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
-
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/test/xpu/test_stack_op_xpu.py
similarity index 97%
rename from python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
rename to test/xpu/test_stack_op_xpu.py
index b13e1b9b300aad..3732de7dc33f5b 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/test/xpu/test_stack_op_xpu.py
@@ -13,18 +13,18 @@
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append("../../python/paddle/fluid/tests/unittests")
+
 import numpy as np
 from eager_op_test import skip_check_grad_ci
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py b/test/xpu/test_strided_slice_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py
rename to test/xpu/test_strided_slice_op_xpu.py
index 7659ffd4ae0c6d..63954dfd7859c5 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_strided_slice_op_xpu.py
+++ b/test/xpu/test_strided_slice_op_xpu.py
@@ -12,20 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py b/test/xpu/test_sum_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
rename to test/xpu/test_sum_op_xpu.py
index 77d934e478cb52..3b51b0adb76d02 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_sum_op_xpu.py
+++ b/test/xpu/test_sum_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py b/test/xpu/test_temporal_shift_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py
rename to test/xpu/test_temporal_shift_op_xpu.py
index 4a1967326504fe..71904903fc1454 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_temporal_shift_op_xpu.py
+++ b/test/xpu/test_temporal_shift_op_xpu.py
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 import paddle.nn.functional as F

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py b/test/xpu/test_tile_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
rename to test/xpu/test_tile_op_xpu.py
index c6f9c79be4d6f1..dc2b0d7f0edcd9 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tile_op_xpu.py
+++ b/test/xpu/test_tile_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/test/xpu/test_top_k_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
rename to test/xpu/test_top_k_op_xpu.py
index 8dfbddbb1cf59e..131bb0c1d0711d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
+++ b/test/xpu/test_top_k_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py b/test/xpu/test_top_k_v2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
rename to test/xpu/test_top_k_v2_op_xpu.py
index eaad7001928faf..8230aa0ff5d224 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_top_k_v2_op_xpu.py
+++ b/test/xpu/test_top_k_v2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py b/test/xpu/test_transpose_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
rename to test/xpu/test_transpose_op_xpu.py
index 458cf8a6674210..f314eb6e4dc77c 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_transpose_op_xpu.py
+++ b/test/xpu/test_transpose_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py b/test/xpu/test_tril_triu_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
rename to test/xpu/test_tril_triu_op_xpu.py
index 010cf6fb6102e3..15371d894fa8db 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_tril_triu_op_xpu.py
+++ b/test/xpu/test_tril_triu_op_xpu.py
@@ -10,19 +10,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
-
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import tensor

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py b/test/xpu/test_truncated_gaussian_random_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
rename to test/xpu/test_truncated_gaussian_random_op_xpu.py
index 7355acdfcee487..c217a2641d1605 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_truncated_gaussian_random_op_xpu.py
+++ b/test/xpu/test_truncated_gaussian_random_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
-
-sys.path.append("..")
 import unittest

 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py b/test/xpu/test_unbind_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py
rename to test/xpu/test_unbind_op_xpu.py
index fd0f36677f8fc3..dc8ea7ae6bc14d 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unbind_op_xpu.py
+++ b/test/xpu/test_unbind_op_xpu.py
@@ -12,17 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

-sys.path.append("..")
 import numpy as np
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest

 import paddle
 from paddle import fluid, tensor

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py b/test/xpu/test_unfold_op_xpu.py
similarity index 98%
rename from python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py
rename to test/xpu/test_unfold_op_xpu.py
index e1034d6363628b..c6e80469f7d0d0 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unfold_op_xpu.py
+++ b/test/xpu/test_unfold_op_xpu.py
@@ -12,21 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys
 import unittest

 import numpy as np
-
-import paddle
-from paddle import fluid
-
-sys.path.append("..")
-from op_test_xpu import XPUOpTest
-from xpu.get_test_cover_info import (
+from get_test_cover_info import (
     XPUOpTestWrapper,
     create_test_class,
     get_xpu_op_support_types,
 )
+from op_test_xpu import XPUOpTest
+
+import paddle
+from paddle import fluid

 paddle.enable_static()

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py b/test/xpu/test_uniform_random_op_xpu.py
similarity index 95%
rename from python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
rename to test/xpu/test_uniform_random_op_xpu.py
index 3dff72b5d680ca..f5fd57bd366969 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_uniform_random_op_xpu.py
+++ b/test/xpu/test_uniform_random_op_xpu.py
@@ -13,10 +13,10 @@
 # limitations under the License.

 import sys
-
-sys.path.append("..")
 import unittest

+sys.path.append('../../python/paddle/fluid/tests/unittests')
+
 import numpy as np
 from test_uniform_random_op import (
     TestUniformRandomOp,

diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py b/test/xpu/test_unsqueeze2_op_xpu.py
similarity index 99%
rename from python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
rename to test/xpu/test_unsqueeze2_op_xpu.py
index 56862299074f30..d8cb02e64f9933 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze2_op_xpu.py
+++ b/test/xpu/test_unsqueeze2_op_xpu.py
@@ -12,18 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import sys import unittest -sys.path.append("..") - import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py b/test/xpu/test_unsqueeze_op_xpu.py similarity index 97% rename from python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py rename to test/xpu/test_unsqueeze_op_xpu.py index 4f2b1d2b5a8adc..333633031bdfd3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_unsqueeze_op_xpu.py +++ b/test/xpu/test_unsqueeze_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") - import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py b/test/xpu/test_unstack_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py rename to test/xpu/test_unstack_op_xpu.py index 6195ec55abd411..9d305a312b74b7 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_unstack_op_xpu.py +++ b/test/xpu/test_unstack_op_xpu.py @@ -12,18 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle diff --git a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py b/test/xpu/test_update_loss_scaling_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py rename to test/xpu/test_update_loss_scaling_op_xpu.py index e2b7263fed26e9..86e6aac6badb51 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_update_loss_scaling_op_xpu.py +++ b/test/xpu/test_update_loss_scaling_op_xpu.py @@ -12,17 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest -sys.path.append("..") import numpy as np -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py b/test/xpu/test_warpctc_op_xpu.py similarity index 99% rename from python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py rename to test/xpu/test_warpctc_op_xpu.py index d09db48cffc51d..95cf65075472fe 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_warpctc_op_xpu.py +++ b/test/xpu/test_warpctc_op_xpu.py @@ -13,18 +13,16 @@ # limitations under the License. 
import sys - -sys.path.append("..") import unittest import numpy as np -from op_test_xpu import XPUOpTest -from test_softmax_op import stable_softmax -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest +from test_softmax_op import stable_softmax import paddle import paddle.nn.functional as F diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py b/test/xpu/test_where_index_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py rename to test/xpu/test_where_index_xpu.py index 1a8e7aa96453ca..cca29f57373365 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py +++ b/test/xpu/test_where_index_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/test/xpu/test_where_op_xpu.py similarity index 98% rename from python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py rename to test/xpu/test_where_op_xpu.py index a7a26f32b02cbe..8dd7500517aed5 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py +++ b/test/xpu/test_where_op_xpu.py @@ -12,19 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np - -sys.path.append("..") - -from op_test_xpu import XPUOpTest -from xpu.get_test_cover_info import ( +from get_test_cover_info import ( XPUOpTestWrapper, create_test_class, get_xpu_op_support_types, ) +from op_test_xpu import XPUOpTest import paddle from paddle import fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py b/test/xpu/test_while_op_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_while_op_xpu.py rename to test/xpu/test_while_op_xpu.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py b/test/xpu/test_xpu_place.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py rename to test/xpu/test_xpu_place.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_stream_event.py b/test/xpu/test_xpu_stream_event.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_xpu_stream_event.py rename to test/xpu/test_xpu_stream_event.py diff --git a/python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py b/test/xpu/test_zero_dim_tensor_xpu.py similarity index 100% rename from python/paddle/fluid/tests/unittests/xpu/test_zero_dim_tensor_xpu.py rename to test/xpu/test_zero_dim_tensor_xpu.py diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index c6f190189a67f4..14c4e5075f6d82 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -406,7 +406,7 @@ def get_pr_ut(self): ut_list.append('md_placeholder') onlyCommentsFilesOrXpu.append(f_judge) elif ( - 'tests/unittests/xpu' in f_judge + 'test/xpu' in f_judge or 'tests/unittests/npu' in f_judge or 'op_npu.cc' in f_judge ): From b835d958e53ab7f18f77cbb9797e607f83db4447 Mon Sep 17 00:00:00 2001 
From: Yuanle Liu Date: Wed, 12 Apr 2023 10:30:03 +0800 Subject: [PATCH 078/156] fix convert_to_mixed_precision api save model bug (#52767) * update save model * update --- .../passes/convert_to_mixed_precision.cc | 57 +++++++++++++------ 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 2589a20eb284dc..963197850c9fd3 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -102,32 +102,53 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { framework::ProgramDesc mixed_program_desc; framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); - auto parameters = scope_.LocalVarNames(); - std::sort(parameters.begin(), parameters.end()); - - auto SerializeParams = [&]() -> std::string { - std::ostringstream os; - phi::CPUContext ctx; - for (const auto& param : parameters) { - PADDLE_ENFORCE_NOT_NULL( - scope_.FindVar(param), - platform::errors::NotFound( - "Block should already have a '%s' variable", param)); - auto* tensor = scope_.FindVar(param)->GetMutable(); - framework::SerializeToStream(os, *tensor, ctx); + auto SerializeParams = [&](const std::string& path) { + auto IsPersistable = [](const framework::VarDesc* var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { + return true; + } + return false; + }; + framework::ProgramDesc save_program; + auto* save_block = save_program.MutableBlock(0); + + const auto& global_block = mixed_program_desc.Block(0); + std::vector save_var_list; + for (framework::VarDesc* var : global_block.AllVars()) { + if (IsPersistable(var)) { + framework::VarDesc* new_var = save_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + save_var_list.push_back(new_var->Name()); + } } - return os.str(); + std::sort(save_var_list.begin(), save_var_list.end()); + auto* op = save_block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", save_var_list); + op->SetAttr("file_path", path); + op->CheckAttrs(); + + framework::Executor exe(platform::CPUPlace{}); + exe.Run(save_program, &scope_, 0, true, true); }; - auto StrToBinary = [](const std::string& path, const std::string& str) { + auto SerializeProg = [&](const std::string& path) { + auto str = mixed_program_desc.Proto()->SerializeAsString(); std::ofstream file(path.c_str(), std::ios::binary); file.write(str.c_str(), str.size()); file.close(); }; - StrToBinary(mixed_model_file_, - mixed_program_desc.Proto()->SerializeAsString()); - StrToBinary(mixed_params_file_, SerializeParams()); + SerializeProg(mixed_model_file_); + SerializeParams(mixed_params_file_); } bool OpSupportPrecision(const std::string& op_type, From d12b1ffa4bca06c10ed9b70a2675285cfaae818b Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Wed, 12 Apr 2023 10:32:00 +0800 Subject: [PATCH 079/156] move delete_cast_op_pass (#52788) --- paddle/fluid/framework/ir/CMakeLists.txt | 11 +++-- .../ir/{xpu => }/delete_cast_op_pass.cc | 49 +++++++++---------- .../ir/{xpu => }/delete_cast_op_pass.h | 0 .../ir/{xpu => }/delete_cast_op_pass_test.cc | 0 
.../inference/api/paddle_pass_builder.cc | 1 + 5 files changed, 31 insertions(+), 30 deletions(-) rename paddle/fluid/framework/ir/{xpu => }/delete_cast_op_pass.cc (93%) rename paddle/fluid/framework/ir/{xpu => }/delete_cast_op_pass.h (100%) rename paddle/fluid/framework/ir/{xpu => }/delete_cast_op_pass_test.cc (100%) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 91c3ba6d608b4c..b1db3dd0a43cb6 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -126,6 +126,7 @@ pass_library(matmul_scale_fuse_pass inference) pass_library(gpu_cpu_map_matmul_to_mul_pass inference) pass_library(dense_fc_to_sparse_pass inference) pass_library(dense_multihead_matmul_to_sparse_pass inference) +pass_library(delete_cast_op_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) @@ -242,7 +243,6 @@ if(WITH_XPU) pass_library(fused_multi_transformer_xpu_quant_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) - pass_library(delete_cast_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) endif() cc_library( @@ -407,6 +407,11 @@ cc_test( test_delete_dequant_weight_linear_op_pass SRCS delete_weight_dequant_linear_op_pass_tester.cc DEPS delete_weight_dequant_linear_op_pass) +cc_test( + test_delete_cast_op_pass + SRCS delete_cast_op_pass_test.cc + DEPS delete_cast_op_pass) + if(WITH_GPU OR WITH_ROCM) cc_test( test_embedding_eltwise_layernorm_fuse_pass @@ -521,8 +526,4 @@ if(WITH_XPU) test_stack_fuse_pass SRCS xpu/stack_fuse_pass_test.cc DEPS stack_fuse_pass) - cc_test( - test_delete_cast_op_pass - SRCS xpu/delete_cast_op_pass_test.cc - DEPS delete_cast_op_pass) endif() diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc similarity index 93% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc rename to paddle/fluid/framework/ir/delete_cast_op_pass.cc index fb417322476b2b..bfda0f32380102 100644 --- a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc @@ -12,10 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h" -#include +#include "paddle/fluid/framework/ir/delete_cast_op_pass.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -127,11 +126,11 @@ int DeleteCastOpPass::ApplyCastWriteReadPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastWriteReadPass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(write_to_array); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(write_to_array_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array, write_to_array, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array_out, write_to_array_out, pattern); // write_to_array_out(in graph1) may not link to any op nodes, so we fine // read_from_array by write_to_array_out name. 
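
(For reference, the accessor being replaced throughout this file is the
xpu-local shorthand from ir/xpu/pass_utils.h; assuming its usual definition,
it is essentially

    #define GET_IR_NODE(node_) GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern)

so each rewritten call only spells out the (var, name, pattern) triple the
shorthand used to fill in, which also lets the file drop its include of
ir/xpu/pass_utils.h, as the hunk above shows.)
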
@@ -281,13 +280,13 @@ int DeleteCastOpPass::ApplyCastLodResetWriteReadPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastLodResetWriteReadPass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(lod_reset); - GET_IR_NODE(write_to_array); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(lod_reset_out); - GET_IR_NODE(write_to_array_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(lod_reset, lod_reset, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array, write_to_array, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(lod_reset_out, lod_reset_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(write_to_array_out, write_to_array_out, pattern); // write_to_array_out(in graph1) may not link to any op nodes, so we fine // read_from_array by write_to_array_out name. @@ -482,13 +481,13 @@ int DeleteCastOpPass::ApplyCastIndexSamplePass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastIndexSamplePass fuse"; - GET_IR_NODE(cast0); - GET_IR_NODE(index_sample); - GET_IR_NODE(cast1); - GET_IR_NODE(cast0_in); - GET_IR_NODE(cast0_out); - GET_IR_NODE(index_sample_out); - GET_IR_NODE(cast1_out); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(index_sample, index_sample, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1, cast1, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(index_sample_out, index_sample_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1_out, cast1_out, pattern); index_sample->Op()->RenameInput(cast0_out->Name(), cast0_in->Name()); index_sample->Op()->RenameOutput(index_sample_out->Name(), @@ -545,9 +544,9 @@ int DeleteCastOpPass::ApplyCastPass(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle ApplyCastPass fuse"; - GET_IR_NODE(cast); - GET_IR_NODE(cast_in); - GET_IR_NODE(cast_out); + GET_IR_NODE_FROM_SUBGRAPH(cast, cast, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast_in, cast_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast_out, cast_out, pattern); for (auto* out_op_node : cast_out->outputs) { out_op_node->Op()->RenameInput(cast_out->Name(), cast_in->Name()); IR_NODE_LINK_TO(cast_in, out_op_node); diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h b/paddle/fluid/framework/ir/delete_cast_op_pass.h similarity index 100% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass.h rename to paddle/fluid/framework/ir/delete_cast_op_pass.h diff --git a/paddle/fluid/framework/ir/xpu/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc similarity index 100% rename from paddle/fluid/framework/ir/xpu/delete_cast_op_pass_test.cc rename to paddle/fluid/framework/ir/delete_cast_op_pass_test.cc diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 3cc8b077ad7e63..a1fe08b081eebd 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -276,6 +276,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "transpose_flatten_concat_fuse_pass", // "conv2d_fusion_layout_transfer_pass", // 
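+          // NOTE: delete_cast_op_pass is appended immediately after
+          // auto_mixed_precision_pass below, presumably because the AMP
+          // pass is the stage that inserts the redundant cast ops this
+          // pass folds away.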
"auto_mixed_precision_pass", // + "delete_cast_op_pass", // "inplace_op_var_pass", // should be the last pass. }); From 8d7c15a7b04f833f97dc09abf4f62ca411b5728e Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Wed, 12 Apr 2023 10:46:37 +0800 Subject: [PATCH 080/156] [CINN] add cinn sub-graph save into graphviz flag (#52766) --- .../framework/paddle2cinn/cinn_compiler.cc | 23 ++++++++++++++++++- paddle/phi/core/flags.cc | 14 +++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 359bab844303f3..4c1538a28fedb2 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -31,6 +31,7 @@ #include "cinn/frontend/syntax.h" #include "cinn/hlir/framework/graph.h" #include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/visualize_helper.h" #include "gflags/gflags.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" @@ -49,6 +50,7 @@ DECLARE_bool(enable_pe_launch_cinn); DECLARE_bool(enable_cinn_auto_tune); +DECLARE_string(cinn_subgraph_graphviz_dir); namespace paddle { namespace framework { namespace paddle2cinn { @@ -73,7 +75,6 @@ const CinnCompiledObject &CinnCompiler::Compile( const std::map &input_tensors, const Target &target, void *stream) { - VLOG(4) << "-- The graph to be compiled is:\n" << VizGraph(graph); CinnCacheKeyByAddress cur_key_by_address( graph, input_tensors, target.arch_str()); CinnCacheKeyByStructure cur_key_by_struct; @@ -85,6 +86,26 @@ const CinnCompiledObject &CinnCompiler::Compile( if (!cache_by_struct_.count(cur_key_by_struct)) { VLOG(4) << "Not found CinnCompiledObject in cache_by_struct_."; std::int64_t compiled_num = real_compiled_num_.fetch_add(1); + + if (!FLAGS_cinn_subgraph_graphviz_dir.empty()) { + const std::string &viz_path = FLAGS_cinn_subgraph_graphviz_dir + + "/fusion_groups_" + + std::to_string(compiled_num) + "/"; + if (!::cinn::hlir::framework::MakeDirectory( + viz_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { + LOG_IF(WARNING, compiled_num == 0) + << "Failed to make directory: \"" << viz_path + << "\", the CINN subgraph's graphviz dot file will not print."; + } else { + LOG_IF(INFO, compiled_num == 0) + << "The CINN subgraph's graphviz dot file will writing into " + "path: \"" + << FLAGS_cinn_subgraph_graphviz_dir << "\""; + ::cinn::hlir::framework::WriteToFile(viz_path + "cinn_subgraph.dot", + VizGraph(graph)); + } + } + auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); std::unique_lock guard(lock_); diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 9cff3acccbd416..ad2e38b70d0ed6 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -1001,6 +1001,20 @@ PADDLE_DEFINE_EXPORTED_bool(enable_cinn_auto_tune, "It controls whether to use cinn with " "its auto-tune feature enabled"); +/* + * CINN related FLAG + * Name: FLAGS_cinn_subgraph_graphviz_dir + * Since Version: 2.3 + * Value Range: string, default="" + * Example: FLAGS_cinn_subgraph_graphviz_dir="./cinn_graph/" will save the + * CINN sub-graph into "./cinn_graph/", and each sub-graph will save into + * "fusion_groups_*"" directory + */ +PADDLE_DEFINE_EXPORTED_string(cinn_subgraph_graphviz_dir, + "", + "Specify the directory path of dot file of " + "graph, which is used for debug."); + #endif /* From 189e0d44eaa3ef7833d1f7ed351ebcbc3113b83a Mon Sep 17 00:00:00 2001 
From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:01:45 +0800 Subject: [PATCH 081/156] Patch del (#52754) * [DO NOT MERGE] adadelta lr support * [DO NOT MERGE] gpu support * [test] follow torch * fix acc update order * for ci * [bug fix] update master para * [bug fix] update test * [bug fix] for ci test * for ci * fix xpu * [adadelta fix] del fluid head file * for ci * del notes --- .../phi/kernels/impl/adadelta_kernel_impl.h | 34 +++++++------------ 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/paddle/phi/kernels/impl/adadelta_kernel_impl.h b/paddle/phi/kernels/impl/adadelta_kernel_impl.h index c432c72d832c60..18fcd953d65324 100644 --- a/paddle/phi/kernels/impl/adadelta_kernel_impl.h +++ b/paddle/phi/kernels/impl/adadelta_kernel_impl.h @@ -13,10 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/adadelta_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -67,26 +63,20 @@ void AdadeltaKernel(const Context& dev_ctx, -(((eigen_avg_squared_update + epsilon_).sqrt()) / ((eigen_avg_squared_grad_out + epsilon_).sqrt()) * eigen_grad_cast); Eigen::DSizes m_dsize(avg_squared_update_out->numel()); - if (paddle::platform::is_cpu_place(dev_ctx.GetPlace())) { - auto* lr = learning_rate.data(); + auto lr = EigenVector::Flatten(learning_rate); + if (multi_precision) { + auto eigen_master_param_out = + EigenVector::Flatten(*master_param_outs); + auto eigen_master_param = EigenVector::Flatten(*master_param); + + eigen_master_param_out.device(place) = + eigen_master_param + lr.broadcast(m_dsize) * update; eigen_param_out.device(place) = - eigen_param + lr[0] * update.template cast(); + (eigen_param.template cast() + lr.broadcast(m_dsize) * update) + .template cast(); } else { - auto lr = EigenVector::Flatten(learning_rate); - if (multi_precision) { - auto eigen_master_param_out = - EigenVector::Flatten(*master_param_outs); - auto eigen_master_param = EigenVector::Flatten(*master_param); - - eigen_master_param_out.device(place) = - eigen_master_param + lr.broadcast(m_dsize) * update; - eigen_param_out.device(place) = (eigen_param.template cast() + - lr.broadcast(m_dsize) * update) - .template cast(); - } else { - eigen_param_out.device(place) = - eigen_param + (lr.broadcast(m_dsize) * update).template cast(); - } + eigen_param_out.device(place) = + eigen_param + (lr.broadcast(m_dsize) * update).template cast(); } eigen_avg_squared_update_out.device(place) = rho_ * eigen_avg_squared_update + (1 - rho_) * update.square(); From a482f6bfff60de3c689dd6208bf2b3b2bc030da1 Mon Sep 17 00:00:00 2001 From: Galaxy1458 <55453380+Galaxy1458@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:05:16 +0800 Subject: [PATCH 082/156] remove [-Wimplicit-fallthrough=] warning (#52783) * test, test=develop * test, test=develop * test, test=develop --- paddle/fluid/imperative/layout_transformer.h | 8 ++------ .../composite_backward_api.h | 5 +---- paddle/utils/string/tinyformat/tinyformat.h | 19 +++++++------------ 3 files changed, 10 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 93c924a095c9ee..2bdbead6aae0d9 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -402,16 +402,12 @@ 
class ArgmaxOpTransformer case paddle::framework::proto::AttrType::INT: { auto axis = PADDLE_GET_CONST(int, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; } case paddle::framework::proto::AttrType::LONG: { auto axis = PADDLE_GET_CONST(int64_t, (*attrs)["axis"]); (*attrs)["axis"] = static_cast(perm[axis]); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; } default: VLOG(4) << "The data_type of axis is Error, axis must be int or " diff --git a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h index 6697f1a614c381..c0830b2a754280 100644 --- a/paddle/fluid/prim/api/composite_backward/composite_backward_api.h +++ b/paddle/fluid/prim/api/composite_backward/composite_backward_api.h @@ -1483,11 +1483,8 @@ void batch_norm_grad(const Tensor& x, if (bias_grad) { set_output(out_grad_data_sum, bias_grad); } - break; } -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; } default: diff --git a/paddle/utils/string/tinyformat/tinyformat.h b/paddle/utils/string/tinyformat/tinyformat.h index bd8d47849db966..41319c391455e6 100644 --- a/paddle/utils/string/tinyformat/tinyformat.h +++ b/paddle/utils/string/tinyformat/tinyformat.h @@ -691,9 +691,8 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'X': out.setf(std::ios::uppercase); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; + case 'x': case 'p': out.setf(std::ios::hex, std::ios::basefield); @@ -701,26 +700,22 @@ inline const char *streamStateFromFormat(std::ostream &out, // NOLINT break; case 'E': out.setf(std::ios::uppercase); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; case 'e': out.setf(std::ios::scientific, std::ios::floatfield); out.setf(std::ios::dec, std::ios::basefield); break; case 'F': out.setf(std::ios::uppercase); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + + break; + case 'f': out.setf(std::ios::fixed, std::ios::floatfield); break; case 'G': out.setf(std::ios::uppercase); -#ifdef LINUX - __attribute__((fallthrough)); -#endif + break; case 'g': out.setf(std::ios::dec, std::ios::basefield); From 3a7980f27c1f46c55628e236ea57aca3cc6b1569 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 12 Apr 2023 11:07:00 +0800 Subject: [PATCH 083/156] bugfix for sparse tensor reduce. 
(#52785) --- paddle/fluid/distributed/collective/reducer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 63071139a5f409..defc84fbe3d9cf 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -821,9 +821,9 @@ void EagerReducer::MarkVarReady(const size_t var_index, auto &group = groups_[group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; - const auto length = group.length_[inside_group_index]; if (!group.is_sparse_) { + const auto length = group.length_[inside_group_index]; if (is_used_var) { auto *autograd_meta = tensors_[var_index].get_autograd_meta(); auto &grad_tensor = From f9b155f98956368c9d248f8a9cce598c90b204c5 Mon Sep 17 00:00:00 2001 From: Wei Shengyu Date: Wed, 12 Apr 2023 11:18:13 +0800 Subject: [PATCH 084/156] [AMP OP&Test] add fp16/bf16 unittest for pool2d op (#52288) * add bf16 support and bf16/fp16 unittest for pool2d * add include files * dbg * reformat * reformat * modify code according to review comment * remove duplicate code * remove dup code * remove useless include * dbg --- paddle/phi/kernels/gpu/pool_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/pool_kernel.cu | 3 +- .../fluid/tests/unittests/test_pool2d_op.py | 114 +++++++++++++++--- 3 files changed, 103 insertions(+), 17 deletions(-) diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu index 598a48f802891e..e4cfcb23b730e7 100644 --- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(pool2d_grad, phi::Pool2dGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(pool2d_double_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu index 6323909c9d0dca..65d0ef4bdc9168 100644 --- a/paddle/phi/kernels/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(pool2d, phi::Pool2dKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(max_pool2d_with_index, GPU, ALL_LAYOUT, diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5ab2bad28e3c3f..aae7ba87697ced 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -15,6 +15,7 @@ import unittest import numpy as np +from eager_op_test import convert_float_to_uint16 import paddle from paddle.fluid import core @@ -366,7 +367,11 @@ def setUp(self): self.init_data_format() self.init_shape() - input = np.random.random(self.shape).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.shape).astype(np.float32) + else: + input = np.random.random(self.shape).astype(self.dtype) + output = pool2D_forward_naive( input, self.ksize, @@ -379,8 +384,14 @@ def setUp(self): self.data_format, self.pool_type, self.padding_algorithm, - ).astype(self.dtype) - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} + ) + + if self.is_bfloat16_op(): + output = convert_float_to_uint16(output) + self.inputs = {'X': convert_float_to_uint16(input)} + else: + output = output.astype(self.dtype) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { 
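             # (Only the X/Out tensors above are re-encoded for bf16 via
             #  convert_float_to_uint16; the attributes below stay plain
             #  Python scalars and strings.)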
'strides': self.strides, @@ -427,7 +438,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) elif self.pool_type != "max": @@ -577,7 +587,6 @@ def test_check_output(self): if core.is_float16_supported(place): self.check_output_with_place( place, - atol=1e-3, check_dygraph=(not self.use_mkldnn), ) @@ -593,7 +602,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) @@ -618,7 +626,6 @@ def test_check_output(self): if core.is_float16_supported(place): self.check_output_with_place( place, - atol=1e-3, check_dygraph=(not self.use_mkldnn), ) @@ -634,7 +641,6 @@ def test_check_grad(self): place, {'X'}, 'Out', - max_relative_error=0.07, check_dygraph=(not self.use_mkldnn), ) @@ -643,20 +649,58 @@ def test_check_grad(self): globals()[cls_name] = TestFp16Case +def create_test_bf16_class(parent, check_grad=True): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestBf16Case(parent): + def init_kernel_type(self): + self.use_cuda = True + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, + check_dygraph=(not self.use_mkldnn), + ) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if self.pool_type != "max" and check_grad: + self.check_grad_with_place( + place, + {'X'}, + 'Out', + check_dygraph=(not self.use_mkldnn), + ) + + cls_name = "{}_{}".format(parent.__name__, "Bf16Op") + TestBf16Case.__name__ = cls_name + globals()[cls_name] = TestBf16Case + + create_test_cudnn_fp16_class(TestPool2D_Op) -create_test_cudnn_fp16_class(TestCase1, check_grad=False) +create_test_cudnn_fp16_class(TestCase1) create_test_cudnn_fp16_class(TestCase2) create_test_cudnn_fp16_class(TestCase3) create_test_cudnn_fp16_class(TestCase4) create_test_cudnn_fp16_class(TestCase5) create_test_fp16_class(TestPool2D_Op) -create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase1) create_test_fp16_class(TestCase2) create_test_fp16_class(TestCase3) create_test_fp16_class(TestCase4) create_test_fp16_class(TestCase5) +create_test_bf16_class(TestPool2D_Op) +create_test_bf16_class(TestCase1) +create_test_bf16_class(TestCase2) +create_test_bf16_class(TestCase3) +create_test_bf16_class(TestCase4) +create_test_bf16_class(TestCase5) # --------------------test pool2d use ceil mode-------------------- @@ -796,12 +840,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_AsyPadding) create_test_cudnn_fp16_class(TestPool2D_AsyPadding) -create_test_cudnn_fp16_class(TestCase1_AsyPadding, check_grad=False) +create_test_cudnn_fp16_class(TestCase1_AsyPadding) create_test_cudnn_fp16_class(TestCase2_AsyPadding) create_test_cudnn_fp16_class(TestCase3_AsyPadding) create_test_cudnn_fp16_class(TestCase4_AsyPadding) create_test_cudnn_fp16_class(TestCase5_AsyPadding) +create_test_fp16_class(TestPool2D_AsyPadding) +create_test_fp16_class(TestCase1_AsyPadding) +create_test_fp16_class(TestCase2_AsyPadding) +create_test_fp16_class(TestCase3_AsyPadding) +create_test_fp16_class(TestCase4_AsyPadding) +create_test_fp16_class(TestCase5_AsyPadding) + +create_test_bf16_class(TestPool2D_AsyPadding) +create_test_bf16_class(TestCase1_AsyPadding) +create_test_bf16_class(TestCase2_AsyPadding) +create_test_bf16_class(TestCase3_AsyPadding) +create_test_bf16_class(TestCase4_AsyPadding) +create_test_bf16_class(TestCase5_AsyPadding) + 
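+# A note on the bf16 cases registered above: numpy has no native bfloat16,
+# so OpTest carries bf16 data as np.uint16 holding the upper 16 bits of the
+# float32 bit pattern. The convert_float_to_uint16 helper is, in essence
+# (a sketch, not its exact source):
+#
+#     def convert_float_to_uint16(x):
+#         x = np.ascontiguousarray(x, dtype=np.float32)
+#         return np.right_shift(x.view(np.uint32), 16).astype(np.uint16)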
create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding) @@ -908,12 +966,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_channel_last) create_test_cudnn_fp16_class(TestPool2D_channel_last) -create_test_cudnn_fp16_class(TestCase1_channel_last, check_grad=False) +create_test_cudnn_fp16_class(TestCase1_channel_last) create_test_cudnn_fp16_class(TestCase2_channel_last) create_test_cudnn_fp16_class(TestCase3_channel_last) create_test_cudnn_fp16_class(TestCase4_channel_last) create_test_cudnn_fp16_class(TestCase5_channel_last) +create_test_fp16_class(TestPool2D_channel_last) +create_test_fp16_class(TestCase1_channel_last) +create_test_fp16_class(TestCase2_channel_last) +create_test_fp16_class(TestCase3_channel_last) +create_test_fp16_class(TestCase4_channel_last) +create_test_fp16_class(TestCase5_channel_last) + +create_test_bf16_class(TestPool2D_channel_last) +create_test_bf16_class(TestCase1_channel_last) +create_test_bf16_class(TestCase2_channel_last) +create_test_bf16_class(TestCase3_channel_last) +create_test_bf16_class(TestCase4_channel_last) +create_test_bf16_class(TestCase5_channel_last) + create_test_cudnn_use_ceil_class(TestPool2D_channel_last) create_test_cudnn_use_ceil_class(TestCase1_channel_last) @@ -1023,14 +1095,26 @@ def init_shape(self): create_test_cudnn_class(TestCase5_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestPool2D_AsyPadding_channel_last) -create_test_cudnn_fp16_class( - TestCase1_AsyPadding_channel_last, check_grad=False -) +create_test_cudnn_fp16_class(TestCase1_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase2_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase3_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase4_AsyPadding_channel_last) create_test_cudnn_fp16_class(TestCase5_AsyPadding_channel_last) +create_test_fp16_class(TestPool2D_AsyPadding_channel_last) +create_test_fp16_class(TestCase1_AsyPadding_channel_last) +create_test_fp16_class(TestCase2_AsyPadding_channel_last) +create_test_fp16_class(TestCase3_AsyPadding_channel_last) +create_test_fp16_class(TestCase4_AsyPadding_channel_last) +create_test_fp16_class(TestCase5_AsyPadding_channel_last) + +create_test_bf16_class(TestPool2D_AsyPadding_channel_last) +create_test_bf16_class(TestCase1_AsyPadding_channel_last) +create_test_bf16_class(TestCase2_AsyPadding_channel_last) +create_test_bf16_class(TestCase3_AsyPadding_channel_last) +create_test_bf16_class(TestCase4_AsyPadding_channel_last) +create_test_bf16_class(TestCase5_AsyPadding_channel_last) + create_test_cudnn_use_ceil_class(TestPool2D_AsyPadding_channel_last) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last) From 0baacc694d35769dbfc6ccf790e42b80ba3e70a6 Mon Sep 17 00:00:00 2001 From: WangZhen <23097963+0x45f@users.noreply.github.com> Date: Wed, 12 Apr 2023 11:33:24 +0800 Subject: [PATCH 085/156] Fix backend typo in ut (#52757) --- test/dygraph_to_static/test_cinn_prim.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dygraph_to_static/test_cinn_prim.py b/test/dygraph_to_static/test_cinn_prim.py index c5527e85238b6b..388cb67c66f43d 100644 --- a/test/dygraph_to_static/test_cinn_prim.py +++ b/test/dygraph_to_static/test_cinn_prim.py @@ -170,10 +170,10 @@ def test_backend(self): out2 = self.forward(x, None) np.testing.assert_allclose(out1, out2, rtol=1e-6) - def forward(self, x, beckend=None): + def forward(self, x, backend=None): paddle.seed(2022) net = PrimeNet() - net = paddle.jit.to_static(net, 
backend=beckend)
+        net = paddle.jit.to_static(net, backend=backend)
         out = net(x)
         return out

From 2309aa585cd9a4d5f35a8ea936b388d9a58e8645 Mon Sep 17 00:00:00 2001
From: gaoziyuan <88373061+gzy19990617@users.noreply.github.com>
Date: Wed, 12 Apr 2023 12:04:17 +0800
Subject: [PATCH 086/156] [Hackathon 78] Add the cumsum op for Paddle-TRT
 (#52518)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../fluid/inference/api/analysis_predictor.cc |   1 +
 .../inference/tensorrt/convert/CMakeLists.txt |   1 +
 .../inference/tensorrt/convert/cumsum_op.cc   | 157 ++++++++++++++++
 .../inference/tensorrt/convert/op_converter.h |  46 +++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  25 ++-
 .../ir/inference/test_trt_convert_cumsum.py   | 176 ++++++++++++++++++
 6 files changed, 404 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/cumsum_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 38222b797f14fd..6523e5cfced3ea 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2688,6 +2688,7 @@ USE_TRT_CONVERTER(expand_v2)
 USE_TRT_CONVERTER(take_along_axis)
 USE_TRT_CONVERTER(skip_groupnorm_act)
 USE_TRT_CONVERTER(preln_groupnorm_act)
+USE_TRT_CONVERTER(cumsum)
 #if IS_TRT_VERSION_GE(8522)
 USE_TRT_CONVERTER(flash_multihead_matmul)
 USE_TRT_CONVERTER(cross_multihead_matmul)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index cbe26a3d31e4d9..1793e1207771e2 100755
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -106,6 +106,7 @@ list(
     skip_groupnorm_act_op.cc
     preln_groupnorm_act_op.cc
     expand_v2_op.cc
+    cumsum_op.cc
     temporal_shift_op.cc)

 if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
diff --git a/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc b/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc
new file mode 100644
index 00000000000000..a46bf1efa171ba
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/cumsum_op.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Cumsum Op + */ +class CumsumOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { +#if IS_TRT_VERSION_GE(7220) + VLOG(3) << "convert a cumsum op to tensorrt layer"; + framework::OpDesc op_desc(op, nullptr); + std::string input_x_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + auto* input_x_tensor = engine_->GetITensor(input_x_name); + auto dims = input_x_tensor->getDimensions(); + auto rank = dims.nbDims; + int axis = 0; + if (op_desc.HasAttr("axis")) { + axis = PADDLE_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis += rank; + } + } + + // getAxisLength default is a scalar + auto getAxisLength = + [&](nvinfer1::ITensor* inpTensor, int axis, bool scalar = true) { + auto dims = inpTensor->getDimensions(); + int d = dims.d[axis]; + if (d >= 0) { + return Add1DConstantLayer(d, "", scalar); + } else { + nvinfer1::ITensor* inpShape = Shape(inpTensor); + return GetEleTensorOfShape(inpShape, d, scalar); + } + }; + + // Create "inputSliced" tensor that is sliced on dimension[axis] to length 1 + nvinfer1::Dims start; + start.nbDims = rank; + std::vector start_vec(rank, 0); + std::fill(start.d, start.d + rank, 0); + + nvinfer1::Dims size; + size.nbDims = rank; + nvinfer1::Dims stride; + stride.nbDims = rank; + auto axisLength = getAxisLength(input_x_tensor, axis, false); + + auto starts_tensor = + Add1DConstantLayer(start_vec, output_name + "_start_tensor_"); + auto sizes_tensor = axis == 0 ? Add1DConstantLayer(1) + : getAxisLength(input_x_tensor, 0, false); + auto strides_tensor = axis == 0 ? 
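+      // (These three tensors describe the slice taken below: the cumsum
+      //  axis is shrunk to size 1 while every other dimension keeps its
+      //  full extent. The resulting length-1 slab is later zeroed out and
+      //  used to initialize the running sum that the ILoop recurrence
+      //  accumulates along `axis`.)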
axisLength : Add1DConstantLayer(1); + + for (int i = 1; i < rank; i++) { + if (i == axis) { + std::vector strides_itensors = {strides_tensor, + axisLength}; + strides_tensor = Concat(strides_itensors); + std::vector sizes_itensors = { + sizes_tensor, Add1DConstantLayer(1)}; + sizes_tensor = Concat(sizes_itensors); + } else { + auto currLength = getAxisLength(input_x_tensor, i, false); + std::vector strides_itensors = { + strides_tensor, Add1DConstantLayer(1)}; + strides_tensor = Concat(strides_itensors); + std::vector sizes_itensors = {sizes_tensor, + currLength}; + sizes_tensor = Concat(sizes_itensors); + } + } + + auto inputSliced = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *input_x_tensor, start, size, stride); + inputSliced->setInput(1, *starts_tensor); + inputSliced->setInput(2, *sizes_tensor); + inputSliced->setInput(3, *strides_tensor); + auto inputSliced_output = inputSliced->getOutput(0); + + // Scan through each slice across axis and add it to the running sum + auto loop = TRT_ENGINE_ADD_LAYER(engine_, Loop); + nvinfer1::ITensor* tripLimit = getAxisLength(input_x_tensor, axis); + loop->addTripLimit(*tripLimit, nvinfer1::TripLimit::kCOUNT); + auto iterator = loop->addIterator(*input_x_tensor, axis); + auto data = iterator->getOutput(0); + + // Squeeze inputSliced down to same shape as `data` + auto sliced_dims = inputSliced_output->getDimensions(); + std::vector subscripts(sliced_dims.nbDims); + std::iota(subscripts.begin(), subscripts.end(), 0); + auto p = std::remove_if(subscripts.begin(), + subscripts.end(), + [axis](int x) { return x == axis; }); + subscripts.resize(p - subscripts.begin()); + auto newDims = Gather(Shape(inputSliced_output), subscripts); + inputSliced_output = Reshape(inputSliced_output, newDims); + + // creat ZeroTensor + std::vector zero_vec{0.f}; + auto zero = Add1DConstantLayer(zero_vec); + auto cast = TRT_ENGINE_ADD_LAYER(engine_, Identity, *zero); + cast->setOutputType(0, inputSliced_output->getType()); + + zero = TRT_ENGINE_ADD_LAYER( + engine_, + ElementWise, + *inputSliced_output, + *BroadcastTensors(cast->getOutput(0), inputSliced_output), + nvinfer1::ElementWiseOperation::kPROD) + ->getOutput(0); + + auto runningSum = loop->addRecurrence(*zero); + auto runningSumTensor = runningSum->getOutput(0); + auto curSum = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *data, + *runningSumTensor, + nvinfer1::ElementWiseOperation::kSUM); + runningSum->setInput(1, *curSum->getOutput(0)); + auto reverseFlag = nvinfer1::LoopOutput::kCONCATENATE; + nvinfer1::ILoopOutputLayer* loopOut = + loop->addLoopOutput(*curSum->getOutput(0), reverseFlag, axis); + loopOut->setInput(1, *tripLimit); + RreplenishLayerAndOutput(loopOut, "cumsum", {output_name}, test_mode); +#else + VLOG(3) << "Cumsum is not supported when TensorRT < 7.2.2"; +#endif + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(cumsum, CumsumOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index db19e5c45d3dec..e2dfe4d5ba304c 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -416,6 +416,52 @@ class OpConverter { return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0); } + nvinfer1::ITensor* Reshape(nvinfer1::ITensor* input, + nvinfer1::ITensor* newShape) { + nvinfer1::ITensor* oldShape = Shape(input); + if (oldShape == newShape) { + return input; + } + auto* shuffle = 
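+    // (TensorRT idiom: an IShuffleLayer doubles as a dynamic reshape -- the
+    //  setInput(1, *newShape) call below supplies the target shape as a
+    //  runtime tensor, which is what lets Reshape handle dynamic dims.)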
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + shuffle->setInput(1, *newShape); + return shuffle->getOutput(0); + } + + nvinfer1::ITensor* BroadcastTensor(nvinfer1::ITensor* input, + const int nbDims) { + auto oldShape = Shape(input); + auto oldShapeDims = oldShape->getDimensions(); + const int rank = oldShapeDims.nbDims; + if (rank > nbDims) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot broadcast a higher rank tensor to a lower rank tensor.")); + } + if (rank < nbDims) { + nvinfer1::ITensor* concat_shape_tensor; + auto* one_rank_tensor = + Add1DConstantLayer(std::vector(nbDims - rank, 1)); + std::vector itensors; + itensors.push_back(one_rank_tensor); + itensors.push_back(oldShape); + concat_shape_tensor = Concat(itensors); + input = Reshape(input, concat_shape_tensor); + } + return input; + } + + nvinfer1::ITensor* BroadcastTensors(nvinfer1::ITensor* a, + nvinfer1::ITensor* b) { + const int aDims = a->getDimensions().nbDims; + const int bDims = b->getDimensions().nbDims; + if (aDims == bDims) { + VLOG(3) << "Broadcast two equal rank tensors"; + } + if (aDims > bDims) { + return BroadcastTensor(b, aDims); + } + return BroadcastTensor(a, bDims); + } + // Concat not make rank changed nvinfer1::ITensor* Concat(const std::vector& inputs, int axis = 0) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 24dca82d3fba17..85f5c003746c20 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2705,6 +2705,25 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } + if (op_type == "cumsum") { +#if !IS_TRT_VERSION_GE(7220) + VLOG(3) << "cumsum is not supported when TensorRT < 7.2.2"; + return false; +#endif + if (!with_dynamic_shape) { + VLOG(3) << "the cumsum does not support " + "static shape yet"; + return false; + } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + } + if (op_type == "temporal_shift") { #if !IS_TRT_VERSION_GE(8200) VLOG(3) << "temporal_shift is not supported when TensorRT < 8.2"; @@ -2906,7 +2925,8 @@ struct SimpleOpTypeSetTeller : public Teller { "skip_groupnorm_act", "preln_groupnorm_act", "temporal_shift", - "grid_sampler"}; + "grid_sampler", + "cumsum"}; std::unordered_set teller_set{ "mul", @@ -3064,7 +3084,8 @@ struct SimpleOpTypeSetTeller : public Teller { "skip_groupnorm_act", "preln_groupnorm_act", "temporal_shift", - "grid_sampler"}; + "grid_sampler", + "cumsum"}; }; struct GenericPluginTeller : public Teller { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py new file mode 100644 index 00000000000000..60dbfa37aab227 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cumsum.py @@ -0,0 +1,176 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertCumsum(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7220: + return False + return True + + def sample_program_configs(self): + self.trt_param.workspace_size = 1073741824 + + def generate_input1(): + if self.dims == 2: + self.input_shape = [2, 3] + return np.random.random([2, 3]).astype(np.int32) + elif self.dims == 3: + self.input_shape = [2, 3, 4] + return np.random.random([2, 3, 4]).astype(np.int64) + elif self.dims == 4: + self.input_shape = [4, 3, 32, 32] + return np.random.random([4, 3, 32, 32]).astype(np.float32) - 0.5 + + for dims in [2, 3, 4]: + for axis in range(-1, dims): + for type in ["int32", "int64", "float32", "float64"]: + self.dims = dims + ops_config = [ + { + "op_type": "cumsum", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {"axis": axis, "dtype": type}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + # no op_attrs + for dims in [2, 3, 4]: + self.dims = dims + ops_config = [ + { + "op_type": "cumsum", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(): + + if self.dims == 2: + self.dynamic_shape.min_input_shape = { + "input_data": [2, 3], + } + self.dynamic_shape.max_input_shape = { + "input_data": [2, 3], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3], + } + + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_data": [2, 3, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data": [2, 3, 4], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 4], + } + + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data": [4, 3, 32, 32], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 3, 32, 32], + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7220: + return 0, 3 + return 1, 2 + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield 
self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-2 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From f650e9011cc7018776840378092075344e4ddba5 Mon Sep 17 00:00:00 2001 From: qizhaoaoe <10208099+qizhaoaoe@users.noreply.github.com> Date: Wed, 12 Apr 2023 12:39:39 +0800 Subject: [PATCH 087/156] fix dtype cast in amp for instance_norm. (#52765) * fix dtype cast in amp. * add test case and update docs. * remove set_prim. --- python/paddle/amp/auto_cast.py | 11 ++-- .../unittests/test_instance_norm_op_v2.py | 64 ++++++++++++++++++- python/paddle/static/amp/fp16_utils.py | 2 + 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 33c7855d897243..bc76f866d94eb2 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -213,6 +213,9 @@ def pure_fp16_initialize(models): paddle.nn.BatchNorm3D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm, + paddle.nn.InstanceNorm1D, + paddle.nn.InstanceNorm2D, + paddle.nn.InstanceNorm3D, ), ): continue @@ -522,7 +525,7 @@ def amp_decorate( ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. - When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm and LayerNorm. + When level is O2(pure fp16), the decorate will cast all parameters of models to FP16, except BatchNorm, InstanceNorm and LayerNorm. Commonly, it is used together with `amp_guard` to achieve Pure fp16 in imperative mode. @@ -530,7 +533,7 @@ def amp_decorate( models(Layer|list of Layer, optional): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. level(str, optional): Auto mixed precision level. Accepted values are "O1" and "O2": O1 represent mixed precision, the decorator will do nothing; - O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm and LayerNorm. Default is O1(amp) + O2 represent Pure fp16/bf16, the decorator will cast all parameters of models to FP16/BF16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. @@ -741,7 +744,7 @@ def decorate( ): """ Decorate models and optimizers for auto-mixed-precision. When level is O1(amp), the decorate will do nothing. - When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. + When level is O2(pure float16/bfloat16), the decorate will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. Commonly, it is used together with `auto_cast` to achieve Pure float16/bfloat16 in imperative mode. 
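
For context, the O2 behaviour these docstring updates describe is easy to
sanity-check in a few lines (a minimal sketch, assuming a CUDA device; the
toy model is made up for illustration):

    import paddle
    from paddle import nn

    net = nn.Sequential(nn.Conv2D(3, 8, 3), nn.InstanceNorm2D(8))
    net = paddle.amp.decorate(models=net, level='O2')

    print(net[0].weight.dtype)  # paddle.float16 -- conv params are cast
    print(net[1].scale.dtype)   # paddle.float32 -- norm params are kept

Keeping the normalization parameters in float32 is the usual stability
argument: per-instance mean/variance reductions lose precision quickly in
half precision.
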
@@ -749,7 +752,7 @@ def decorate( models(Layer|list of Layer): The defined models by user, models must be either a single model or a list of models. Default is None. optimizers(Optimizer|list of Optimizer, optional): The defined optimizers by user, optimizers must be either a single optimizer or a list of optimizers. Default is None. level(str, optional): Auto mixed precision level. Accepted values are 'O1' and 'O2': O1 represent mixed precision, the decorator will do nothing; - O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm and LayerNorm. Default is O1(amp) + O2 represent Pure float16/bfloat16, the decorator will cast all parameters of models to float16/bfloat16, except BatchNorm, InstanceNorm and LayerNorm. Default is O1(amp) dtype(str, optional): Whether to use 'float16' or 'bfloat16'. Default is 'float16'. master_weight(bool, optinal): For level='O2', whether to use multi-precision during weight updating. If master_weight is None, in O2 level optimizer will use multi-precision. Default is None. save_dtype(float, optional): The save model parameter dtype when use `paddle.save` or `paddle.jit.save`,it should be float16, bfloat16, float32, float64 or None. diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index d214965b2dd6e6..ab687aeb034f57 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -18,8 +18,9 @@ from eager_op_test import OpTest, convert_float_to_uint16 import paddle -from paddle import fluid -from paddle.fluid import Program, core, program_guard +import paddle.nn.functional as F +from paddle import fluid, nn +from paddle.fluid import Program, core, framework, program_guard class TestInstanceNorm(unittest.TestCase): @@ -319,5 +320,64 @@ def test_check_grad(self): ) +class PrimNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.conv = nn.Conv2D(2, 4, (3, 3), bias_attr=False) + self.instance_norm = nn.InstanceNorm2D(4) + + def forward(self, x): + y = self.conv(x) + out = self.instance_norm(y) + res = F.max_pool2d(out, kernel_size=2, stride=2, padding=0) + return res + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=False) + + +class TestPrimForwardAndBackward(unittest.TestCase): + """ + Test PrimNet with @to_static + amp O2(with fp32) + """ + + def setUp(self): + paddle.seed(2022) + paddle.disable_static() + self.x = paddle.randn([4, 2, 6, 6], dtype="float32") + self.x.stop_gradient = False + + def train(self, use_amp, data_layout="NCHW"): + paddle.seed(2022) + net = PrimNet() + sgd = paddle.optimizer.SGD( + learning_rate=0.1, parameters=net.parameters() + ) + net = apply_to_static(net, False) + if use_amp: + net = paddle.amp.decorate(models=net, level='O2') + with paddle.amp.auto_cast(enable=use_amp, level='O2'): + out = net(self.x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + return loss + + def test_amp_nchw(self): + if not isinstance(framework._current_expected_place(), core.CPUPlace): + expected = self.train(False) + actual = self.train(True) + np.testing.assert_allclose( + expected, + actual, + rtol=1e-3, + atol=1e-3, + ) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/static/amp/fp16_utils.py 
b/python/paddle/static/amp/fp16_utils.py index ced21f9bb758ec..19d287f6fa07dd 100644 --- a/python/paddle/static/amp/fp16_utils.py +++ b/python/paddle/static/amp/fp16_utils.py @@ -99,6 +99,8 @@ def _keep_fp32_input(op, in_name): return in_name != 'X' if op_type == 'layer_norm' and _keep_layer_norm_scale_bias_to_fp32(): return in_name != 'X' + if op_type == 'instance_norm': + return in_name != 'X' if op_type == 'fused_bn_add_activation': return in_name not in {'X', 'Z'} if op_type == 'resnet_unit': From f063074f4827d646141f55680a756bc43bd7d036 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 12 Apr 2023 13:25:20 +0800 Subject: [PATCH 088/156] [API]Fix paddle.arange infershape always -1 (#52764) --- .../paddle/fluid/tests/unittests/test_arange.py | 1 + python/paddle/tensor/creation.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py index a0d1ddc8b9eecc..b8d9866ebc531f 100644 --- a/python/paddle/fluid/tests/unittests/test_arange.py +++ b/python/paddle/fluid/tests/unittests/test_arange.py @@ -151,6 +151,7 @@ def test_out(self): expected_data = np.arange(0, 5, 1).astype(np.float32) self.assertEqual((out == expected_data).all(), True) + self.assertListEqual(list(x1.shape), [5]) class TestArangeImperative(unittest.TestCase): diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 99d9ad594c1196..456e83f816865b 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1293,6 +1293,14 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): end = start start = 0 + out_shape = None + if not in_dygraph_mode() and ( + not isinstance(start, Variable) + and not isinstance(end, Variable) + and not isinstance(step, Variable) + ): + out_shape = [int(math.ceil((end - start) / step))] + if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -1324,13 +1332,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): 'range/arange', ) helper = LayerHelper('range', **locals()) - out_shape = None - if ( - not isinstance(start, Variable) - and not isinstance(end, Variable) - and not isinstance(step, Variable) - ): - out_shape = [int(math.ceil((end - start) / step))] out = helper.create_variable_for_type_inference(dtype, shape=out_shape) helper.append_op( type='range', From 05fd6d10e9dbd0601e660794385b895d694b604d Mon Sep 17 00:00:00 2001 From: CHANGer Date: Wed, 12 Apr 2023 14:05:14 +0800 Subject: [PATCH 089/156] [Auto Parallel]Add the single-node topology detection (#52723) --- .../distributed/auto_parallel/topology.py | 351 ++++++++++++++++++ .../unittests/auto_parallel/test_topology.py | 33 ++ 2 files changed, 384 insertions(+) create mode 100644 python/paddle/distributed/auto_parallel/topology.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py diff --git a/python/paddle/distributed/auto_parallel/topology.py b/python/paddle/distributed/auto_parallel/topology.py new file mode 100644 index 00000000000000..9de045bd612a3d --- /dev/null +++ b/python/paddle/distributed/auto_parallel/topology.py @@ -0,0 +1,351 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import subprocess +import warnings + + +def call_cmd(cmd, err_msg, default_value): + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, + shell=True, + ) + stdout, stderr = process.communicate() + if stderr: + warnings.warn(err_msg) + stdout = default_value + + return stdout + + +class SingleNodeTopology: + def __init__(self): + self.pcie_latency = 0.0 + self.pcie_bandwidth = float('inf') + self.nvlink_bandwidth = -1.0 + self.nb_devices = 8 + + self.machine = {} + self.devices = [] + self.links = [] + self.json_object = None + + def calculate_cpu_flops(self): + # Get number sockets + cmd = "lscpu | grep 'Socket(s)' | awk '{print $NF}'" + err_msg = "Failed to get number of sockets" + default_value = 4 + nb_sockets = call_cmd(cmd, err_msg, default_value) + + # Get number of cores per socket + cmd = "lscpu | grep 'Core(s) per socket' | awk '{print $NF}'" + err_msg = "Failed to get number of cores per socket" + default_value = 20 + nb_cores_per_socket = call_cmd(cmd, err_msg, default_value) + + # Get clock speed + cmd = "lscpu | grep GHz | awk -F '@' '{print $NF}' | awk -F 'G' '{print $1}'" + err_msg = "Failed to get cpu clock rate" + default_value = 2.4 + clock_rate = call_cmd(cmd, err_msg, default_value) + + # Get number of FMA units + # TODO(changtao02): find a way to detect this value + nb_fmas = 2 + + # Get SIMD width + simd_width_sp = 0 + simd_width_dp = 0 + + cmd = "lscpu | grep sse" + err_msg = "Failed to get cpu vector size" + default_value = "sse" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 4 # 128 / 32 + simd_width_dp = 2 # 128 / 64 + + cmd = "lscpu | grep avx2" + err_msg = "Failed to get cpu vector size" + default_value = "avx2" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 8 # 256 / 32 + simd_width_dp = 4 # 256 / 64 + + cmd = "lscpu | grep avx512" + err_msg = "Failed to get cpu vector size" + default_value = "avx512" + vector_size = call_cmd(cmd, err_msg, default_value) + + if vector_size: + simd_width_sp = 16 # 512 / 32 + simd_width_dp = 8 # 512 / 64 + + gflops_per_element = ( + int(nb_sockets) + * int(nb_cores_per_socket) + * float(clock_rate) + * nb_fmas + ) + sp_gflops = gflops_per_element * simd_width_sp + dp_gflops = gflops_per_element * simd_width_dp + + self.machine['sp_gflops'] = sp_gflops + self.machine['dp_gflops'] = dp_gflops + + def pcie_gen2bandwidth(self, pcie_generation): + if pcie_generation == 1: + return 0.25 + elif pcie_generation == 2: + return 0.5 + elif pcie_generation == 3: + return 1.0 + elif pcie_generation == 4: + return 2.0 + elif pcie_generation == 5: + return 4.0 + elif pcie_generation == 6: + return 8.0 + + def model2gflops(self, model): + if "H100" in model and "SXM5" in model: + return 60000, 30000 + elif "H100" in model and "PCIe" in model: + return 48000, 24000 + elif "A100" in model: + return 19500, 9700 + elif "V100" in model: + return 15700, 7800 + elif "P100" in model: + return 10600, 5300 + + def get_link_bandwidth(self, source_id, target_id): + # Get link type + 
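+        # NOTE: the indices below assume the `nvidia-smi topo -m` output
+        # layout: awk counts records/fields from 1, and the matrix has one
+        # header row plus one label column, so GPU i maps to index 2 + i.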
row_id = 2 + source_id + column_id = 2 + target_id + + cmd = ( + "cat matrix.txt | awk 'FNR==" + + str(row_id) + + " {print $" + + str(column_id) + + "}'" + ) + err_msg = "Failed to get topo matrix" + default_value = "NVL" + link_type = call_cmd(cmd, err_msg, default_value) + + link_bandwidth = self.pcie_bandwidth + + if "NV" in link_type: + if self.nvlink_bandwidth == -1.0: + cmd = "nvidia-smi nvlink -s -i 0 | tail -n 1 | awk '{print $3}'" + err_msg = "Failed to get nvlink bandwidth" + default_value = "25" + self.nvlink_bandwidth = float( + call_cmd(cmd, err_msg, default_value) + ) + + link_bandwidth = int(link_type[2:]) * self.nvlink_bandwidth + link_type = "NVL" + + return link_type, link_bandwidth + + def get_host_info(self): + # Get hostname + cmd = "hostname -s" + err_msg = "Failed to get hostname" + default_value = "localhost" + hostname = call_cmd(cmd, err_msg, default_value).strip() + + # Get ip address + cmd = "hostname -i" + err_msg = "Failed to get host ip address" + default_value = "127.0.0.1" + ip_addr = call_cmd(cmd, err_msg, default_value).strip() + + # Get CPU memory (GB) + cmd = "cat /proc/meminfo | grep 'MemAvailable' | awk -F ':' '{print $NF}' | awk '{print $1}'" + err_msg = "Failed to get cpu memory" + default_value = "41366484" + cpu_memory = int(call_cmd(cmd, err_msg, default_value)) // 1e6 + + # Get single-precision and double-precision flops (GFLOPS) + self.calculate_cpu_flops() + + self.machine['hostname'] = hostname + self.machine['addr'] = ip_addr + self.machine['memory'] = cpu_memory + + def get_device_info(self): + # Get device count + cmd = "nvidia-smi -L | wc -l" + err_msg = "Failed to get device count" + default_value = "8" + self.nb_devices = int(call_cmd(cmd, err_msg, default_value)) + + # Get PCIe latency and bandwidth (ms, GB/s) + for i in range(self.nb_devices): + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=pcie.link.gen.max --format=csv,noheader" + ) + err_msg = "Failed to get max pcie link generation" + default_value = "4" + pcie_generation = int(call_cmd(cmd, err_msg, default_value)) + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=pcie.link.width.max --format=csv,noheader" + ) + err_msg = "Failed to get max pcie link width" + default_value = "16" + pcie_width = int(call_cmd(cmd, err_msg, default_value)) + + self.pcie_bandwidth = min( + self.pcie_bandwidth, + self.pcie_gen2bandwidth(pcie_generation) * pcie_width, + ) + + dev_global_ids = [] + dev_local_ids = [] + dev_types = [] + dev_models = [] + dev_memories = [] # GiB + dev_sp_gflops = [] # GFLOPS + dev_dp_gflops = [] # GFLOPS + + # Get device info + for i in range(self.nb_devices): + dev_global_ids.append(i) + dev_local_ids.append(i) + dev_types.append("GPU") + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=name --format=csv,noheader" + ) + err_msg = "Failed to get device name" + default_value = "NVIDIA A100-SXM4-40GB" + dev_models.append(call_cmd(cmd, err_msg, default_value).strip()) + + cmd = ( + "nvidia-smi --id=" + + str(i) + + " --query-gpu=memory.free --format=csv,noheader | awk '{print $1}'" + ) + err_msg = "Failed to get device available memory" + default_value = "40536" + dev_memories.append( + int(call_cmd(cmd, err_msg, default_value)) // 1e3 + ) + + sp_gflops, dp_gflops = self.model2gflops(dev_models[i]) + dev_sp_gflops.append(sp_gflops) + dev_dp_gflops.append(dp_gflops) + + for i in range(len(dev_global_ids)): + device = {} + device['global_id'] = dev_global_ids[i] + device['local_id'] = dev_local_ids[i] + device['type'] = dev_types[i] 
device['model'] = dev_models[i] + device['memory'] = dev_memories[i] + device['sp_gflops'] = dev_sp_gflops[i] + device['dp_gflops'] = dev_dp_gflops[i] + self.devices.append(device) + + self.machine['latency'] = self.pcie_latency + self.machine['bandwidth'] = self.pcie_bandwidth + self.machine['devices'] = self.devices + + def get_link_info(self): + link_source_global_ids = [] + link_target_global_ids = [] + link_types = [] + link_latencies = [] # ms + link_bandwidths = [] # GB/s + + cmd = "nvidia-smi topo -m > matrix.txt" + err_msg = "Failed to get topo matrix" + default_value = "" + call_cmd(cmd, err_msg, default_value) + + # Get link info between devices + for i in range(self.nb_devices): + for j in range(self.nb_devices): + if i == j: + link_types.append("X") + link_bandwidths.append(-1.0) + else: + link_source_global_ids.append(i) + link_target_global_ids.append(j) + link_latencies.append(0.0) + if i > j: + index = j * self.nb_devices + i + link_types.append(link_types[index]) + link_bandwidths.append(link_bandwidths[index]) + elif i < j: + link_type, link_bandwidth = self.get_link_bandwidth( + i, j + ) + link_types.append(link_type) + link_bandwidths.append(link_bandwidth) + + for i in reversed(range(self.nb_devices)): + link_types.pop(i * self.nb_devices + i) + link_bandwidths.pop(i * self.nb_devices + i) + + cmd = "rm matrix.txt" + err_msg = "Failed to delete matrix.txt" + default_value = "" + call_cmd(cmd, err_msg, default_value) + + for i in range(len(link_types)): + link = {} + link['source_global_id'] = link_source_global_ids[i] + link['target_global_id'] = link_target_global_ids[i] + link['type'] = link_types[i] + link['latency'] = link_latencies[i] + link['bandwidth'] = link_bandwidths[i] + self.links.append(link) + + self.machine['links'] = self.links + + def detect(self): + # Get host info + self.get_host_info() + + # Get device info + self.get_device_info() + + # Get link info between devices + self.get_link_info() + + self.json_object = json.dumps(self.machine, indent=4) + print(self.json_object) + + def dump(self, output_path): + with open(output_path, "w") as outfile: + json.dump(self.machine, outfile, indent=4) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py new file mode 100644 index 00000000000000..6807d22ffc3f15 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_topology.py @@ -0,0 +1,33 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from paddle.distributed.auto_parallel.topology import SingleNodeTopology + + +def check_empty_json_object(json_object): + return json_object is not None + + +class TestSingleNodeTopology(unittest.TestCase): + def test_empty_topology_json_object(self): + topo = SingleNodeTopology() + topo.detect() + + self.assertTrue(check_empty_json_object(topo.json_object)) + + +if __name__ == "__main__": + unittest.main() From 2131ee5c54e2dd9ad4948170a24cc59da0c16eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=98=A5=E4=B9=94?= <83450930+Liyulingyue@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:12:45 +0800 Subject: [PATCH 090/156] remove *hccl*.cc (#52798) * remove c_comm_init_hccl_op.cc and c_gen_hccl_id_op.cc * remove gen_hccl_id_op.cc --- .../fluid/operators/collective/CMakeLists.txt | 2 - .../collective/c_comm_init_hccl_op.cc | 82 ---- .../operators/collective/c_gen_hccl_id_op.cc | 75 ---- .../operators/collective/gen_hccl_id_op.cc | 83 ---- .../collective/gen_hccl_id_op_helper.cc | 378 ------------------ .../collective/gen_hccl_id_op_helper.h | 52 --- .../fleet/meta_optimizers/common.py | 26 -- python/paddle/fluid/framework.py | 2 - .../unittests/ir/inference/program_config.py | 2 - 9 files changed, 702 deletions(-) delete mode 100644 paddle/fluid/operators/collective/c_comm_init_hccl_op.cc delete mode 100644 paddle/fluid/operators/collective/c_gen_hccl_id_op.cc delete mode 100644 paddle/fluid/operators/collective/gen_hccl_id_op.cc delete mode 100644 paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc delete mode 100644 paddle/fluid/operators/collective/gen_hccl_id_op_helper.h diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3855733a982718..b356497962689b 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -27,8 +27,6 @@ register_operators( gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op - c_gen_hccl_id_op - gen_hccl_id_op c_gen_cncl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc deleted file mode 100644 index 98bcd78b9dadc6..00000000000000 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { - -class CCommInitOpAscend : public framework::OperatorBase { - public: - CCommInitOpAscend(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(platform::is_npu_place(place), - true, - platform::errors::PreconditionNotMet( - "CCommInitOpAscend can run on npu place only.")); - - auto var = scope.FindVar(Input("X")); - PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::InvalidArgument("Input con not be empty.")); - - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with NPU.")); - } -}; - -class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); - AddComment(R"DOC( -CCommInit operator - -Initialize collective communicatoin context within this trainer -)DOC"); - AddAttr("rank_ids", - "(int) The number of ranks of distributed trainers"); - AddAttr("rank", - "(int) The rank of the trainer in distributed training."); - AddAttr("device_id", - "(int) The deivce_id on which to initialize the communicator." - "Now, you only have to set this attr manually for pipeline " - "training. Otherwise, make it as default.") - .SetDefault(-1); - AddAttr("ring_id", "(int default 0) user specified ring id") - .SetDefault(0); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(c_comm_init_hccl, - ops::CCommInitOpAscend, - ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc deleted file mode 100644 index 130c45dfaad506..00000000000000 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -class CGenHCCLIdOp : public framework::OperatorBase { - public: - CGenHCCLIdOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override {} -}; - -class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - VLOG(3) << "ele"; - AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); - AddComment(R"DOC( -CGenHCCLId operator - -For trainer 0: generate a new UniqueId and send it to all the other trainers. -For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. -)DOC"); - AddAttr("endpoint", - "(string), e.g. 127.0.0.1:6175 " - "current listen endpoint"); - AddAttr>( - "other_endpoints", - "['trainer1_ip:port', 'trainer2_ip:port', ...] " - "list of other trainer endpoints") - .SetDefault({}); - AddAttr("rank", - "(int default 0) " - "The rank of the trainer in distributed training.") - .SetDefault(0); - AddAttr("ring_id", "(int default 0) user specified ring id") - .SetDefault(0); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc deleted file mode 100644 index d472d589de5444..00000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op.cc +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { - -class GenHCCLIdOp : public framework::OperatorBase { - public: - GenHCCLIdOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override {} -}; - -class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); - AddComment(R"DOC( -GenHCCLId operator - -For trainer 0: generate a new UniqueId and send it to all the other trainers. -For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. -)DOC"); - AddAttr>( - "trainers", - "['trainer0_ip:port', 'trainer1_ip:port', ...] " - "list of all trainer endpoints") - .SetDefault({}); - AddAttr("trainer_id", - "(int) " - "The index of the trainer in distributed training."); - AddAttr("hccl_comm_num", - "(int default 1) " - "The number of nccl communicator num.") - .SetDefault(1); - AddAttr("use_hierarchical_allreduce", - "(bool default false) " - "Wheter to use hierarchical allreduce.") - .SetDefault(false); - AddAttr("hierarchical_allreduce_inter_nranks", - "(int default 1) " - "Wheter to use hierarchical allreduce.") - .SetDefault(-1); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc deleted file mode 100644 index 41367305e26669..00000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc +++ /dev/null @@ -1,378 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_type_traits.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/string/split.h" - -DECLARE_int32(get_host_by_name_time); - -namespace paddle { -namespace operators { - -constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; -#define HCCL_UNIQUE_ID_BYTES 1024 - -// Check system calls, such as socket, bind. -#define CHECK_SYS_CALL(call, name) \ - do { \ - int retval; \ - CHECK_SYS_CALL_VAL(call, name, retval); \ - } while (false) - -#define CHECK_SYS_CALL_VAL(call, name, retval) \ - do { \ - RETRY_SYS_CALL_VAL(call, name, retval); \ - if (retval == -1) { \ - PADDLE_THROW(platform::errors::Unavailable( \ - "Call to %s failed: %s", name, strerror(errno))); \ - } \ - } while (false) - -#define RETRY_SYS_CALL_VAL(call, name, retval) \ - do { \ - retval = (call); \ - if (retval == -1 && \ - (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ - << " retry"; \ - } else { \ - break; \ - } \ - } while (true) - -static int SocketSend(int fd, const char* buffer, int size) { - int offset = 0; - int bytes = 0; - while (offset < size) { - bytes = send(fd, buffer + offset, size - offset, 0); - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - // send failed - return -1; - } else { - bytes = 0; - } - } - offset += bytes; - } - return offset; -} - -static int SocketRecv(int fd, char* buffer, int size) { - int offset = 0; - int bytes = 0; - while (offset < size) { - bytes = recv(fd, buffer + offset, size - offset, 0); - if (bytes == 0) { - // closed by client, maybe probing alive client - return 0; - } - if (bytes == -1) { - if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { - return -1; - } else { - bytes = 0; - } - } - offset += bytes; - } - return offset; -} - -static void BindOrConnectFailed(int timeout, - int* try_times, - int* total_time, - const char* op, - const std::string& ep) { - PADDLE_ENFORCE_LT( - *total_time, - timeout, - platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", - op, - ep.c_str(), - strerror(errno))); - ++(*try_times); - int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds - *total_time += retry_time; - - LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times - << " times with reason: " << strerror(errno) << " retry after " - << retry_time / 1000.0 << " seconds"; - std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); -} - -int CreateListenSocket(const std::string& ep) { - auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ( - addr.size(), - 2UL, - platform::errors::InvalidArgument( - "The endpoint should contain host and port, but got %s.", ep)); - std::string host = addr[0]; - int port = std::stoi(addr[1]); - - // creating socket fd - int server_fd = -1; - CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); - - // NOTE. Solutions to `Address already in use`. - // 1. Reuse addr&port. Otherwise, once the server closes the socket - // before client, the server will enter TIME-WAIT status. If we bind port - // again, the error `Address already in use` will appear. - // 2. Or we can close the client first to ensure that the server does - // not enter the TIME-WAIT state. 
But this is obviously not as convenient - // as the reuse method. - int opt = 1; -#if defined(SO_REUSEPORT) - // since Linux kernel 3.9 - CHECK_SYS_CALL(setsockopt(server_fd, - SOL_SOCKET, - SO_REUSEADDR | SO_REUSEPORT, - &opt, - sizeof(opt)), - "setsockopt"); -#else - CHECK_SYS_CALL( - setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), - "setsockopt"); -#endif - - struct sockaddr_in address; - address.sin_family = AF_INET; - address.sin_addr.s_addr = INADDR_ANY; - address.sin_port = htons(port); - - // TODO(wangxi) Set from env, default 900s=15min - int timeout = 900 * 1000; - int try_times = 0; - int total_time = 0; - while (true) { - int ret_val = -1; - RETRY_SYS_CALL_VAL( - bind(server_fd, (struct sockaddr*)&address, sizeof(address)), - "bind", - ret_val); - - if (ret_val == -1) { - BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); - continue; - } - break; - } - - CHECK_SYS_CALL(listen(server_fd, 3), "listen"); - LOG(INFO) << "Server listening on: " << ep << " successful."; - return server_fd; -} - -void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } - -static int SocketAccept(int server_fd, const char* head) { - struct sockaddr_in client_addr; - socklen_t addr_length = sizeof(client_addr); - char buffer[1024] = {0}; - int conn = -1; - - while (true) { - CHECK_SYS_CALL_VAL(accept(server_fd, - reinterpret_cast(&client_addr), - &addr_length), - "accept", - conn); - - int ret_val = SocketRecv(conn, buffer, strlen(head)); - if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { - break; // accept client - } else { - VLOG(3) << "socket read failed with ret_val=" << ret_val; - CloseSocket(conn); - } - } - return conn; -} - -static int ConnectAddr(const std::string& ep, const char* head) { - auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ( - addr.size(), - 2UL, - platform::errors::InvalidArgument( - "The endpoint should contain host and port, but got %s.", ep)); - std::string host = addr[0]; - int port = std::stoi(addr[1]); - - int sock = -1; - CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); - - struct sockaddr_in server_addr; - memset(&server_addr, 0, sizeof(server_addr)); - server_addr.sin_family = AF_INET; - server_addr.sin_port = htons(port); - - char* ip = NULL; - struct hostent* hp = NULL; - // sleep for get_host_by_name_time seconds. 
- for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) { - hp = gethostbyname(host.c_str()); - if (hp != NULL) { - break; - } - std::this_thread::sleep_for(std::chrono::seconds(2)); - LOG(WARNING) << "gethostbyname " << host.c_str() << " error!"; - } - PADDLE_ENFORCE_NOT_NULL( - hp, - platform::errors::InvalidArgument("Fail to get host by name %s.", host)); - - int i = 0; - while (hp->h_addr_list[i] != NULL) { - ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; - break; - } - - PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), - 0, - platform::errors::Unavailable( - "Open address %s failed: %s", ep, strerror(errno))); - - // TODO(wangxi) Set from env, default 900s=15min - int timeout = 900 * 1000; - int try_times = 0; - int total_time = 0; - while (true) { - int ret_val = -1; - RETRY_SYS_CALL_VAL( - connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), - "connect", - ret_val); - - if (ret_val == -1) { - BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); - continue; - } - - CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); - break; - } - return sock; -} - -static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { - char buffer[1024] = {0}; - static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, - "hccl id bytes must <= buffer size"); - - CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), - "recv hccl id"); - memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); -} - -static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { - char buffer[1024] = {0}; - memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); - - CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), - "send hccl id"); -} - -void SendBroadCastHCCLID(std::vector servers, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - // connect with server - std::vector connects; - for (auto server : servers) { - VLOG(3) << "connecting endpoint: " << server; - int conn = ConnectAddr(server, COMM_HEAD); - connects.push_back(conn); - } - VLOG(3) << "connecting completed..."; - - for (int i = 0; i < hccl_comm_num; ++i) { - std::string var_name = func(i); - auto var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); - auto hccl_id = var->GetMutable(); - PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); - - int j = 0; - for (auto conn : connects) { - VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] - << " hccl_comm_no: " << i; - SendHCCLID(conn, hccl_id); - ++j; - } - VLOG(3) << "sending completed..."; - } - - // close client - for (auto conn : connects) { - CloseSocket(conn); - } -} - -void RecvBroadCastHCCLID(std::string endpoint, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - int server = CreateListenSocket(endpoint); - RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); - CloseSocket(server); -} - -void RecvBroadCastHCCLID(int server_fd, - std::string endpoint, - int hccl_comm_num, - std::function func, - const framework::Scope& scope) { - int client = SocketAccept(server_fd, COMM_HEAD); - - for (int i = 0; i < hccl_comm_num; ++i) { - std::string var_name = func(i); - auto var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound("Variable with name %s is not found", - var_name.c_str())); - auto hccl_id = var->GetMutable(); - - VLOG(3) << "trainer: " << endpoint << 
" receiving hccl_id_var: " << var_name - << " from trainer 0, hccl_comm_no: " << i; - RecvHCCLID(client, hccl_id); - } - VLOG(3) << "receiving completed..."; - CloseSocket(client); -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h deleted file mode 100644 index a64a44f9f61665..00000000000000 --- a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { - -int CreateListenSocket(const std::string& ep); - -void CloseSocket(int fd); - -void SendBroadCastHCCLID(std::vector servers, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); - -// server listen on endpoint, then recv nccl id -void RecvBroadCastHCCLID(std::string endpoint, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); - -// recv nccl id from socket -void RecvBroadCastHCCLID(int server_fd, - std::string endpoint, - int nccl_comm_num, - std::function func, - const framework::Scope& scope); -} // namespace operators -} // namespace paddle diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index c9474d397417ac..15bd883e970be3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import paddle from paddle.framework import core @@ -196,31 +195,6 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward, }, ) - elif core.is_compiled_with_custom_device('npu'): - block.append_op( - type='c_gen_hccl_id', - inputs={}, - outputs={'Out': comm_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward, - }, - ) - block.append_op( - type='c_comm_init_hccl', - inputs={'X': comm_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': ring_id, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks, - OP_ROLE_KEY: OpRole.Forward, - }, - ) else: raise ValueError( "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." 
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db17ea368849d1..537abbc50a8a2f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2758,8 +2758,6 @@ class Operator: 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', - 'c_gen_hccl_id', - 'c_comm_init_hccl', 'copy_cross_scope', 'c_gen_cncl_id', } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py index 4591d5512c0926..04e804ea135f76 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/program_config.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/program_config.py @@ -131,8 +131,6 @@ def __repr__(self): 'heter_listen_and_serv', 'c_wait_comm', 'c_wait_compute', - 'c_gen_hccl_id', - 'c_comm_init_hccl', 'copy_cross_scope', } From 895b8737f6622de0fe5117b99626cee2ca81bdb0 Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Wed, 12 Apr 2023 14:25:14 +0800 Subject: [PATCH 091/156] fix prim resnet cinn value (#52813) --- test/prim/model/test_resnet_prim_cinn.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py index 2012d84546e642..eee27720313dec 100644 --- a/test/prim/model/test_resnet_prim_cinn.py +++ b/test/prim/model/test_resnet_prim_cinn.py @@ -94,16 +94,16 @@ ] DY2ST_PRIM_CINN_GT = [ - 5.828784942626953, - 8.34173583984375, - 5.116049289703369, - 8.511833190917969, - 7.9524407386779785, - 7.395752906799316, - 9.666715621948242, - 8.277752876281738, - 8.718518257141113, - 10.199666023254395, + 5.828786849975586, + 8.332868576049805, + 5.038548469543457, + 8.554015159606934, + 8.106254577636719, + 7.493070125579834, + 9.479158401489258, + 8.270158767700195, + 8.324719429016113, + 10.140411376953125, ] if core.is_compiled_with_cuda(): From 523fae593c9babd77745814f90bd28f1ce3c5dcb Mon Sep 17 00:00:00 2001 From: RedContritio Date: Wed, 12 Apr 2023 14:46:34 +0800 Subject: [PATCH 092/156] fix CMakeLists.txt error of incorrect default value (#52780) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9dc6febdfaaa53..ef5d415212eeb5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,7 +257,7 @@ option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) -option(WITH_HETERPS "Compile with heterps" OFF}) +option(WITH_HETERPS "Compile with heterps" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(WITH_INFERENCE_NVTX "Paddle inference with nvtx for profiler" OFF) From 41e37d4c84b053a386b33ce4a863faf99416010e Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Wed, 12 Apr 2023 14:56:42 +0800 Subject: [PATCH 093/156] [CINN] add python.version.cinn_commit api (#52727) * [CINN] add python.version.cinn_commit api * update cinn version get function * fix cinn_commit in setup.py also need len>0 check bug --- python/env_dict.py.in | 4 ++- python/setup.py.in | 52 +++++++++++++++++++++++++++++- setup.py | 75 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/python/env_dict.py.in b/python/env_dict.py.in index 
5b2078c67510c3..00ca04dc56cded 100644 --- a/python/env_dict.py.in +++ b/python/env_dict.py.in @@ -73,5 +73,7 @@ env_dict={ 'JIT_RELEASE_WHL':'@JIT_RELEASE_WHL@', 'WITH_PSLIB':'@WITH_PSLIB@', 'PYBIND_INCLUDE_DIR':'@PYBIND_INCLUDE_DIR@', - 'WITH_PYTHON':'@WITH_PYTHON@' + 'WITH_PYTHON':'@WITH_PYTHON@', + 'WITH_CINN':'@WITH_CINN@', + 'CINN_SOURCE_DIR':'@CINN_SOURCE_DIR@' } diff --git a/python/setup.py.in b/python/setup.py.in index 650a4449b24c61..fa32dcf13c3d51 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -100,6 +100,32 @@ def is_taged(): else: return False +def get_cinn_version(): + if '@WITH_CINN@' != 'ON': + return "False" + + cinn_git_version = 'Unknown' + try: + cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD'] + cinn_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, cwd='@CINN_SOURCE_DIR@').communicate()[0].strip() + if len(cinn_tag) > 0: + cinn_git_version = cinn_tag + except: + pass + + if cinn_git_version == 'Unknown': + try: + cmd = ['git', 'rev-parse', 'HEAD'] + cinn_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE, + cwd='@CINN_SOURCE_DIR@').communicate()[0].strip() + if len(cinn_commit) > 0: + cinn_git_version = cinn_commit + except: + pass + + cinn_git_version = cinn_git_version.decode('utf-8') + return str(cinn_git_version) + def write_version_py(filename='paddle/version/__init__.py'): cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # @@ -115,6 +141,7 @@ xpu_xccl_version = '%(xpu_xccl)s' istaged = %(istaged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' +cinn_version = '%(cinn)s' __all__ = ['cuda', 'cudnn', 'show', 'xpu', 'xpu_xccl'] @@ -143,6 +170,8 @@ def show(): xpu_xccl: the xpu xccl version of package. It will return `False` if non-XPU version paddle package is installed + cinn: the cinn version of package. It will return `False` if paddle package is not compiled with CINN + Examples: .. code-block:: python @@ -159,6 +188,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False # Case 2: paddle is not tagged paddle.version.show() @@ -167,6 +197,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False """ if istaged: print('full_version:', full_version) @@ -180,6 +211,7 @@ ... print('cudnn:', cudnn_version) print('xpu:', xpu_version) print('xpu_xccl:', xpu_xccl_version) + print('cinn:', cinn_version) def mkl(): return with_mkl @@ -251,6 +283,23 @@ def xpu_xccl(): """ return xpu_xccl_version + +def cinn(): + """Get CINN version of paddle package. + + Returns: + string: Return the version information of CINN. If paddle package is not compiled with CINN, it will return False. + + Examples: + .. 
code-block:: python + + import paddle + + paddle.version.cinn() + # False + + """ + return cinn_version ''' commit = git_commit() @@ -275,7 +324,8 @@ def xpu_xccl(): 'xpu_xccl': get_xpu_xccl_version(), 'commit': commit, 'istaged': is_taged(), - 'with_mkl': '@WITH_MKL@'}) + 'with_mkl': '@WITH_MKL@', + 'cinn': get_cinn_version()}) write_version_py(filename='@PADDLE_BINARY_DIR@/python/paddle/version/__init__.py') diff --git a/setup.py b/setup.py index daa9dbd5cc6e40..288500feba8548 100644 --- a/setup.py +++ b/setup.py @@ -427,6 +427,57 @@ def is_taged(): return False +def get_cinn_version(): + if env_dict.get("WITH_CINN") != 'ON': + return "False" + + cinn_git_version = 'Unknown' + # try to get the cinn tag name + try: + cmd = [ + 'git', + 'describe', + '--exact-match', + '--tags', + 'HEAD', + ] + cinn_tag = ( + subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + cwd=env_dict.get("CINN_SOURCE_DIR"), + ) + .communicate()[0] + .strip() + ) + if len(cinn_tag) > 0: + cinn_git_version = cinn_tag + except: + pass + + if cinn_git_version == 'Unknown': + # try to get the cinn commit id + try: + cmd = ['git', 'rev-parse', 'HEAD'] + cinn_commit = ( + subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + cwd=env_dict.get("CINN_SOURCE_DIR"), + ) + .communicate()[0] + .strip() + ) + if len(cinn_commit) > 0: + cinn_git_version = cinn_commit + except: + pass + + cinn_git_version = cinn_git_version.decode('utf-8') + return str(cinn_git_version) + + def write_version_py(filename='paddle/version/__init__.py'): cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY # @@ -442,6 +493,7 @@ def write_version_py(filename='paddle/version/__init__.py'): istaged = %(istaged)s commit = '%(commit)s' with_mkl = '%(with_mkl)s' +cinn_version = '%(cinn)s' __all__ = ['cuda', 'cudnn', 'show', 'xpu', 'xpu_xccl'] @@ -470,6 +522,8 @@ def show(): xpu_xccl: the xpu xccl version of package. It will return `False` if non-XPU version paddle package is installed + cinn: the cinn version of package. It will return `False` if paddle package is not compiled with CINN + Examples: .. code-block:: python @@ -486,6 +540,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False # Case 2: paddle is not tagged paddle.version.show() @@ -494,6 +549,7 @@ def show(): # cudnn: '7.6.5' # xpu: '20230114' # xpu_xccl: '1.0.7' + # cinn: False """ if istaged: print('full_version:', full_version) @@ -507,6 +563,7 @@ def show(): print('cudnn:', cudnn_version) print('xpu:', xpu_version) print('xpu_xccl:', xpu_xccl_version) + print('cinn:', cinn_version) def mkl(): return with_mkl @@ -578,6 +635,23 @@ def xpu_xccl(): """ return xpu_xccl_version + +def cinn(): + """Get CINN version of paddle package. + + Returns: + string: Return the version information of CINN. If paddle package is not compiled with CINN, it will return False. + + Examples: + .. 
code-block:: python + + import paddle + + paddle.version.cinn() + # False + + """ + return cinn_version ''' commit = git_commit() @@ -605,6 +679,7 @@ def xpu_xccl(): 'commit': commit, 'istaged': is_taged(), 'with_mkl': env_dict.get("WITH_MKL"), + 'cinn': get_cinn_version(), } ) From 998235e66a9160d127d204743be9d6dd462c0f3c Mon Sep 17 00:00:00 2001 From: YepKong <48173002+YepKong@users.noreply.github.com> Date: Wed, 12 Apr 2023 15:37:42 +0800 Subject: [PATCH 094/156] add autogen code support for squared_l2_norm_op (#52662) * add autogen code support for squared_l2_norm_op * Update ops.yaml --- paddle/fluid/operators/squared_l2_norm_op.cc | 89 -------------------- paddle/phi/api/yaml/backward.yaml | 10 +++ paddle/phi/api/yaml/legacy_backward.yaml | 10 --- paddle/phi/api/yaml/legacy_ops.yaml | 9 -- paddle/phi/api/yaml/op_compat.yaml | 7 ++ paddle/phi/api/yaml/ops.yaml | 9 ++ paddle/phi/ops/compat/squared_l2_norm_sig.cc | 35 -------- 7 files changed, 26 insertions(+), 143 deletions(-) delete mode 100644 paddle/fluid/operators/squared_l2_norm_op.cc delete mode 100644 paddle/phi/ops/compat/squared_l2_norm_sig.cc diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc deleted file mode 100644 index 2e97f5b9b0dc27..00000000000000 --- a/paddle/fluid/operators/squared_l2_norm_op.cc +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SquaredL2NormOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class SquaredL2NormGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("squared_l2_norm_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("X", this->Input("X")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SquaredL2NormGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) The input of squared_l2_norm op."); - AddOutput("Out", "(Scalar) The output of squared_l2_norm op."); - AddComment(R"DOC( -SquaredL2Norm Operator. - -Computes the squared L2 norm of a tensor. 
- -$$Out = \sum_{i} X_{i}^2$$ - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(squared_l2_norm, - SquaredL2NormInferShapeFunctor, - PD_INFER_META(phi::SquaredL2NormInferMeta)); - -DECLARE_INFER_SHAPE_FUNCTOR(squared_l2_norm_grad, - SquaredL2NormGradInferShapeFunctor, - PD_INFER_META(phi::UnchangedInferMeta)); - -REGISTER_OPERATOR(squared_l2_norm, - ops::SquaredL2NormOp, - ops::SquaredL2NormOpMaker, - ops::SquaredL2NormGradOpMaker, - ops::SquaredL2NormGradOpMaker, - SquaredL2NormInferShapeFunctor); - -REGISTER_OPERATOR(squared_l2_norm_grad, - ops::SquaredL2NormGradOp, - SquaredL2NormGradInferShapeFunctor); diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 7bf3b5cd2fcd89..d288f0bf18f6a8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1718,6 +1718,16 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : squared_l2_norm_grad + forward : squared_l2_norm(Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] + kernel : + func : squared_l2_norm_grad + - backward_op : squeeze_double_grad forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axis) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray axis) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4e21865c23b317..3a67b3e4a3e463 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -962,16 +962,6 @@ invoke : concat( out_grad, axis) composite : split_grad(out_grad, axis, x_grad) -- backward_op : squared_l2_norm_grad - forward : squared_l2_norm(Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [x] - kernel : - func : squared_l2_norm_grad - - backward_op : strided_slice_grad forward : strided_slice (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] axes, IntArray starts, IntArray ends, IntArray strides) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index b075b1935e1bb3..100329f555bea0 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1208,15 +1208,6 @@ func : split_with_num backward : split_with_num_grad -- op : squared_l2_norm - args : (Tensor x) - output : Tensor - infer_meta : - func : SquaredL2NormInferMeta - kernel : - func : squared_l2_norm - backward : squared_l2_norm_grad - - op : strided_slice args : (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e53909aa3fdee9..bfbab2d52af4ea 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2322,3 +2322,10 @@ {x: X, label: Label} outputs : out : Out + +- op: squared_l2_norm + backward: squared_l2_norm_grad + inputs : + x : X + outputs : + out : Out diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index aed95190bcfe59..980505ddeb2f1d 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1765,6 +1765,15 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : squared_l2_norm + args : (Tensor x) + output : Tensor(out) 
+ infer_meta : + func : SquaredL2NormInferMeta + kernel : + func : squared_l2_norm + backward : squared_l2_norm_grad + - op : squeeze args : (Tensor x, IntArray axis={}) output : Tensor(out), Tensor(xshape) diff --git a/paddle/phi/ops/compat/squared_l2_norm_sig.cc b/paddle/phi/ops/compat/squared_l2_norm_sig.cc deleted file mode 100644 index 7b228008f2839d..00000000000000 --- a/paddle/phi/ops/compat/squared_l2_norm_sig.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SquaredL2NormOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("squared_l2_norm", {"X"}, {}, {"Out"}); -} - -KernelSignature SquaredL2NormGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "squared_l2_norm_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm, - phi::SquaredL2NormOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(squared_l2_norm_grad, - phi::SquaredL2NormGradOpArgumentMapping); From a64d50b720b9cf4354a1dd45bdbdfcc639c15b18 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Wed, 12 Apr 2023 15:48:22 +0800 Subject: [PATCH 095/156] Add layer func: float(), half(), bfloat16(). 
(#51635) --- paddle/fluid/pybind/place.cc | 24 +++- paddle/fluid/pybind/pybind.cc | 11 -- python/paddle/amp/__init__.py | 66 +++++++++- python/paddle/nn/layer/layers.py | 176 ++++++++++++++++++++++++++- test/amp/test_layer_convert_dtype.py | 172 ++++++++++++++++++++++++++ 5 files changed, 434 insertions(+), 15 deletions(-) create mode 100644 test/amp/test_layer_convert_dtype.py diff --git a/paddle/fluid/pybind/place.cc b/paddle/fluid/pybind/place.cc index d1d336b5bb0095..aec21c6b0f6292 100644 --- a/paddle/fluid/pybind/place.cc +++ b/paddle/fluid/pybind/place.cc @@ -373,7 +373,16 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 53 support float16 + return platform::GetGPUComputeCapability(place.device) >= 53; + }); + m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { + // Only GPUs with Compute Capability >= 80 support bfloat16 + return platform::GetGPUComputeCapability(place.device) >= 80; + }); +#endif py::class_ xpuplace(m, "XPUPlace", R"DOC( **Note**: Examples: @@ -492,7 +501,18 @@ void BindPlace(pybind11::module &m) { // NOLINT &IsSamePlace) .def("__repr__", string::to_string) .def("__str__", string::to_string); - + m.def("is_float16_supported", + [](const platform::CPUPlace &place) -> bool { return false; }); + m.def("is_bfloat16_supported", [](const platform::CPUPlace &place) -> bool { +#ifndef PADDLE_WITH_MKLDNN + return false; +#else + if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) + return true; + else + return false; +#endif + }); py::class_ cudapinnedplace( m, "CUDAPinnedPlace", R"DOC( CUDAPinnedPlace is a descriptor of a device. diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 65aa609e34fde1..bde6357ccbe2f9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1960,17 +1960,6 @@ All parameter, weight, gradient are variables in Paddle. 
py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { - // Only GPUs with Compute Capability >= 53 support float16 - return platform::GetGPUComputeCapability(place.device) >= 53; - }); - m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { - // Only GPUs with Compute Capability >= 80 support bfloat16 - return platform::GetGPUComputeCapability(place.device) >= 80; - }); -#endif - m.def("set_feed_variable", static_cast= 8.0, + "run test when maximum gpu's compute capability is 8.0.", + ) + def test_unsupported_bfloat16(self): + self.verify_trans_dtype( + test_type='bfloat16', + corrected_dtype=paddle.float32, + ) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] < 8.0, + "run test when gpu's compute capability is at least 8.0.", + ) + def test_supported_bfloat16(self): + self.verify_trans_dtype( + test_type='bfloat16', + corrected_dtype=paddle.bfloat16, + ) + + def test_float32(self): + paddle.set_default_dtype('float16') + self.verify_trans_dtype( + test_type='float32', + corrected_dtype=paddle.float32, + ) + paddle.set_default_dtype('float32') + + def test_excluded_layers_type_error(self): + self.assertRaises( + TypeError, self.verify_trans_dtype, excluded_layers=111 + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "Require compiled with CUDA." +) +class TestSupportedTypeInfo(unittest.TestCase): + def test_cpu(self): + res = paddle.amp.is_float16_supported('cpu') + self.assertEqual(res, False) + res = paddle.amp.is_bfloat16_supported('cpu') + self.assertEqual(res, True) + + def test_gpu_fp16_supported(self): + res = paddle.amp.is_float16_supported() + self.assertEqual(res, True) + res = paddle.amp.is_float16_supported('gpu') + self.assertEqual(res, True) + res = paddle.amp.is_float16_supported('gpu:0') + self.assertEqual(res, True) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] >= 8.0, + "run test when maximum gpu's compute capability is 8.0.", + ) + def test_gpu_bf16_unsupported(self): + res = paddle.amp.is_bfloat16_supported() + self.assertEqual(res, False) + res = paddle.amp.is_bfloat16_supported('gpu') + self.assertEqual(res, False) + + @unittest.skipIf( + not core.is_compiled_with_cuda() + or paddle.device.cuda.get_device_capability()[0] < 8.0, + "run test when gpu's compute capability is at least 8.0.", + ) + def test_gpu_bf16_supported(self): + res = paddle.amp.is_bfloat16_supported() + self.assertEqual(res, True) + res = paddle.amp.is_bfloat16_supported('gpu') + self.assertEqual(res, True) + + def test_device_value_error(self): + self.assertRaises( + ValueError, paddle.amp.is_float16_supported, device='xxx' + ) + self.assertRaises( + ValueError, paddle.amp.is_float16_supported, device=111 + ) + + +if __name__ == '__main__': + unittest.main() From 9f2e30641929eeb35426ff39d1ef41b7c235eb67 Mon Sep 17 00:00:00 2001 From: xiaoguoguo626807 <100397923+xiaoguoguo626807@users.noreply.github.com> Date: Wed, 12 Apr 2023 15:49:26 +0800 Subject: [PATCH 096/156] recover multiply prune (#52713) --- paddle/fluid/eager/auto_code_generator/generator/eager_gen.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index e22355d88d3290..4e105d138b7e8e 100644 --- 
a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -73,7 +73,6 @@
 # backward api's output is usually affected by backward api's input
 special_prune_dict = {
     "matmul_grad": {"x": "grad_y", "y": "grad_x"},
-    "multiply_grad": {"x": "grad_y", "y": "grad_x"},
 }


From cea62c00248385e1a058dff1d94caa6477c4c031 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 12 Apr 2023 16:04:46 +0800
Subject: [PATCH 097/156] Eval during train for ResNet (#52768)

* Eval during train for ResNet
---
 test/prim/model/test_resnet_prim_cinn.py | 78 ++++++++++++++----------
 1 file changed, 47 insertions(+), 31 deletions(-)

diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py
index eee27720313dec..46ea9bfba72a7b 100644
--- a/test/prim/model/test_resnet_prim_cinn.py
+++ b/test/prim/model/test_resnet_prim_cinn.py
@@ -131,31 +131,13 @@ def optimizer_setting(parameter_list=None):
     return optimizer
 
 
-def train(to_static, enable_prim, enable_cinn):
-    if core.is_compiled_with_cuda():
-        paddle.set_device('gpu')
-    else:
-        paddle.set_device('cpu')
-    np.random.seed(SEED)
-    paddle.seed(SEED)
-    paddle.framework.random._manual_program_seed(SEED)
-    fluid.core._set_prim_all_enabled(enable_prim)
-
-    train_reader = paddle.batch(
-        reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
-        batch_size=batch_size,
-        drop_last=True,
-    )
-    data_loader = fluid.io.DataLoader.from_generator(capacity=5, iterable=True)
-    data_loader.set_sample_list_generator(train_reader)
-
-    resnet = resnet50(False)
-    if to_static:
-        build_strategy = paddle.static.BuildStrategy()
-        if enable_cinn:
-            build_strategy.build_cinn_pass = True
-        resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy)
-    optimizer = optimizer_setting(parameter_list=resnet.parameters())
+def run(model, data_loader, optimizer, mode):
+    if mode == 'train':
+        model.train()
+        end_step = 9
+    elif mode == 'eval':
+        model.eval()
+        end_step = 1
 
     for epoch in range(epoch_num):
         total_acc1 = 0.0
@@ -167,7 +149,7 @@ def train(to_static, enable_prim, enable_cinn):
             start_time = time.time()
             img, label = data
 
-            pred = resnet(img)
+            pred = model(img)
             avg_loss = paddle.nn.functional.cross_entropy(
                 input=pred,
                 label=label,
@@ -179,9 +161,10 @@ def train(to_static, enable_prim, enable_cinn):
             acc_top1 = paddle.static.accuracy(input=pred, label=label, k=1)
             acc_top5 = paddle.static.accuracy(input=pred, label=label, k=5)
 
-            avg_loss.backward()
-            optimizer.minimize(avg_loss)
-            resnet.clear_gradients()
+            if mode == 'train':
+                avg_loss.backward()
+                optimizer.minimize(avg_loss)
+                model.clear_gradients()
 
             total_acc1 += acc_top1
             total_acc5 += acc_top5
@@ -190,8 +173,9 @@ def train(to_static, enable_prim, enable_cinn):
             end_time = time.time()
             print(
-                "epoch %d | batch step %d, loss %0.8f, acc1 %0.3f, acc5 %0.3f, time %f"
+                "[%s]epoch %d | batch step %d, loss %0.8f, acc1 %0.3f, acc5 %0.3f, time %f"
                 % (
+                    mode,
                     epoch,
                     batch_id,
                     avg_loss,
@@ -200,7 +184,7 @@ def train(to_static, enable_prim, enable_cinn):
                     end_time - start_time,
                 )
             )
-            if batch_id >= 9:
+            if batch_id >= end_step:
                 # avoid dataloader throwing an abort signal
                 data_loader._reset()
                 break
@@ -208,6 +192,38 @@ def train(to_static, enable_prim, enable_cinn):
     return losses
 
 
+def train(to_static, enable_prim, enable_cinn):
+    if core.is_compiled_with_cuda():
+        paddle.set_device('gpu')
+    else:
+        paddle.set_device('cpu')
+    np.random.seed(SEED)
+    paddle.seed(SEED)
+
paddle.framework.random._manual_program_seed(SEED) + fluid.core._set_prim_all_enabled(enable_prim) + + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True, + ) + data_loader = fluid.io.DataLoader.from_generator(capacity=5, iterable=True) + data_loader.set_sample_list_generator(train_reader) + + resnet = resnet50(False) + if to_static: + build_strategy = paddle.static.BuildStrategy() + if enable_cinn: + build_strategy.build_cinn_pass = True + resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) + optimizer = optimizer_setting(parameter_list=resnet.parameters()) + + train_losses = run(resnet, data_loader, optimizer, 'train') + if to_static and enable_prim and enable_cinn: + eval_losses = run(resnet, data_loader, optimizer, 'eval') + return train_losses + + class TestResnet(unittest.TestCase): @unittest.skipIf( not (paddle.is_compiled_with_cinn() and paddle.is_compiled_with_cuda()), From 8e7c37894f8381d5e9662aa2974c4bbeac9b628e Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Wed, 12 Apr 2023 16:26:20 +0800 Subject: [PATCH 098/156] cache scope in while (#52628) --- .../fluid/operators/controlflow/while_op.cc | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 30fdb90ce10696..4c7578c0104739 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -22,6 +22,13 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif + +PADDLE_DEFINE_EXPORTED_bool( + cache_inference_while_scope, + false, + "Cache the scope of the while op to avoid repeated creation of the scope " + "for each iteration and improve inference performance."); + namespace paddle { namespace framework { class InferShapeContext; @@ -257,14 +264,23 @@ class WhileOp : public framework::OperatorBase { scope.FindVar(Input(kCondition))->Get()); } } else { - auto ¤t_scope = scope.NewScope(); - - BuildScopeForControlFlowOp(*core_, *block, ¤t_scope); - core_->reset_scope(¤t_scope); + framework::Scope *current_scope = nullptr; + if (!FLAGS_cache_inference_while_scope) { + current_scope = &(scope.NewScope()); + BuildScopeForControlFlowOp(*core_, *block, current_scope); + core_->reset_scope(current_scope); + } else { + if (cached_inference_scope_ == nullptr) { + cached_inference_scope_ = &(scope.NewScope()); + BuildScopeForControlFlowOp(*core_, *block, cached_inference_scope_); + core_->reset_scope(cached_inference_scope_); + } + current_scope = cached_inference_scope_; + } while (cond_data) { - for (auto &name : current_scope.LocalVarNames()) { - auto *var = current_scope.Var(name); + for (auto &name : current_scope->LocalVarNames()) { + auto *var = current_scope->Var(name); if (var->IsType()) { // Clear all lod information for all lod_tensors. 
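          // When FLAGS_cache_inference_while_scope keeps this scope (and its
          // tensors) alive across runs, the per-iteration reset below is what
          // prevents stale LoD from a previous step leaking into the next one.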
auto *t = var->GetMutable(); @@ -283,7 +299,9 @@ class WhileOp : public framework::OperatorBase { scope.FindVar(Input(kCondition))->Get()); } - scope.DeleteScope(¤t_scope); + if (!FLAGS_cache_inference_while_scope) { + scope.DeleteScope(current_scope); + } } } @@ -291,6 +309,7 @@ class WhileOp : public framework::OperatorBase { mutable std::shared_ptr executor_{nullptr}; mutable std::unique_ptr ctx_{nullptr}; mutable std::shared_ptr core_{nullptr}; + mutable framework::Scope *cached_inference_scope_{nullptr}; }; class WhileOpMaker : public framework::OpProtoAndCheckerMaker { From c376a9408d76bfa58869d4c35a54bf4b25c28923 Mon Sep 17 00:00:00 2001 From: gouzil <66515297+gouzil@users.noreply.github.com> Date: Wed, 12 Apr 2023 16:53:35 +0800 Subject: [PATCH 099/156] [phi] mv sequence_pool to phi - Step 1 : sequence_pooling_test (#52782) * [phi] mv sequence_pooling_test * [test] fix include --- paddle/fluid/operators/math/CMakeLists.txt | 4 -- test/cpp/phi/kernels/CMakeLists.txt | 5 +++ .../cpp/phi/kernels}/sequence_pooling_test.cc | 43 ++++++++++--------- 3 files changed, 28 insertions(+), 24 deletions(-) rename {paddle/fluid/operators/math => test/cpp/phi/kernels}/sequence_pooling_test.cc (81%) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 6975873b137967..42cb92db8625e3 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -39,10 +39,6 @@ cc_test( vol2col_test SRCS vol2col_test.cc DEPS vol2col) -cc_test( - sequence_pooling_test - SRCS sequence_pooling_test.cc - DEPS sequence_pooling) cc_test( beam_search_test SRCS beam_search_test.cc diff --git a/test/cpp/phi/kernels/CMakeLists.txt b/test/cpp/phi/kernels/CMakeLists.txt index a9e897eb614dcf..3e7f394f186da2 100644 --- a/test/cpp/phi/kernels/CMakeLists.txt +++ b/test/cpp/phi/kernels/CMakeLists.txt @@ -105,3 +105,8 @@ cc_test( sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) + +cc_test( + sequence_pooling_test + SRCS sequence_pooling_test.cc + DEPS sequence_pooling) diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/test/cpp/phi/kernels/sequence_pooling_test.cc similarity index 81% rename from paddle/fluid/operators/math/sequence_pooling_test.cc rename to test/cpp/phi/kernels/sequence_pooling_test.cc index dac5eb63bfc13d..3c12d55ed360f9 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/test/cpp/phi/kernels/sequence_pooling_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include + #include "paddle/fluid/operators/math/sequence_pooling.h" -#include +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/tensor_utils.h" template void TestSequencePoolingSum(const DeviceContext &context, - const paddle::framework::LoD &lod, + const phi::LoD &lod, const int64_t second_dim) { phi::DenseTensor cpu_out_grad; phi::DenseTensor cpu_in_grad; @@ -30,17 +34,17 @@ void TestSequencePoolingSum(const DeviceContext &context, auto out_dims = phi::make_ddim({static_cast(out_first_dim), second_dim}); - cpu_out_grad.mutable_data(out_dims, paddle::platform::CPUPlace()); + cpu_out_grad.mutable_data(out_dims, phi::CPUPlace()); for (int64_t i = 0; i < cpu_out_grad.numel(); ++i) { cpu_out_grad.data()[i] = static_cast(i); } // copy to dst out_grad auto place = context.GetPlace(); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { out_grad = cpu_out_grad; } else { - paddle::framework::TensorCopySync(cpu_out_grad, place, &out_grad); + phi::Copy(context, cpu_out_grad, place, true, &out_grad); } // construct in_grad @@ -53,7 +57,7 @@ void TestSequencePoolingSum(const DeviceContext &context, PADDLE_ENFORCE_EQ( in_grad.dims().size(), out_grad.dims().size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of input and output shall be same. Expected %ld == " "%ld, but got %ld != %ld. Please check the input value.", in_grad.dims().size(), @@ -64,7 +68,7 @@ void TestSequencePoolingSum(const DeviceContext &context, PADDLE_ENFORCE_EQ( in_grad.dims()[i], out_grad.dims()[i], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The dimension of input and output shall be same. Expected %ld == " "%ld, but got %ld != %ld. 
Please check the input value.", in_grad.dims()[i], @@ -77,18 +81,17 @@ void TestSequencePoolingSum(const DeviceContext &context, paddle::operators::math::SequencePoolGradFunctor()( context, "SUM", out_grad, &in_grad); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { cpu_in_grad = in_grad; } else { - paddle::framework::TensorCopySync( - in_grad, paddle::platform::CPUPlace(), &cpu_in_grad); + phi::Copy(context, in_grad, phi::CPUPlace(), true, &cpu_in_grad); cpu_in_grad.set_lod(in_grad.lod()); } EXPECT_EQ(in_grad.numel(), static_cast(lod[0].back() * second_dim)); EXPECT_EQ(in_grad.lod(), lod); - if (paddle::platform::is_cpu_place(place)) { + if (place == phi::CPUPlace()) { for (size_t i = 0; i < in_grad.lod()[0].size() - 1; ++i) { int64_t begin = in_grad.lod()[0][i]; int64_t end = in_grad.lod()[0][i + 1]; @@ -116,30 +119,30 @@ void TestSequencePoolingSum(const DeviceContext &context, } TEST(SequencePoolingGrad, CPU_SUM) { - auto place = paddle::platform::CPUPlace(); + auto place = phi::CPUPlace(); auto *context = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); + phi::DeviceContextPool::Instance().Get(place)); - paddle::framework::LoD lod1; + phi::LoD lod1; lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(*context, lod1, 128); - paddle::framework::LoD lod2; + phi::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(*context, lod2, 128); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(SequencePoolingGrad, CUDA_SUM) { - auto place = paddle::platform::CUDAPlace(0); + auto place = phi::GPUPlace(0); auto *context = static_cast( - paddle::platform::DeviceContextPool::Instance().Get(place)); + phi::DeviceContextPool::Instance().Get(place)); - paddle::framework::LoD lod1; + phi::LoD lod1; lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(*context, lod1, 128); - paddle::framework::LoD lod2; + phi::LoD lod2; lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(*context, lod2, 128); } From 8cbeefea9d7cba2de98574eafa12b87daab7af1e Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Wed, 12 Apr 2023 17:07:33 +0800 Subject: [PATCH 100/156] Optimize performance of unique kernel (#52736) * Optimize performance of unique kernel * fix ci --- paddle/phi/kernels/gpu/unique_kernel.cu | 141 +++++++----------------- 1 file changed, 41 insertions(+), 100 deletions(-) diff --git a/paddle/phi/kernels/gpu/unique_kernel.cu b/paddle/phi/kernels/gpu/unique_kernel.cu index c073708ed85569..10cf1ea8df5343 100644 --- a/paddle/phi/kernels/gpu/unique_kernel.cu +++ b/paddle/phi/kernels/gpu/unique_kernel.cu @@ -30,6 +30,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/unique_functor.h" +#include "paddle/phi/kernels/index_select_kernel.h" namespace phi { @@ -98,76 +99,6 @@ struct BinaryNotEqual { } }; -// index_select() function for DenseTensor -template -void IndexSelect(const Context& context, - const DenseTensor& input, - const DenseTensor& index, - DenseTensor* output, - int dim) { - auto input_dim = input.dims(); - auto input_dim_size = input_dim.size(); - auto output_dim = output->dims(); - - auto slice_size = 1; - for (auto i = dim + 1; i < input_dim_size; i++) { - slice_size *= input_dim[i]; - } - - auto input_width = slice_size * input_dim[dim]; - auto output_width = slice_size * output_dim[dim]; - - auto outer_nums = 1; - for (auto i = 0; i < dim; i++) { - outer_nums *= 
input_dim[i]; - } - - auto index_size = index.dims()[0]; - - std::vector input_vec; - std::vector index_vec; - phi::TensorToVector(input, context, &input_vec); - phi::TensorToVector(index, context, &index_vec); - std::vector out_vec(output->numel()); - - for (int i = 0; i < index_size; i++) { - PADDLE_ENFORCE_GE( - index_vec[i], - 0, - phi::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], - index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], - input_dim[dim], - phi::errors::InvalidArgument( - "Variable value (index) of OP(index_select) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - input_dim[dim], - index_vec[i])); - } - - for (auto i = 0; i < outer_nums; i++) { - auto input_start_offset = i * input_width; - auto output_start_offset = i * output_width; - - for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + j * slice_size + k] = - input_vec[input_start_offset + index_value * slice_size + k]; - } - } - } - context.template Alloc(output); - phi::TensorFromVector(out_vec, context, output); - output->Resize(output_dim); -} - // The core logic of computing Unique for a flattend DenseTensor template [dim1, dim0, dim2] - std::vector permute(in.dims().size()); - std::iota(permute.begin(), permute.end(), 0); - permute[axis] = 0; - permute[0] = axis; - std::vector in_trans_dims_vec(phi::vectorize(in.dims())); - in_trans_dims_vec[axis] = in.dims()[0]; - in_trans_dims_vec[0] = in.dims()[axis]; DenseTensor in_trans; + std::vector in_trans_dims_vec(phi::vectorize(in.dims())); auto in_trans_dims = phi::make_ddim(in_trans_dims_vec); - in_trans.Resize(in_trans_dims); - context.template Alloc(&in_trans); - phi::funcs::TransCompute( - in.dims().size(), // num of dims - context, // device - in, // original DenseTensor - &in_trans, // DenseTensor after reshape - permute); // index of axis - + std::vector permute(in.dims().size()); + bool is_transpose = axis != 0; + if (is_transpose) { + std::iota(permute.begin(), permute.end(), 0); + permute[axis] = 0; + permute[0] = axis; + in_trans_dims_vec[axis] = in.dims()[0]; + in_trans_dims_vec[0] = in.dims()[axis]; + in_trans_dims = phi::make_ddim(in_trans_dims_vec); + in_trans.Resize(in_trans_dims); + context.template Alloc(&in_trans); + phi::funcs::TransCompute( + in.dims().size(), // num of dims + context, // device + in, // original DenseTensor + &in_trans, // DenseTensor after reshape + permute); // index of axis + } else { + in_trans.ShareDataWith(in); + } // Reshape tensor: eg. [dim1, dim0, dim2] -> [dim1, dim0*dim2] auto in_trans_flat_dims = phi::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); @@ -407,22 +343,27 @@ static void UniqueDimsCUDATensor(const Context& context, row); // 3. 
Select indices and reshape back to get 'out' - DenseTensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; out_trans_dims_vec[0] = indices->numel(); - out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); - context.template Alloc(&out_trans); - - IndexSelect(context, in_trans, *indices, &out_trans, 0); - - std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); - out->Resize(phi::make_ddim(out_trans_dims_vec)); - context.template Alloc(out); - std::vector out_trans_unbind = phi::funcs::Unbind(out_trans); - phi::funcs::ConcatFunctor concat_functor; - concat_functor(context, out_trans_unbind, 0, &out_trans); - phi::funcs::TransCompute( - out_trans.dims().size(), context, out_trans, out, permute); + if (is_transpose) { + DenseTensor out_trans; + out_trans.Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(&out_trans); + + phi::IndexSelectKernel( + context, in_trans, *indices, 0, &out_trans); + + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + phi::funcs::TransCompute( + out_trans.dims().size(), context, out_trans, out, permute); + } else { + out->Resize(phi::make_ddim(out_trans_dims_vec)); + context.template Alloc(out); + + phi::IndexSelectKernel(context, in_trans, *indices, 0, out); + } } // functor for processing a flattend DenseTensor From fd97d7d107f043b9389d5f8a424d981d13b820de Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 12 Apr 2023 17:11:33 +0800 Subject: [PATCH 101/156] [IR] Value system && Operation (#51992) * add Value OpResult OpOperand class * add Value OpResult OpOperand class * fix bug * fix bug * add utils * refine code * add ptr offset and reset method * add value impl * fix bug * refine comment of ValueImpl * refine code of OpResult * refine code of Value * add some comment * fix cpu compile bug * refine code * add op * add method for op & test value * refine unittest * refine code by comment * refine code * refine code * refine code * refine code --- paddle/ir/builtin_attribute.h | 9 +- paddle/ir/builtin_attribute_storage.cc | 3 +- paddle/ir/builtin_attribute_storage.h | 4 - paddle/ir/builtin_type_storage.h | 26 ++-- paddle/ir/op_base.h | 37 +++++ paddle/ir/operation.cc | 173 ++++++++++++++++++++++ paddle/ir/operation.h | 57 +++++++ paddle/ir/tests/CMakeLists.txt | 1 + paddle/ir/tests/ir_value_test.cc | 98 +++++++++++++ paddle/ir/tests/type_test.cc | 10 +- paddle/ir/utils.cc | 58 ++++++++ paddle/ir/utils.h | 28 ++++ paddle/ir/value.cc | 183 +++++++++++++++++++++++ paddle/ir/value.h | 137 +++++++++++++++++ paddle/ir/value_impl.h | 196 +++++++++++++++++++++++++ 15 files changed, 988 insertions(+), 32 deletions(-) create mode 100644 paddle/ir/op_base.h create mode 100644 paddle/ir/operation.cc create mode 100644 paddle/ir/operation.h create mode 100644 paddle/ir/tests/ir_value_test.cc create mode 100644 paddle/ir/utils.cc create mode 100644 paddle/ir/utils.h create mode 100644 paddle/ir/value.cc create mode 100644 paddle/ir/value.h create mode 100644 paddle/ir/value_impl.h diff --git a/paddle/ir/builtin_attribute.h b/paddle/ir/builtin_attribute.h index 4572617ea57ec1..82b5f8eb48aa58 100644 --- a/paddle/ir/builtin_attribute.h +++ b/paddle/ir/builtin_attribute.h @@ -16,6 +16,7 @@ #include "paddle/ir/attribute.h" #include "paddle/ir/builtin_attribute_storage.h" +#include "paddle/ir/utils.h" namespace ir { /// @@ -82,15 +83,11 @@ class DictionaryAttribute : public ir::Attribute { } // namespace 
ir namespace std { -static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); -} - template <> struct hash { std::size_t operator()(const ir::NamedAttribute &obj) const { - return hash_combine(std::hash()(obj.name_), - std::hash()(obj.value_)); + return ir::hash_combine(std::hash()(obj.name_), + std::hash()(obj.value_)); } }; } // namespace std diff --git a/paddle/ir/builtin_attribute_storage.cc b/paddle/ir/builtin_attribute_storage.cc index 961319bc4a94ef..c7feacae4d64af 100644 --- a/paddle/ir/builtin_attribute_storage.cc +++ b/paddle/ir/builtin_attribute_storage.cc @@ -14,6 +14,7 @@ #include "paddle/ir/builtin_attribute_storage.h" #include "paddle/ir/builtin_attribute.h" +#include "paddle/ir/utils.h" namespace ir { @@ -32,7 +33,7 @@ DictionaryAttributeStorage::DictionaryAttributeStorage(const ParamKey &key) { std::size_t DictionaryAttributeStorage::HashValue(const ParamKey &key) { std::size_t hash_value = key.size(); for (auto iter = key.begin(); iter != key.end(); ++iter) { - hash_value = hash_combine( + hash_value = ir::hash_combine( hash_value, std::hash()(NamedAttribute(iter->first, iter->second))); } diff --git a/paddle/ir/builtin_attribute_storage.h b/paddle/ir/builtin_attribute_storage.h index a0fdca9f1e10f0..a34648fb17e358 100644 --- a/paddle/ir/builtin_attribute_storage.h +++ b/paddle/ir/builtin_attribute_storage.h @@ -83,10 +83,6 @@ struct DictionaryAttributeStorage : public AttributeStorage { uint32_t size() const { return size_; } private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } - NamedAttribute *data_; uint32_t size_; }; diff --git a/paddle/ir/builtin_type_storage.h b/paddle/ir/builtin_type_storage.h index 876b6ceeffdceb..132a1656a7975d 100644 --- a/paddle/ir/builtin_type_storage.h +++ b/paddle/ir/builtin_type_storage.h @@ -17,6 +17,7 @@ #include #include "paddle/ir/type.h" +#include "paddle/ir/utils.h" namespace std { /// @@ -109,20 +110,22 @@ struct DenseTensorTypeStorage : public ir::TypeStorage { std::size_t hash_value = 0; // hash dtype hash_value = - hash_combine(hash_value, std::hash()(std::get<0>(key))); + ir::hash_combine(hash_value, std::hash()(std::get<0>(key))); // hash dims - hash_value = hash_combine(hash_value, std::hash()(std::get<1>(key))); - // hash layout hash_value = - hash_combine(hash_value, - std::hash::type>()( - static_cast::type>( - std::get<2>(key)))); + ir::hash_combine(hash_value, std::hash()(std::get<1>(key))); + // hash layout + hash_value = ir::hash_combine( + hash_value, + std::hash::type>()( + static_cast::type>( + std::get<2>(key)))); // hash lod - hash_value = hash_combine(hash_value, std::hash()(std::get<3>(key))); + hash_value = + ir::hash_combine(hash_value, std::hash()(std::get<3>(key))); // hash offset hash_value = - hash_combine(hash_value, std::hash()(std::get<4>(key))); + ir::hash_combine(hash_value, std::hash()(std::get<4>(key))); return hash_value; } @@ -146,11 +149,6 @@ struct DenseTensorTypeStorage : public ir::TypeStorage { DataLayout layout_; LoD lod_; size_t offset_; - - private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } }; } // namespace ir diff --git a/paddle/ir/op_base.h b/paddle/ir/op_base.h new file mode 100644 index 00000000000000..38ff4002c6b2b3 --- /dev/null +++ b/paddle/ir/op_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/ir/operation.h"
+
+namespace ir {
+class OpBase {
+ public:
+  Operation *operation() { return operation_; }
+
+  explicit operator bool() { return operation() != nullptr; }
+
+  operator Operation *() const { return operation_; }
+
+  Operation *operator->() const { return operation_; }
+
+ protected:
+  explicit OpBase(Operation *operation) : operation_(operation) {}
+
+ private:
+  Operation *operation_;
+};
+
+}  // namespace ir
diff --git a/paddle/ir/operation.cc b/paddle/ir/operation.cc
new file mode 100644
index 00000000000000..e9d727f1b5fb34
--- /dev/null
+++ b/paddle/ir/operation.cc
@@ -0,0 +1,173 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/ir/operation.h"
+#include "paddle/ir/utils.h"
+
+namespace ir {
+// Allocate the required memory based on the size and number of inputs, outputs,
+// and operators, and construct it in the order of: OpOutlineResult,
+// OpInlineResult, Operation, Operand.
+Operation *Operation::create(const std::vector<ir::OpResult> &inputs,
+                             const std::vector<ir::Type> &output_types,
+                             ir::DictionaryAttribute attribute) {
+  // 1. Calculate the required memory size for OpResults + Operation +
+  // OpOperands.
+  uint32_t num_results = output_types.size();
+  uint32_t num_operands = inputs.size();
+  uint32_t max_inline_result_num =
+      detail::OpResultImpl::GetMaxInlineResultIndex() + 1;
+  size_t result_mem_size =
+      num_results > max_inline_result_num
+          ? sizeof(detail::OpOutlineResultImpl) *
+                    (num_results - max_inline_result_num) +
+                sizeof(detail::OpInlineResultImpl) * max_inline_result_num
+          : sizeof(detail::OpInlineResultImpl) * num_results;
+  size_t operand_mem_size = sizeof(detail::OpOperandImpl) * num_operands;
+  size_t op_mem_size = sizeof(Operation);
+  size_t base_size = result_mem_size + op_mem_size + operand_mem_size;
+  // 2. Malloc memory.
+  char *base_ptr = reinterpret_cast<char *>(aligned_malloc(base_size, 8));
+  // 3.1. Construct OpResults.
+  for (size_t idx = num_results; idx > 0; idx--) {
+    if (idx > max_inline_result_num) {
+      new (base_ptr)
+          detail::OpOutlineResultImpl(output_types[idx - 1], idx - 1);
+      base_ptr += sizeof(detail::OpOutlineResultImpl);
+    } else {
+      new (base_ptr) detail::OpInlineResultImpl(output_types[idx - 1], idx - 1);
+      base_ptr += sizeof(detail::OpInlineResultImpl);
+    }
+  }
+  // 3.2. Construct Operation.
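+  // base_ptr now points just past the last OpResultImpl, so the Operation
+  // constructed here sits at a fixed offset from each of its results;
+  // GetResultByIndex later inverts this layout with pure pointer arithmetic
+  // instead of a stored result array.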
+  Operation *op =
+      new (base_ptr) Operation(num_results, num_operands, attribute);
+  base_ptr += sizeof(Operation);
+  // 3.3. Construct OpOperands.
+  if ((reinterpret_cast<uintptr_t>(base_ptr) & 0x7) != 0) {
+    throw("The address of OpOperandImpl must be divisible by 8.");
+  }
+  for (size_t idx = 0; idx < num_operands; idx++) {
+    new (base_ptr) detail::OpOperandImpl(inputs[idx].impl_, op);
+    base_ptr += sizeof(detail::OpOperandImpl);
+  }
+  VLOG(4) << "Construct an Operation: " << op->print();
+  return op;
+}
+
+// Call destructors for OpResults, Operation, and OpOperands in sequence, and
+// finally free memory.
+void Operation::destroy() {
+  // 1. Get aligned_ptr by result_num.
+  uint32_t max_inline_result_num =
+      detail::OpResultImpl::GetMaxInlineResultIndex() + 1;
+  size_t result_mem_size =
+      num_results_ > max_inline_result_num
+          ? sizeof(detail::OpOutlineResultImpl) *
+                    (num_results_ - max_inline_result_num) +
+                sizeof(detail::OpInlineResultImpl) * max_inline_result_num
+          : sizeof(detail::OpInlineResultImpl) * num_results_;
+  char *aligned_ptr = reinterpret_cast<char *>(this) - result_mem_size;
+  // 2.1. Deconstruct OpResult.
+  char *base_ptr = aligned_ptr;
+  for (size_t idx = num_results_; idx > 0; idx--) {
+    if (!reinterpret_cast<detail::OpResultImpl *>(base_ptr)->use_empty()) {
+      throw("Cannot destroy a value that still has uses!");
+    }
+    if (idx > max_inline_result_num) {
+      reinterpret_cast<detail::OpOutlineResultImpl *>(base_ptr)
+          ->~OpOutlineResultImpl();
+      base_ptr += sizeof(detail::OpOutlineResultImpl);
+    } else {
+      reinterpret_cast<detail::OpInlineResultImpl *>(base_ptr)
+          ->~OpInlineResultImpl();
+      base_ptr += sizeof(detail::OpInlineResultImpl);
+    }
+  }
+  // 2.2. Deconstruct Operation.
+  if (reinterpret_cast<uintptr_t>(base_ptr) !=
+      reinterpret_cast<uintptr_t>(this)) {
+    throw("Operation address error");
+  }
+  reinterpret_cast<Operation *>(base_ptr)->~Operation();
+  base_ptr += sizeof(Operation);
+  // 2.3. Deconstruct OpOperand.
+  for (size_t idx = 0; idx < num_operands_; idx++) {
+    reinterpret_cast<detail::OpOperandImpl *>(base_ptr)->~OpOperandImpl();
+    base_ptr += sizeof(detail::OpOperandImpl);
+  }
+  // 3. Free memory.
+  VLOG(4) << "Destroy an Operation: {ptr = "
+          << reinterpret_cast<void *>(aligned_ptr)
+          << ", size = " << result_mem_size << "}";
+  aligned_free(reinterpret_cast<void *>(aligned_ptr));
+}
+
+Operation::Operation(uint32_t num_results,
+                     uint32_t num_operands,
+                     ir::DictionaryAttribute attribute) {
+  if (!attribute) {
+    throw("unexpected null attribute dictionary");
+  }
+  num_results_ = num_results;
+  num_operands_ = num_operands;
+  attribute_ = attribute;
+}
+
+ir::OpResult Operation::GetResultByIndex(uint32_t index) {
+  if (index >= num_results_) {
+    throw("index exceeds OP output range.");
+  }
+  uint32_t max_inline_idx = detail::OpResultImpl::GetMaxInlineResultIndex();
+  char *ptr = nullptr;
+  if (index > max_inline_idx) {
+    ptr = reinterpret_cast<char *>(this) -
+          (max_inline_idx + 1) * sizeof(detail::OpInlineResultImpl) -
+          (index - max_inline_idx) * sizeof(detail::OpOutlineResultImpl);
+  } else {
+    ptr = reinterpret_cast<char *>(this) -
+          (index + 1) * sizeof(detail::OpInlineResultImpl);
+  }
+  if (index > max_inline_idx) {
+    detail::OpOutlineResultImpl *result_impl_ptr =
+        reinterpret_cast<detail::OpOutlineResultImpl *>(ptr);
+    return ir::OpResult(result_impl_ptr);
+  } else {
+    detail::OpInlineResultImpl *result_impl_ptr =
+        reinterpret_cast<detail::OpInlineResultImpl *>(ptr);
+    return ir::OpResult(result_impl_ptr);
+  }
+}
+
+std::string Operation::print() {
+  std::stringstream result;
+  result << "{ " << num_results_ << " outputs, " << num_operands_
+         << " inputs } : ";
+  result << "[ ";
+  for (size_t idx = num_results_; idx > 0; idx--) {
+    result << GetResultByIndex(idx - 1).impl_ << ", ";
+  }
+  result << "] = ";
+  result << this << "( ";
+  for (size_t idx = 0; idx < num_operands_; idx++) {
+    result << reinterpret_cast<void *>(reinterpret_cast<char *>(this) +
+                                       sizeof(Operation) +
+                                       idx * sizeof(detail::OpOperandImpl))
+           << ", ";
+  }
+  result << ")";
+  return result.str();
+}
+
+}  // namespace ir
diff --git a/paddle/ir/operation.h b/paddle/ir/operation.h
new file mode 100644
index 00000000000000..924dcafb73dfc4
--- /dev/null
+++ b/paddle/ir/operation.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/ir/builtin_attribute.h"
+#include "paddle/ir/type.h"
+#include "paddle/ir/value_impl.h"
+
+namespace ir {
+
+class alignas(8) Operation final {
+ public:
+  ///
+  /// \brief Malloc memory and construct objects in the following order:
+  /// OpResultImpls|Operation|OpOperandImpls.
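+  /// Packing results, the Operation, and operands into one aligned
+  /// allocation means each piece can be recovered from the others by
+  /// pointer offset alone, with no extra bookkeeping arrays.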
+ /// + static Operation *create(const std::vector &inputs, + const std::vector &output_types, + ir::DictionaryAttribute attribute); + + void destroy(); + + ir::OpResult GetResultByIndex(uint32_t index); + + std::string print(); + + ir::DictionaryAttribute attribute() { return attribute_; } + + uint32_t num_results() { return num_results_; } + + uint32_t num_operands() { return num_operands_; } + + private: + Operation(uint32_t num_results, + uint32_t num_operands, + ir::DictionaryAttribute attribute); + + ir::DictionaryAttribute attribute_; + + uint32_t num_results_ = 0; + + uint32_t num_operands_ = 0; +}; + +} // namespace ir diff --git a/paddle/ir/tests/CMakeLists.txt b/paddle/ir/tests/CMakeLists.txt index d94789fd056824..e012ec5bd264d1 100644 --- a/paddle/ir/tests/CMakeLists.txt +++ b/paddle/ir/tests/CMakeLists.txt @@ -1,2 +1,3 @@ cc_test_old(type_test SRCS type_test.cc DEPS new_ir gtest) cc_test_old(ir_attribute_test SRCS ir_attribute_test.cc DEPS new_ir gtest) +cc_test_old(ir_value_test SRCS ir_value_test.cc DEPS new_ir gtest) diff --git a/paddle/ir/tests/ir_value_test.cc b/paddle/ir/tests/ir_value_test.cc new file mode 100644 index 00000000000000..c04e7c35128f46 --- /dev/null +++ b/paddle/ir/tests/ir_value_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/ir/attribute.h" +#include "paddle/ir/builtin_attribute.h" +#include "paddle/ir/builtin_type.h" +#include "paddle/ir/ir_context.h" +#include "paddle/ir/operation.h" + +// This unittest is used to test the construction interfaces of value class and +// operation. The constructed test scenario is: a = OP1(); b = OP2(); c = OP3(a, +// b); d, e, f, g, h, i, j = OP4(a, c); + +ir::DictionaryAttribute CreateAttribute(std::string attribute_name, + std::string attribute) { + ir::IrContext *ctx = ir::IrContext::Instance(); + ir::StrAttribute attr_name = ir::StrAttribute::get(ctx, attribute_name); + ir::Attribute attr_value = ir::StrAttribute::get(ctx, attribute); + std::map named_attr; + named_attr.insert( + std::pair(attr_name, attr_value)); + return ir::DictionaryAttribute::get(ctx, named_attr); +} + +TEST(value_test, value_test) { + ir::IrContext *ctx = ir::IrContext::Instance(); + // 1. Construct OP1: a = OP1() + std::vector op1_inputs = {}; + std::vector op1_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op1 = ir::Operation::create( + op1_inputs, op1_output_types, CreateAttribute("op1_name", "op1_attr")); + std::cout << op1->print() << std::endl; + // 2. Construct OP2: b = OP2(); + std::vector op2_inputs = {}; + std::vector op2_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op2 = ir::Operation::create( + op2_inputs, op2_output_types, CreateAttribute("op2_name", "op2_attr")); + std::cout << op2->print() << std::endl; + // 3. 
Construct OP3: c = OP3(a, b); + std::vector op3_inputs = {op1->GetResultByIndex(0), + op2->GetResultByIndex(0)}; + std::vector op3_output_types = {ir::Float32Type::get(ctx)}; + ir::Operation *op3 = ir::Operation::create( + op3_inputs, op3_output_types, CreateAttribute("op3_name", "op3_attr")); + std::cout << op3->print() << std::endl; + // 4. Construct OP4: d, e, f, g, h, i, j = OP4(a, c); + std::vector op4_inputs = {op1->GetResultByIndex(0), + op3->GetResultByIndex(0)}; + std::vector op4_output_types; + for (size_t i = 0; i < 7; i++) { + op4_output_types.push_back(ir::Float32Type::get(ctx)); + } + ir::Operation *op4 = ir::Operation::create( + op4_inputs, op4_output_types, CreateAttribute("op4_name", "op4_attr")); + std::cout << op4->print() << std::endl; + + // Test 1: + EXPECT_EQ(op1->GetResultByIndex(0).GetDefiningOp(), op1); + EXPECT_EQ(op2->GetResultByIndex(0).GetDefiningOp(), op2); + EXPECT_EQ(op3->GetResultByIndex(0).GetDefiningOp(), op3); + EXPECT_EQ(op4->GetResultByIndex(6).GetDefiningOp(), op4); + + // Test 2: op1_first_output -> op4_first_input + ir::OpResult op1_first_output = op1->GetResultByIndex(0); + ir::detail::OpOperandImpl *op4_first_input = + reinterpret_cast( + reinterpret_cast(op4) + sizeof(ir::Operation)); + EXPECT_EQ(static_cast(op1_first_output).impl()->first_use(), + op4_first_input); + ir::detail::OpOperandImpl *op3_first_input = + reinterpret_cast( + reinterpret_cast(op3) + sizeof(ir::Operation)); + EXPECT_EQ(op4_first_input->next_use(), op3_first_input); + EXPECT_EQ(op3_first_input->next_use(), nullptr); + + // destroy + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op4->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op3->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op2->destroy(); + std::cout << op1->GetResultByIndex(0).print_ud_chain() << std::endl; + op1->destroy(); +} diff --git a/paddle/ir/tests/type_test.cc b/paddle/ir/tests/type_test.cc index a11040e3656a53..d21afdcb80a59f 100644 --- a/paddle/ir/tests/type_test.cc +++ b/paddle/ir/tests/type_test.cc @@ -21,6 +21,7 @@ #include "paddle/ir/ir_context.h" #include "paddle/ir/type.h" #include "paddle/ir/type_base.h" +#include "paddle/ir/utils.h" TEST(type_test, type_id) { // Define two empty classes, just for testing. @@ -172,8 +173,8 @@ struct IntegerTypeStorage : public ir::TypeStorage { using ParamKey = std::pair; static std::size_t HashValue(const ParamKey &key) { - return hash_combine(std::hash()(std::get<0>(key)), - std::hash()(std::get<1>(key))); + return ir::hash_combine(std::hash()(std::get<0>(key)), + std::hash()(std::get<1>(key))); } bool operator==(const ParamKey &key) const { @@ -188,11 +189,6 @@ struct IntegerTypeStorage : public ir::TypeStorage { unsigned width_ : 30; unsigned signedness_ : 2; - - private: - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } }; // Customize a parameterized type: IntegerType, storage type is diff --git a/paddle/ir/utils.cc b/paddle/ir/utils.cc new file mode 100644 index 00000000000000..9e6f1fcaf57901 --- /dev/null +++ b/paddle/ir/utils.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/ir/utils.h" + +namespace ir { +std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { + return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); +} + +void *aligned_malloc(size_t size, size_t alignment) { + assert(alignment >= sizeof(void *) && (alignment & (alignment - 1)) == 0); + size = (size + alignment - 1) / alignment * alignment; +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + void *aligned_mem = nullptr; + if (posix_memalign(&aligned_mem, alignment, size) != 0) { + aligned_mem = nullptr; + } + return aligned_mem; +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + void *mem = malloc(size + alignment); + if (mem == nullptr) { + return nullptr; + } + size_t adjust = alignment - reinterpret_cast(mem) % alignment; + void *aligned_mem = reinterpret_cast(mem) + adjust; + *(reinterpret_cast(aligned_mem) - 1) = mem; + assert(reinterpret_cast(aligned_mem) % alignment == 0); + return aligned_mem; +#endif +} + +void aligned_free(void *mem_ptr) { +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + free(mem_ptr); +#elif defined(_WIN32) + _aligned_free(mem_ptr); +#else + if (mem_ptr) { + free(*(reinterpret_cast(mem_ptr) - 1)); + } +#endif +} + +} // namespace ir diff --git a/paddle/ir/utils.h b/paddle/ir/utils.h new file mode 100644 index 00000000000000..b4dd00281e1598 --- /dev/null +++ b/paddle/ir/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +namespace ir { +std::size_t hash_combine(std::size_t lhs, std::size_t rhs); + +void *aligned_malloc(size_t size, size_t alignment); + +void aligned_free(void *mem_ptr); + +} // namespace ir diff --git a/paddle/ir/value.cc b/paddle/ir/value.cc new file mode 100644 index 00000000000000..f5ecc41018bcf1 --- /dev/null +++ b/paddle/ir/value.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
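+//
+// value.cc supplies the out-of-line definitions for the lightweight handle
+// classes (OpOperand, Value, OpResult) declared in value.h, together with
+// the use-list maintenance logic of their Impl classes.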
+ +#include "paddle/ir/value.h" +#include "paddle/ir/value_impl.h" + +namespace ir { +// Operand +OpOperand::OpOperand(const detail::OpOperandImpl *impl) + : impl_(const_cast(impl)) {} + +OpOperand &OpOperand::operator=(const OpOperand &rhs) { + if (this == &rhs) return *this; + impl_ = rhs.impl_; + return *this; +} + +OpOperand &OpOperand::operator=(const detail::OpOperandImpl *impl) { + if (this->impl_ == impl) return *this; + impl_ = const_cast(impl); + return *this; +} + +bool OpOperand::operator==(OpOperand other) const { + return impl_ == other.impl_; +} + +bool OpOperand::operator!=(OpOperand other) const { + return impl_ != other.impl_; +} + +bool OpOperand::operator!() const { return impl_ == nullptr; } + +OpOperand::operator bool() const { return impl_; } + +detail::OpOperandImpl *OpOperand::impl() const { return impl_; } + +// Value +Value::Value(const detail::ValueImpl *impl) + : impl_(const_cast(impl)) {} + +bool Value::operator==(const Value &other) const { + return impl_ == other.impl_; +} + +bool Value::operator!=(const Value &other) const { + return impl_ != other.impl_; +} + +bool Value::operator!() const { return impl_ == nullptr; } + +Value::operator bool() const { return impl_; } + +detail::ValueImpl *Value::impl() const { return impl_; } + +ir::Type Value::type() const { return impl_->type(); } + +void Value::SetType(ir::Type type) { impl_->SetType(type); } + +Operation *Value::GetDefiningOp() const { + if (auto result = dyn_cast()) return result.owner(); + return nullptr; +} + +std::string Value::print_ud_chain() { return impl_->print_ud_chain(); } + +// OpResult +bool OpResult::classof(Value value) { + return ir::isa(value.impl()); +} + +Operation *OpResult::owner() const { return impl()->owner(); } + +uint32_t OpResult::GetResultIndex() const { return impl()->GetResultIndex(); } + +detail::OpResultImpl *OpResult::impl() const { + return reinterpret_cast(impl_); +} + +uint32_t OpResult::GetValidInlineIndex(uint32_t index) { + uint32_t max_inline_index = + ir::detail::OpResultImpl::GetMaxInlineResultIndex(); + return index <= max_inline_index ? index : max_inline_index; +} + +// details +namespace detail { +ir::Operation *OpOperandImpl::owner() const { return owner_; } + +ir::detail::OpOperandImpl *OpOperandImpl::next_use() { return next_use_; } + +OpOperandImpl::OpOperandImpl(ir::Value source, ir::Operation *owner) + : source_(source), owner_(owner) { + prev_use_addr_ = source.impl()->first_use_addr(); + next_use_ = source.impl()->first_use(); + if (next_use_) { + next_use_->prev_use_addr_ = &next_use_; + } + source.impl()->SetFirstUse(this); +} + +void OpOperandImpl::remove_from_ud_chain() { + if (!prev_use_addr_) return; + if (prev_use_addr_ == source_.impl()->first_use_addr()) { + /// NOTE: In ValueImpl, first_use_offseted_by_index_ use lower three bits + /// storage index information, so need to be updated using the SetFirstUse + /// method here. 
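+    /// Writing through prev_use_addr_ here would clobber those packed index
+    /// bits, so this branch must go through SetFirstUse instead.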
+ source_.impl()->SetFirstUse(next_use_); + } else { + *prev_use_addr_ = next_use_; + } + if (next_use_) { + next_use_->prev_use_addr_ = prev_use_addr_; + } +} + +OpOperandImpl::~OpOperandImpl() { remove_from_ud_chain(); } + +uint32_t ValueImpl::index() const { + uint32_t index = + reinterpret_cast(first_use_offseted_by_index_) & 0x07; + if (index < 6) return index; + return reinterpret_cast(const_cast(this)) + ->GetResultIndex(); +} + +std::string ValueImpl::print_ud_chain() { + std::stringstream result; + result << "Value[" << this << "] -> "; + OpOperandImpl *tmp = first_use(); + if (tmp) { + result << "OpOperand[" << reinterpret_cast(tmp) << "] -> "; + while (tmp->next_use() != nullptr) { + result << "OpOperand[" << reinterpret_cast(tmp->next_use()) + << "] -> "; + tmp = tmp->next_use(); + } + } + result << "nullptr"; + return result.str(); +} + +uint32_t OpResultImpl::GetResultIndex() const { + if (const auto *outline_result = ir::dyn_cast(this)) { + return outline_result->GetResultIndex(); + } + return ir::dyn_cast(this)->GetResultIndex(); +} + +ir::Operation *OpResultImpl::owner() const { + // For inline result, pointer offset index to obtain the address of op. + if (const auto *result = ir::dyn_cast(this)) { + result += result->GetResultIndex() + 1; + return reinterpret_cast( + const_cast(result)); + } + // For outline result, pointer offset outline_index to obtain the address of + // maximum inline result. + const OpOutlineResultImpl *outline_result = + (const OpOutlineResultImpl *)(this); + outline_result += + (outline_result->outline_index_ - GetMaxInlineResultIndex()); + // The offset of the maximum inline result distance op is + // GetMaxInlineResultIndex. + const auto *inline_result = + reinterpret_cast(outline_result); + inline_result += (GetMaxInlineResultIndex() + 1); + return reinterpret_cast( + const_cast(inline_result)); +} +} // namespace detail +} // namespace ir diff --git a/paddle/ir/value.h b/paddle/ir/value.h new file mode 100644 index 00000000000000..3d197182cd6e9d --- /dev/null +++ b/paddle/ir/value.h @@ -0,0 +1,137 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/cast_utils.h" +#include "paddle/ir/type.h" + +namespace ir { +class Operation; + +namespace detail { +class OpOperandImpl; +class ValueImpl; +class OpResultImpl; +} // namespace detail + +/// +/// \brief OpOperand class represents the operand of operation. This class only +/// provides interfaces, for specific implementation, see Impl class. 
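+/// OpOperand is a cheap, copyable handle: it holds a single pointer to the
+/// OpOperandImpl stored inside the owning Operation's allocation.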
+/// +class OpOperand { + public: + OpOperand() = default; + + OpOperand(const OpOperand &other) = default; + + OpOperand(const detail::OpOperandImpl *impl); // NOLINT + + OpOperand &operator=(const OpOperand &rhs); + + OpOperand &operator=(const detail::OpOperandImpl *impl); + + bool operator==(OpOperand other) const; + + bool operator!=(OpOperand other) const; + + bool operator!() const; + + explicit operator bool() const; + + detail::OpOperandImpl *impl() const; + + private: + detail::OpOperandImpl *impl_{nullptr}; +}; + +/// +/// \brief Value class represents the SSA value in the IR system. This class +/// only provides interfaces, for specific implementation, see Impl class. +/// +class Value { + public: + Value() = default; + + Value(const detail::ValueImpl *impl); // NOLINT + + Value(const Value &other) = default; + + bool operator==(const Value &other) const; + + bool operator!=(const Value &other) const; + + bool operator!() const; + + explicit operator bool() const; + + template + bool isa() const { + return ir::isa(*this); + } + + template + U dyn_cast() const { + return ir::dyn_cast(*this); + } + + detail::ValueImpl *impl() const; + + ir::Type type() const; + + void SetType(ir::Type type); + + Operation *GetDefiningOp() const; + + std::string print_ud_chain(); + + friend struct std::hash; + + protected: + detail::ValueImpl *impl_{nullptr}; +}; + +/// +/// \brief OpResult class represents the value defined by a result of operation. +/// This class only provides interfaces, for specific implementation, see Impl +/// class. +/// +class OpResult : public Value { + public: + using Value::Value; + + static bool classof(Value value); + + Operation *owner() const; + + uint32_t GetResultIndex() const; + + friend Operation; + + private: + static uint32_t GetValidInlineIndex(uint32_t index); + + detail::OpResultImpl *impl() const; +}; + +} // namespace ir + +namespace std { +template <> +struct hash { + std::size_t operator()(const ir::Value &obj) const { + return std::hash()(obj.impl_); + } +}; +} // namespace std diff --git a/paddle/ir/value_impl.h b/paddle/ir/value_impl.h new file mode 100644 index 00000000000000..2fa236dddd8330 --- /dev/null +++ b/paddle/ir/value_impl.h @@ -0,0 +1,196 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/ir/value.h" + +namespace ir { +static const uint32_t OUTLINE_OP_RESULT_INDEX = 6; + +class Operation; + +namespace detail { +/// +/// \brief OpOperandImpl +/// +class OpOperandImpl { + public: + ir::Operation *owner() const; + + ir::detail::OpOperandImpl *next_use(); + + /// Remove this operand from the current use list. 
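+  /// After this call, walking the source value's use chain no longer visits
+  /// this operand; the destructor relies on it to keep the chain consistent.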
+ void remove_from_ud_chain(); + + ~OpOperandImpl(); + + friend ir::Operation; + + private: + OpOperandImpl(ir::Value source, ir::Operation *owner); + + ir::detail::OpOperandImpl *next_use_ = nullptr; + + ir::detail::OpOperandImpl **prev_use_addr_ = nullptr; + + ir::Value source_; + + ir::Operation *owner_ = nullptr; +}; + +/// +/// \brief ValueImpl is the base class of all drived Value classes such as +/// OpResultImpl. This class defines all the information and usage interface in +/// the IR Value. Each Value include three attributes: +/// (1) type: ir::Type; (2) UD-chain of value: OpOperandImpl*, first operand +/// address with offset of this value; (3) index: the position where the output +/// list of the parent operator. +/// +class alignas(8) ValueImpl { + public: + /// + /// \brief Interface functions of "type_" attribute. + /// + ir::Type type() const { return type_; } + + void SetType(ir::Type type) { type_ = type; } + + /// + /// \brief Interface functions of "first_use_offseted_by_index_" attribute. + /// + uint32_t index() const; + + OpOperandImpl *first_use() const { + return reinterpret_cast( + reinterpret_cast(first_use_offseted_by_index_) & (~0x07)); + } + + void SetFirstUse(OpOperandImpl *first_use) { + uint32_t offset = index(); + first_use_offseted_by_index_ = reinterpret_cast( + reinterpret_cast(first_use) + offset); + VLOG(4) << "The index of this value is " << offset + << ". Offset and set first use: " << first_use << " -> " + << first_use_offseted_by_index_ << "."; + } + + OpOperandImpl **first_use_addr() { return &first_use_offseted_by_index_; } + + bool use_empty() const { return first_use() == nullptr; } + + std::string print_ud_chain(); + + protected: + /// + /// \brief Only can be constructed by derived classes such as OpResultImpl. + /// + explicit ValueImpl(ir::Type type, uint32_t index) { + if (index > OUTLINE_OP_RESULT_INDEX) { + throw("The value of index must not exceed 6"); + } + type_ = type; + first_use_offseted_by_index_ = reinterpret_cast( + reinterpret_cast(nullptr) + index); + VLOG(4) << "Construct a ValueImpl whose's index is " << index + << ". The offset first_use address is: " + << first_use_offseted_by_index_; + } + + /// + /// \brief Attribute1: Type of value. + /// + ir::Type type_; + + /// + /// \brief Attribute2/3: Record the UD-chain of value and index. + /// NOTE: The members of the OpOperandImpl include four pointers, so this + /// class is 8-byte aligned, and the lower 3 bits of its address are 0, so the + /// index can be stored in these 3 bits, stipulate: + /// (1) index = 0~5: represent positions 0 to 5 inline + /// output(OpInlineResultImpl); (2) index = 6: represent the position >=6 + /// outline output(OpOutlineResultImpl); (3) index = 7 is reserved. + /// + OpOperandImpl *first_use_offseted_by_index_ = nullptr; +}; + +/// +/// \brief OpResultImpl is the implementation of an operation result. +/// +class alignas(8) OpResultImpl : public ValueImpl { + public: + using ValueImpl::ValueImpl; + + static bool classof(const ValueImpl &value) { return true; } + + /// + /// \brief Get the parent operation of this result.(op_ptr = value_ptr + + /// index) + /// + ir::Operation *owner() const; + + /// + /// \brief Get the result index of the operation result. + /// + uint32_t GetResultIndex() const; + + /// + /// \brief Get the maximum number of results that can be stored inline. 
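+  /// Results with index 0..5 are OpInlineResultImpl and encode the index in
+  /// the low bits of the use-list pointer; larger indices fall back to
+  /// OpOutlineResultImpl, which stores the index in a separate field.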
+ /// + static uint32_t GetMaxInlineResultIndex() { + return OUTLINE_OP_RESULT_INDEX - 1; + } +}; + +/// +/// \brief OpInlineResultImpl is the implementation of an operation result whose +/// index <= 5. +/// +class OpInlineResultImpl : public OpResultImpl { + public: + OpInlineResultImpl(ir::Type type, uint32_t result_index) + : OpResultImpl(type, result_index) { + if (result_index > GetMaxInlineResultIndex()) { + throw("Inline result index should not exceed MaxInlineResultIndex(5)"); + } + } + + static bool classof(const OpResultImpl &value) { + return value.index() < OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return index(); } +}; + +/// +/// \brief OpOutlineResultImpl is the implementation of an operation result +/// whose index > 5. +/// +class OpOutlineResultImpl : public OpResultImpl { + public: + OpOutlineResultImpl(ir::Type type, uint32_t outline_index) + : OpResultImpl(type, OUTLINE_OP_RESULT_INDEX), + outline_index_(outline_index) {} + + static bool classof(const OpResultImpl &value) { + return value.index() >= OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return outline_index_; } + + uint32_t outline_index_; +}; + +} // namespace detail +} // namespace ir From cbdba5093302f78b2d5c1af331faf65b3a28bead Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Wed, 12 Apr 2023 18:25:39 +0800 Subject: [PATCH 102/156] [Auto Parallel] Move some changes or bug fixes from 2.4 to develop (#52721) * [Auto Parallel] Speedup the completion process * [Auto Parallel] Skip the property of dist_context when deepcopying * [Auto Parallel] Remove the unnecessary print * [Auto Parallel] Move some changes from 2.4 branch to develop * Update engine.py * [Auto Parallel] Fix a bug --- .../distributed/auto_parallel/constants.py | 10 + .../auto_parallel/cost/estimate_cost.py | 4 +- .../distributed/auto_parallel/dist_context.py | 49 ++ .../distributed/auto_parallel/dist_op.py | 93 +-- .../distributed/auto_parallel/dist_saver.py | 26 +- .../distributed/auto_parallel/engine.py | 182 ++--- .../distributed/auto_parallel/interface.py | 11 +- .../distributed/auto_parallel/parallelizer.py | 9 +- .../auto_parallel/parallelizer_v2.py | 33 +- .../auto_parallel/process_group.py | 8 +- .../distributed/auto_parallel/reshard.py | 90 ++- .../distributed/auto_parallel/strategy.py | 9 + .../auto_parallel/tuner/profiler.py | 2 +- .../communication/stream/all_reduce.py | 6 +- python/paddle/distributed/passes/__init__.py | 1 + .../passes/auto_parallel_pipeline.py | 626 ++++++++++++++++++ .../generation_pipeline_pass_unittest.py | 177 +++++ .../auto_parallel/test_dist_context.py | 1 + .../unittests/auto_parallel/test_pass_bf16.py | 2 +- .../test_pass_generation_pipeline.py | 57 ++ .../test_auto_parallel_reshard_serial.py | 3 + 21 files changed, 1174 insertions(+), 225 deletions(-) create mode 100644 python/paddle/distributed/passes/auto_parallel_pipeline.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py index 83f5704f29cb00..d2fbadd78b9c54 100644 --- a/python/paddle/distributed/auto_parallel/constants.py +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -102,6 +102,16 @@ def set_field_default_config(category, field, default_value): set_field_default_config(GRADIENT_MERGE, "k_steps", 1) 
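For context, a hedged sketch of how these registered defaults surface to users; `auto.Strategy` is the public wrapper, and the attribute names mirror the category constants in this file, including the PIPELINE section added below:

    from paddle.distributed.fleet import auto

    strategy = auto.Strategy()
    assert strategy.gradient_merge.k_steps == 1  # default registered above
    strategy.pipeline.enable = True
    strategy.pipeline.schedule_mode = "stream"   # consumed by PipelinePass
    strategy.pipeline.accumulate_steps = 4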
set_field_default_config(GRADIENT_MERGE, "avg", True) +######################################### +# pipeline configuration +######################################### +PIPELINE = "pipeline" +set_field_default_config(PIPELINE, "enable", False) +set_field_default_config(PIPELINE, "schedule_mode", "1F1B") +set_field_default_config(PIPELINE, "micro_batch_size", 1) +set_field_default_config(PIPELINE, "accumulate_steps", 1) +set_field_default_config(PIPELINE, "generation_batch_size", 1) + ######################################### # quantization configuration ######################################### diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py index 6c081f94a2aadd..f9c0b3cb15db29 100644 --- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -606,8 +606,8 @@ def get_cost_from_engine(engine, mode): ) serial_startup_prog = ( - engine._serial_startup_progs[mode].clone() - if mode in engine._serial_startup_progs + engine._fwd_dist_contexts[mode]._original_serial_main_program.clone() + if mode in engine._fwd_dist_contexts else engine._orig_startup_prog.clone() ) losses = ( diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 22a83ae341d628..f3418f271825a4 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -130,6 +130,9 @@ def __init__( # A flag indicates whether the used parallelism is data parallel self._data_parallel = False + # record upstream and downstream of cur rank + self._up_down_streams = UpDownStream() + self._json_config = json_config @property @@ -218,6 +221,10 @@ def gradient_scale(self, gs): def data_parallel(self): return self._data_parallel + @property + def up_down_streams(self): + return self._up_down_streams + @data_parallel.setter def data_parallel(self, dp): self._data_parallel = dp @@ -1220,3 +1227,45 @@ def parse_backward_blocks(self, program): self.nblock += 1 assert self.nblock == len(program.blocks) + + +class UpDownStream: + def __init__(self): + self._ups = {} + self._downs = {} + + def add_up_stream(self, rank, up_stream): + ups = self._ups.get(rank, None) + if not ups: + self._ups[rank] = [up_stream] + elif up_stream != -1: + ups = list(filter(lambda a: a != -1, ups)) + ups.append(up_stream) + self._ups[rank] = ups + + def add_down_stream(self, rank, down_stream): + downs = self._downs.get(rank, None) + if not downs: + self._downs[rank] = [down_stream] + elif down_stream != -1: + downs = list(filter(lambda a: a != -1, downs)) + downs.append(down_stream) + self._downs[rank] = downs + + def add_pair_stream(self, up, down): + self.add_up_stream(up, -1) + self.add_up_stream(down, up) + self.add_down_stream(up, down) + self.add_down_stream(down, -1) + + def ups(self, rank): + ups = self._ups.get(rank, None) + if not ups: + return None + return list(set(ups)) + + def downs(self, rank): + downs = self._downs.get(rank, None) + if not downs: + return None + return list(set(downs)) diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index 7960adafbdfc4f..8489d3f3332a63 100644 --- a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -29,8 +29,6 @@ class DistributedOperator: def __init__(self, serial_op, dist_attr=None): 
self._serial_op = serial_op if dist_attr is not None and isinstance(dist_attr, OperatorDistAttr): - pass - # TODO: remove this deepcopy after we fix the issue self._dist_attr = copy.deepcopy(dist_attr) # self._dist_attr = dist_attr @@ -56,21 +54,6 @@ def dist_attr(self, dist_attr): self._dist_attr = dist_attr # TODO: Do we really need to write back to serial op? self._serial_op.dist_attr = dist_attr - # if self._dist_attr is None: - # self._dist_attr = OperatorDistAttr() - # # Create new dist_attr related to current serial_op - # dist_attr = self._filter_dist_attr(dist_attr) - # # Append suffix to mark the inputs or outputs - # if isinstance(dist_attr, dict): - # # Copy the keys since we may add new ones - # for key in list(dist_attr.keys()): - # if isinstance(key, Variable): - # if key.name in self._serial_op.input_arg_names: - # dist_attr[append_op_input_suffix(key.name)] = True - # if key.name in self._serial_op.output_arg_names: - # dist_attr[append_op_output_suffix(key.name)] = True - # self._dist_attr.init(dist_attr) - # self._init_default_dist_attr() def get_serial_input(self, name): if self._serial_op.type == "create_py_reader": @@ -83,81 +66,6 @@ def get_serial_output(self, name): tensor = self._serial_op.block._var_recursive(name) return tensor - # def _init_default_dist_attr(self): - # for tensor_name in self._serial_op.input_arg_names: - # if self._serial_op.type == "create_py_reader": - # tensor = None - # else: - # tensor = self._serial_op.block._var_recursive(tensor_name) - # self._serial_inputs[tensor_name] = tensor - # if tensor is None: - # tensor_shape = [] - # else: - # if tensor.type in __no_shape_var_type__: - # tensor_shape = [] - # else: - # tensor_shape = tensor.shape - # if self._dist_attr.get_input_dims_mapping(tensor_name) is None: - # tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - # self._dist_attr.set_input_dims_mapping( - # tensor_name, tensor_dims_mapping - # ) - # for tensor_name in self._serial_op.output_arg_names: - # tensor = self._serial_op.block._var_recursive(tensor_name) - # if tensor.type in __no_shape_var_type__: - # tensor_shape = [] - # else: - # tensor_shape = tensor.shape - # self._serial_outputs[tensor_name] = tensor - # if self._dist_attr.get_output_dims_mapping(tensor_name) is None: - # tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - # self._dist_attr.set_output_dims_mapping( - # tensor_name, tensor_dims_mapping - # ) - # if self._dist_attr.op_type is None: - # self._dist_attr.op_type = self.serial_op.type - # if self._dist_attr.impl_type is None: - # self._dist_attr.impl_type = "default" - # if self._dist_attr.impl_idx is None: - # self._dist_attr.impl_idx = 0 - # if self._dist_attr.is_recompute is None: - # self._dist_attr.is_recompute = False - - # def _filter_dist_attr(self, dist_attr): - # if dist_attr is None: - # return None - # new_dist_attr = None - # if isinstance(dist_attr, dict): - # new_dist_attr = {} - # for key, value in dist_attr.items(): - # if isinstance(key, Variable): - # if ( - # key.name in self._serial_op.input_arg_names - # or key.name in self._serial_op.output_arg_names - # ): - # new_dist_attr[key] = value - # else: - # new_dist_attr[key] = value - # elif isinstance(dist_attr, OperatorDistAttr): - # new_dist_attr = copy.deepcopy(dist_attr) - # new_dist_attr._inputs_dist_attrs.clear() - # new_dist_attr._outputs_dist_attrs.clear() - # for tensor_name in self._serial_op.input_arg_names: - # tensor_dist_attr = dist_attr.get_input_dist_attr(tensor_name) - # if tensor_dist_attr: - # 
new_dist_attr.set_input_dist_attr( - # tensor_name, tensor_dist_attr - # ) - # for tensor_name in self._serial_op.output_arg_names: - # tensor_dist_attr = dist_attr.get_output_dist_attr(tensor_name) - # if tensor_dist_attr: - # new_dist_attr.set_output_dist_attr( - # tensor_name, tensor_dist_attr - # ) - # else: - # assert False, "Cannot recognize the {} parameter.".format(dist_attr) - # return new_dist_attr - def validate_dist_attr(self): if "read" in self.serial_op.type or "while" == self.serial_op.type: return True @@ -402,5 +310,6 @@ def __call__(self, *args, **kwargs): if self._process_mesh is not None: dist_op.dist_attr.mark_annotated("process_mesh") default_dist_ctx.add_dist_op_for_program(dist_op) + default_dist_ctx.add_process_mesh(self._process_mesh) return output diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/dist_saver.py index 87a0319204fd35..8772d234ddf99b 100644 --- a/python/paddle/distributed/auto_parallel/dist_saver.py +++ b/python/paddle/distributed/auto_parallel/dist_saver.py @@ -192,17 +192,27 @@ def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs): used_inputs += op.input_arg_names used_outputs += op.output_arg_names - for idx, var_name in enumerate(feed_vars_names): - if var_name not in used_inputs: - feed_vars_names.pop(idx) - for idx, var_name in enumerate(fetch_vars_names): - if var_name not in used_outputs: - fetch_vars_names.pop(idx) + # delete duplicated elements and keep order + feed_vars_names = list({}.fromkeys(feed_vars_names).keys()) + used_inputs = list({}.fromkeys(used_inputs).keys()) + fetch_vars_names = list({}.fromkeys(fetch_vars_names).keys()) + used_outputs = list({}.fromkeys(used_outputs).keys()) + + dist_feed_vars_names = [ + var_name for var_name in feed_vars_names if var_name in used_inputs + ] + dist_fetch_vars_names = [ + var_name + for var_name in fetch_vars_names + if var_name in used_outputs + ] dist_feed_vars = list( - reversed([global_block.vars[name] for name in feed_vars_names]) + reversed([global_block.vars[name] for name in dist_feed_vars_names]) ) - dist_fetch_vars = [global_block.vars[name] for name in fetch_vars_names] + dist_fetch_vars = [ + global_block.vars[name] for name in dist_fetch_vars_names + ] dist_filename = filename + "_dist" + str(rank_id) dist_path = os.path.join(dirname, dist_filename) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index a84bea42d538fc..9a4f8611daf427 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -17,7 +17,6 @@ import numbers import os import random -from collections import defaultdict import numpy as np @@ -154,7 +153,6 @@ def __init__( " or `paddle.static.Optimizer`." 
) self._optimizer = auto_utils.validate_opt(optimizer) - self._orig_optimizer = copy.deepcopy(self._optimizer) metrics = metrics or [] for metric in auto_utils.to_list(metrics): @@ -185,6 +183,12 @@ def __init__( ) fleet.init(is_collective=True) + # for compute cost + # TODO: remove _fwd_main_progs and _orig_optimizer + self._fwd_dist_contexts = {} + self._fwd_main_progs = {} + self._orig_optimizer = copy.deepcopy(self._optimizer) + self._executor = None self._cur_rank = paddle.distributed.get_rank() self._nranks = paddle.distributed.get_world_size() @@ -194,14 +198,6 @@ def __init__( self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() self._dist_contexts = {} - self._fwd_main_progs = {} - self._fwd_dist_contexts = {} - self._serial_main_progs = {} - self._serial_startup_progs = {} - self._dist_main_progs = defaultdict(dict) # dist main programs - self._dist_startup_progs = defaultdict(dict) # dist startup programs - self._feed_vars = {} - self._fetch_vars = {} self._planners = {} self._has_prepared = {"train": False, "eval": False, "predict": False} self._has_prepared_reader = { @@ -334,9 +330,9 @@ def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels): return inputs, labels - def _prepare_reader(self): - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + def _prepare_reader(self, feed_list=[]): dist_context = self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: this list may be changed if Paddle changes the existing rules. @@ -357,10 +353,13 @@ def _prepare_reader(self): if op.type in related_reader_ops: reader_op_indices.append(idx) # Step 2: insert the new reader ops to cpp + # record the read ops' desc to insert to program of forward task_node + read_ops_desc = [] new_reader_ops = [] for idx in reversed(reader_op_indices): new_op_desc = dist_main_block.desc._prepend_op() new_op_desc.copy_from(dist_main_block.ops[idx].desc) + read_ops_desc.append(new_op_desc) new_op = Operator( dist_main_block, new_op_desc, type=new_op_desc.type() ) @@ -379,6 +378,29 @@ def _prepare_reader(self): dist_main_block._sync_with_cpp() self._has_prepared_reader[self._mode] = True + # Insert read op to forward TaskNode if 1F1B pass is setted + if self.main_program._pipeline_opt: + assert "tasks" in self.main_program._pipeline_opt["fleet_opt"] + fleet_opt = self.main_program._pipeline_opt["fleet_opt"] + fwd_task = fleet_opt["tasks"][0] + fwd_prog = fwd_task.get_program() + fwd_block = fwd_prog.global_block() + + for var in feed_list: + if var.name not in fwd_block.vars: + fwd_block._clone_variable(var) + + for op_desc in read_ops_desc: + new_op_desc = fwd_block.desc._prepend_op() + new_op_desc.copy_from(op_desc) + new_op = Operator( + fwd_block, new_op_desc, type=new_op_desc.type() + ) + fwd_block.ops.insert(0, new_op) + + fwd_block._sync_with_cpp() + fwd_task.set_program(fwd_prog) + def _prepare_feed(self, data, user_feeds, mode): feeds = {} if data is not None: @@ -428,14 +450,16 @@ def _process_fetch_group(group_name, var_list): fetch_names.append([]) fetch_indices.append(group_indices) + dist_context = self._dist_contexts[mode] + fetch_vars = dist_context.serial_fetch_vars if mode != "predict": - _process_fetch_group("loss", self._fetch_vars[mode]["loss"]) + _process_fetch_group("loss", fetch_vars["loss"]) if mode != "predict": - metrics = self._fetch_vars[mode]["metrics"] + metrics = fetch_vars["metrics"] 
for i, var_list in enumerate(metrics): _process_fetch_group("metrics_" + str(i), var_list) if mode == "predict": - _process_fetch_group("outputs", self._fetch_vars[mode]["outputs"]) + _process_fetch_group("outputs", fetch_vars["outputs"]) for usr_fetch in user_fetches: var_name = _to_name_str(usr_fetch) fetch(var_name) @@ -472,7 +496,8 @@ def _prepare_logger( logs["loss"] = outs[idx][0] group_idx += 1 # logging metrics - metric_vars = self._fetch_vars[mode]["metrics"] + dist_context = self._dist_contexts[mode] + metric_vars = dist_context.serial_fetch_vars["metrics"] if metric_vars: for metric in self._metrics: metrics_indices = fetch_indices[group_idx] @@ -503,15 +528,18 @@ def _prepare_logger( logs["fetches"] = logs_fetch return logs - def _prepare_program(self, mode): + def _prepare_program(self, mode, init_parameters=True): # Do the build process self._build(mode) # Do the planning process self._plan(mode) # Do the parallel process self._parallel(mode) - # Init comm and startup program - self._initialize(mode) + # Init comm + self._init_comm() + if init_parameters: + # startup program + self._initialize(mode) self._has_prepared[mode] = True def _build(self, mode): @@ -543,9 +571,9 @@ def _build(self, mode): paddle.enable_static() else: - # build program in static graph mode - serial_main_prog = self._serial_main_progs.get(mode, None) - if serial_main_prog is not None: + # build program in static mode + dist_context = self._dist_contexts.get(mode, None) + if dist_context is not None: return outputs = [] @@ -735,42 +763,23 @@ def _init_dist_context(self, mode): ) dist_context.set_op_dist_attr_for_program(op, ref_op_dist_attr) - def _initialize(self, mode): - # Get the current content from the distributed context - self._serial_main_progs[mode] = self._dist_contexts[ - mode - ].serial_main_program - self._serial_startup_progs[mode] = self._dist_contexts[ - mode - ].serial_startup_program - self._dist_main_progs[mode] = self._dist_contexts[ - mode - ].dist_main_programs - self._dist_startup_progs[mode] = self._dist_contexts[ - mode - ].dist_startup_programs - self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars - self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars - self._optimizer = self._dist_contexts[mode]._serial_optimizer - + def _init_comm(self): if self._nranks > 1: # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() - cur_rank = self._cur_rank - # NOTE: After the implementation of the unified dynamic and static communication group - # initialization mode in the future, the initialization logic of full mode - # will be removed because port occupation error may occur. 
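Outside the "full" mode branch below, communicator bring-up reduces to each rank instantiating only the groups it belongs to. A hedged, stand-alone equivalent of the loop that follows:

    import paddle
    from paddle.distributed.auto_parallel.process_group import (
        get_all_process_groups,
    )

    cur_rank = paddle.distributed.get_rank()
    for pg in get_all_process_groups():
        if cur_rank in pg.ranks:
            pg.instantiate()  # creates the communicator for this group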
+ if self._strategy.auto_mode == "full": auto_utils.initialize_pg_in_full_mode( - all_process_groups, cur_rank + all_process_groups, self._cur_rank ) else: for process_group in all_process_groups: - if cur_rank not in process_group.ranks: + if self._cur_rank not in process_group.ranks: continue process_group.instantiate() + def _initialize(self, mode): self._place = _get_device() if isinstance(self._place, paddle.framework.CUDAPlace): self._place = paddle.framework.CUDAPlace( @@ -782,9 +791,9 @@ def _initialize(self, mode): np.random.seed(self._strategy.seed + self._dp_ranks[0]) random.seed(self._strategy.seed + self._dp_ranks[0]) + dist_context = self._dist_contexts[mode] if self._dygraph_mode: - dist_context = self._dist_contexts[mode] - dist_main_program = self._dist_main_progs[mode][self._cur_rank] + dist_main_program = dist_context.dist_main_programs[self._cur_rank] self.program_helper.init( dist_main_program, self._place, dist_context ) @@ -792,7 +801,9 @@ def _initialize(self, mode): if self._executor is None: self._executor = paddle.static.Executor(self._place) uninitialized = [] - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[ + self._cur_rank + ] for var in dist_startup_prog.list_vars(): scope_var = global_scope().find_var(var.name) if scope_var and scope_var.get_tensor()._is_initialized(): @@ -809,7 +820,9 @@ def _initialize(self, mode): if self._strategy.reinit: self._logger.info("NOTE: parameters will be re-initialized.") - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[ + self._cur_rank + ] self._executor.run(dist_startup_prog) def fit( @@ -1282,6 +1295,7 @@ def prepare( main_program=None, startup_program=None, mode=None, + init_parameters=True, ): if mode is not None: self.to_mode(mode) @@ -1324,7 +1338,7 @@ def prepare( self._inputs_spec, self._labels_spec = inputs_spec, labels_spec self._inputs, self._labels = inputs, labels if not self._has_prepared[self._mode]: - self._prepare_program(self._mode) + self._prepare_program(self._mode, init_parameters) else: self._switch_mode(self._mode) @@ -1375,16 +1389,17 @@ def _prepare_dataloader( ) batch_size //= self._k_steps - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] - dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. # Cause predict_program does not contain labels var, # then we will add labels var from serial_program to dist_program, # that maintains the length of feed_list equal to the length of dataset's values. 
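A worked example of the `batch_size //= self._k_steps` division above (hedged; `strategy`, `engine`, and `train_dataset` are assumed to be set up as usual): with gradient merge enabled and k_steps = 4, a user-facing batch size of 32 makes this dataloader feed micro-batches of 32 // 4 = 8, while each optimizer step still accumulates over all 32 samples.

    strategy.gradient_merge.enable = True
    strategy.gradient_merge.k_steps = 4
    engine.fit(train_dataset, batch_size=32)  # micro-batch size is 8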
- inputs_var = self._feed_vars[self._mode]["inputs"] - labels_var = self._feed_vars[self._mode]["labels"] + inputs_var = dist_context.serial_feed_vars["inputs"] + labels_var = dist_context.serial_feed_vars["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: @@ -1443,16 +1458,17 @@ def _prepare_dataloader_from_generator( ) batch_size //= self._k_steps - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] - dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] + dist_startup_prog = dist_context.dist_startup_programs[self._cur_rank] dist_main_block = dist_main_prog.global_block() # NOTE: Get feed_list, then insert dataloader op with sharded var shape. # Cause predict_program does not contain labels var, # then we will add labels var from serial_program to dist_program, # that maintains the length of feed_list equal to the length of dataset's values. - inputs_var = self._feed_vars[self._mode]["inputs"] - labels_var = self._feed_vars[self._mode]["labels"] + inputs_var = dist_context.serial_feed_vars["inputs"] + labels_var = dist_context.serial_feed_vars["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: @@ -1482,7 +1498,7 @@ def _prepare_dataloader_from_generator( data_parallel_world_size=self._dp_world_sizes, data_parallel_rank=self._dp_ranks, ) - self._prepare_reader() + self._prepare_reader(feed_list) return dataloader def _tune(self, tune_data, tune_sample_split=None, batch_size=1): @@ -1542,7 +1558,7 @@ def _metrics_name(self): def _switch_mode(self, mode): assert ( - mode in self._dist_main_progs + mode in self._dist_contexts ), f"{mode} model is not ready, please call `prepare()` first." 
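With programs now re-derived from the per-mode DistributedContext, switching modes after `prepare()` is a pure lookup. A hedged usage sketch (`inputs_spec`/`labels_spec` assumed built beforehand; `init_parameters` is the flag added by this patch):

    engine.prepare(inputs_spec, labels_spec, mode="train")
    engine.prepare(inputs_spec, labels_spec, mode="eval", init_parameters=False)
    engine._switch_mode("eval")
    prog = engine.main_program  # dist_contexts["eval"].dist_main_programs[rank]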
self.to_mode(mode) self._optimizer = self._dist_contexts[mode]._serial_optimizer @@ -1556,8 +1572,8 @@ def to_mode(self, mode): self._mode = mode def _set_state_dict(self, mode, strict, state_dict, dist_attr): - program = self._dist_main_progs[mode][self._cur_rank] dist_context = self._dist_contexts[mode] + program = dist_context.dist_main_programs[self._cur_rank] cur_dist_attr = auto_utils.get_dist_attr(program, dist_context) converter = Converter(state_dict, dist_attr, cur_dist_attr) state_dict = converter.convert(strict=strict) @@ -1622,10 +1638,10 @@ def save(self, path, training=True): """ if training: - assert self._mode in self._serial_main_progs - serial_program = self._serial_main_progs[self._mode] - dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + assert self._mode in self._dist_contexts dist_context = self._dist_contexts[self._mode] + serial_program = dist_context.serial_main_program + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] self._saver.save( path, serial_program=serial_program, @@ -1633,10 +1649,11 @@ def save(self, path, training=True): dist_context=dist_context, ) else: - assert "predict" in self._dist_main_progs - feed_vars = self._feed_vars["predict"]['inputs'] - fetch_vars = self._fetch_vars["predict"]['outputs'] - dist_main_prog = self._dist_main_progs["predict"][self._cur_rank] + assert "predict" in self._dist_contexts + dist_context = self._dist_contexts["predict"] + feed_vars = dist_context.serial_feed_vars['inputs'] + fetch_vars = dist_context.serial_fetch_vars['outputs'] + dist_main_prog = dist_context.dist_main_programs[self._cur_rank] if self._strategy.qat.enable and self._strategy.qat.onnx_format: from paddle.static.quantization import QuantWeightPass @@ -1776,11 +1793,13 @@ def cost(self, inputs_spec=None, labels_spec=None, mode=None): @property def main_program(self): - return self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + return dist_context.dist_main_programs[self._cur_rank] @property def startup_program(self): - return self._dist_startup_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + return dist_context.dist_startup_programs[self._cur_rank] @property def dist_context(self): @@ -1788,15 +1807,30 @@ def dist_context(self): @property def serial_main_program(self): - return self._serial_main_progs[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_main_program @property def serial_startup_program(self): - return self._serial_startup_progs[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_startup_program + + @property + def feed_vars(self): + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_feed_vars @property def fetch_vars(self): - return self._fetch_vars[self._mode] + dist_context = self._dist_contexts[self._mode] + return dist_context.serial_fetch_vars + + @property + def optimizer(self): + dist_context = self._dist_contexts[self._mode] + if dist_context._serial_optimizer: + return dist_context._serial_optimizer + return self._optimizer @property def inputs(self): diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index 9fda85ecef010a..76207bc588968c 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -79,7 +79,15 @@ def shard_tensor(x, process_mesh=None, shard_spec=None): 
assert isinstance( shard_spec, list ), f"Argument shard_spec {shard_spec} is not an instance of list" - dist_tensor = DistributedTensor(x) + if isinstance(x, str): + x = ( + paddle.static.default_main_program() + .global_block() + ._var_recursive(x) + ) + dist_tensor = DistributedTensor(x) + else: + dist_tensor = DistributedTensor(x) serial_tensor = dist_tensor.serial_tensor dist_tensor.dist_attr.process_mesh = process_mesh if serial_tensor.type in __no_shape_var_type__: @@ -102,6 +110,7 @@ def shard_tensor(x, process_mesh=None, shard_spec=None): default_dist_ctx = get_default_distributed_context() default_dist_ctx.add_dist_tensor_for_program(dist_tensor) dist_tensor = default_dist_ctx.get_dist_tensor_for_program(x) + default_dist_ctx.add_process_mesh(process_mesh) return x diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index d2463f33086376..549f618c6cbc94 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -499,12 +499,19 @@ def parallelize( break if is_pipeline: with paddle.static.program_guard(dist_main_prog): - paddle.distributed.barrier() + paddle.distributed.barrier(get_process_group(0)) # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if len(_g_process_group_map) > 0: + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce( + tmp, sync_op=True, group=_g_process_group_map[0] + ) + paddle.device.cuda.synchronize() + if rank not in process_group.ranks: continue process_group.instantiate() diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index a76a3f5dcb9abd..c4ef623b172605 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -177,10 +177,22 @@ def parallel(self, rank): time.time() - time0, self._mode ) ) + # Apply post optimization passes + time0 = time.time() + self._apply_post_optimization( + dist_main_prog, dist_startup_prog, rank, dist_params_grads + ) + self._logger.debug( + "within parallel apply_post_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) # Clone program for test if self._mode != 'train': + pipeline_opt = dist_main_prog._pipeline_opt dist_main_prog = dist_main_prog.clone(for_test=True) dist_startup_prog = dist_startup_prog.clone(for_test=True) + dist_main_prog._pipeline_opt = pipeline_opt # Store the distributed programs for further usages self._dist_context.dist_main_programs[rank] = dist_main_prog @@ -247,7 +259,7 @@ def _apply_pre_optimization( # apply quantization pass # The pass can be applied when mode must be 'train' - if self._strategy.qat.enable: + if self._mode == 'train' and self._strategy.qat.enable: config = copy.deepcopy(self._strategy.qat.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads @@ -307,8 +319,8 @@ def _apply_post_optimization( ) params_grads = self._pass_context.get_attr("params_grads") - # GradClip is train-only optimization if self._mode == "train": + # GradClip is train-only optimization config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads @@ -330,6 +342,13 @@ def 
_apply_post_optimization( [main_program], [startup_program], self._pass_context ) + if self._strategy.pipeline.enable: + self._strategy.gradient_merge.enable = True + self._strategy.gradient_merge.k_steps = ( + self._strategy.pipeline.accumulate_steps + ) + self._strategy.gradient_merge.avg = True + # gradient_merge is then train-only optimization if self._mode == "train" and self._strategy.gradient_merge.enable: config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) @@ -342,6 +361,16 @@ def _apply_post_optimization( [main_program], [startup_program], self._pass_context ) + if self._strategy.pipeline.enable: + config = copy.deepcopy(self._strategy.pipeline.to_dict()) + config["dist_context"] = self._dist_context + auto_parallel_pipeline_pass = new_pass( + "auto_parallel_pipeline", config + ) + auto_parallel_pipeline_pass.apply( + [main_program], [startup_program], self._pass_context + ) + if self._mode == "train" and self._strategy.fused_passes.enable: if len(self._strategy.fused_passes.fused_passes_list) > 0: new_pass_list = [] diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 83e1642ba21bb1..8c300cbcd53b63 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -52,9 +52,9 @@ def new_process_group(ranks, group_id=None, force_new_group=False): global _g_process_group_map if not force_new_group: # A key constructed from ranks is used for avoiding duplication - new_key = ''.join(map(str, sorted(ranks))) + new_key = ''.join(map(str, ranks)) for pg_id, pg in _g_process_group_map.items(): - cur_key = ''.join(map(str, sorted(pg.ranks))) + cur_key = ''.join(map(str, pg.ranks)) if pg_id != 0 and new_key == cur_key: return pg # If not matching the existing one, construct a new process group @@ -82,7 +82,7 @@ def __init__(self, group_id, ranks): group_id != 0 ), "Process group id 0 is reserved for all ranks." 
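Dropping `sorted(...)` here and in `new_process_group`'s deduplication key makes rank order significant: `[0, 1]` and `[1, 0]` now produce distinct groups, which the stream pipeline's send/recv pairing relies on (a hedged reading of this diff):

    from paddle.distributed.auto_parallel.process_group import (
        new_process_group,
    )

    pg_fwd = new_process_group([0, 1])
    pg_rev = new_process_group([1, 0])
    assert pg_fwd.id != pg_rev.id  # previously deduplicated into one group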
self._group_id = group_id - self._ranks = sorted(ranks) + self._ranks = ranks # Add the current ranks into group 0 if group_id != 0: global _g_process_group_map @@ -109,7 +109,7 @@ def add_ranks(self, new_ranks): not self.is_instantiate() ), "Cannot add new ranks after instantiating the process group" self._ranks.extend(new_ranks) - self._ranks = sorted(set(self.ranks)) + self._ranks = list(set(self.ranks)) def local_rank(self, global_rank): if global_rank in self.ranks: diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 7461e85c672483..91e07fc651d209 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -848,7 +848,8 @@ def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - block._remove_op(idx) + block._remove_op(idx, sync=False) + block._sync_with_cpp() @staticmethod def remove_no_need_vars( @@ -1000,7 +1001,8 @@ def remove_no_need_in_startup( if is_no_need_op: remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - startup_block._remove_op(idx) + startup_block._remove_op(idx, sync=False) + startup_block._sync_with_cpp() class Resharder: @@ -1441,6 +1443,8 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): target_process_group = target_process_mesh.process_ids target_process_shape = target_process_mesh.shape + op_role = dist_attr[2] + if source_tensor.shape[0] < 0: assert source_tensor.shape[0] == -1 new_shape = list(source_tensor.shape) @@ -1583,6 +1587,10 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): Resharder.concat_partitions( partition_index_list, source_partition_index ) + if int(op_role) == int(OpRole.Forward): + self.dist_context.up_down_streams.add_pair_stream( + to_send_process, target_process + ) # append concat op desc op_desc_seq[target_process].append( @@ -2037,13 +2045,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # if ( - # old_name - # in op_dist_attr._inputs_dist_attrs - # ): - # op_dist_attr.del_input_dist_attr( - # old_name - # ) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2067,7 +2068,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # op_dist_attr.del_input_dist_attr(old_name) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2095,7 +2095,6 @@ def parse_op_desc( op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) - # op_dist_attr.del_input_dist_attr(old_name) op_dist_attr.set_input_dims_mapping( new_name, dims_mapping ) @@ -2135,7 +2134,13 @@ def _get_subblock_input_attrs(self, op, var_name): has_exist = True break if not has_exist: - input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [ + process_mesh, + input_dims_mapping, + op.attr('op_role'), + ] + ) return input_attrs def _get_subblock_output_attrs(self, op, var_name): @@ -2165,7 +2170,13 @@ def _get_subblock_output_attrs(self, op, var_name): has_exist = True break if not has_exist: - output_attrs.append([process_mesh, output_dims_mapping]) + output_attrs.append( + [ + process_mesh, + output_dims_mapping, + op.attr('op_role'), + ] + ) return output_attrs def _get_common_op_input_attrs(self, op, var_name): @@ -2188,7 +2199,9 @@ def _get_common_op_input_attrs(self, op, var_name): input_dims_mapping = dist_attr.get_input_dims_mapping(var_name) input_attrs = [] for process_mesh in process_meshes: - 
input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [process_mesh, input_dims_mapping, op.attr('op_role')] + ) return input_attrs @@ -2207,7 +2220,7 @@ def get_op_input_attrs(self, op, var_name): assert ( op_input_attrs - ), "The input '{}' of op '{}' has no distibution attributes in subblock".format( + ), "The input '{}' of op '{}' has no distributed attributes in subblock".format( op.name, var_name ) @@ -2215,30 +2228,24 @@ def get_op_input_attrs(self, op, var_name): def _remove_global_process_mesh(self): """Remove global process mesh from dist_context.process_meshes""" - processes = set() + process_ids = set() process_mesh_count = len(self.dist_context.process_meshes) if process_mesh_count > 1: - global_process_mesh_idx = None + global_process_mesh_idx = [] + has_sub_process_mesh = False for process_mesh in self.dist_context.process_meshes: - for process in process_mesh.process_ids: - processes.add(process) + for process_id in process_mesh.process_ids: + process_ids.add(process_id) for idx, process_mesh in enumerate( self.dist_context.process_meshes ): - if len(set(process_mesh.process_ids)) == len(processes): - global_process_mesh_idx = idx - break + if len(set(process_mesh.process_ids)) == len(process_ids): + global_process_mesh_idx.append(idx) + elif set(process_mesh.process_ids) < process_ids: + has_sub_process_mesh = True - if global_process_mesh_idx is not None: - is_removed = False - global_mesh = self.dist_context.process_meshes[idx] - for i, mesh in enumerate(self.dist_context.process_meshes): - if i == idx: - continue - if set(mesh.process_ids) < set(global_mesh.process_ids): - is_removed = True - - if is_removed: + if has_sub_process_mesh: + for idx in reversed(global_process_mesh_idx): self.dist_context.process_meshes.pop(idx) def _change_subblock_op_input_and_output(self, block_idx, block): @@ -2278,7 +2285,6 @@ def _change_subblock_op_input_and_output(self, block_idx, block): op_dist_attr.set_input_dist_attr( new_name, op_input_dist_attr ) - # op_dist_attr.del_input_dist_attr(old_name) # the outputs also need to be renamed when the output name is the same with input name in inplace op for var_name in op.output_arg_names: @@ -2302,7 +2308,6 @@ def _change_subblock_op_input_and_output(self, block_idx, block): op_dist_attr.set_output_dist_attr( new_name, op_output_dist_attr ) - # op_dist_attr.del_output_dist_attr(old_name) def _reshard_input(self, block): idx = 0 @@ -2450,7 +2455,7 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): assert set_lod is True # cast int64 to bool - block._insert_op( + cast_op = block._insert_op( idx + 2, type='cast', inputs={ @@ -2465,6 +2470,7 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): 'op_role': op.attr('op_role'), }, ) + cast_op._set_attr('op_namescope', "/auto_parallel/reshard") else: if var.lod_level != 0: recv_out = block.create_var( @@ -2612,6 +2618,10 @@ def _reshard_output(self, block): ] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( @@ -2640,6 +2650,10 @@ def _reshard_output(self, block): item = output_attr[0].process_ids[index] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( @@ -2714,7 +2728,11 @@ def 
get_cost(self, op, tensor, cluster): tensor.name ) process_mesh = dist_op.dist_attr.process_mesh - dist_attr = [process_mesh, dims_mapping] + dist_attr = [ + process_mesh, + dims_mapping, + dist_op.serial_op.attr('op_role'), + ] if dist_tensor is not None and self.need_reshard( dist_tensor, dist_attr ): diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py index 58a08586ff5cbd..a4dd2c54d2eed8 100644 --- a/python/paddle/distributed/auto_parallel/strategy.py +++ b/python/paddle/distributed/auto_parallel/strategy.py @@ -102,6 +102,12 @@ def __init__(self, config_dict=None): super().__init__(category, config_dict) +class PipelineConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.PIPELINE + super().__init__(category, config_dict) + + class QATConfig(BaseConfig): def __init__(self, config_dict=None): category = constants.QAT @@ -186,6 +192,9 @@ def __init__(self, config=None): config_dict = self._config_dict.get(constants.GRADIENT_MERGE, None) self.gradient_merge = GradientMergeConfig(config_dict) + config_dict = self._config_dict.get(constants.PIPELINE, None) + self.pipeline = PipelineConfig(config_dict) + config_dict = self._config_dict.get(constants.QAT, None) self.qat = QATConfig(config_dict) diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/tuner/profiler.py index cca53773ebbef5..27e0fa49845446 100644 --- a/python/paddle/distributed/auto_parallel/tuner/profiler.py +++ b/python/paddle/distributed/auto_parallel/tuner/profiler.py @@ -91,7 +91,7 @@ def init_process_groups(group_map, rank): # TODO should instantiate global group first all_process_groups = get_all_process_groups() for process_group in all_process_groups: - if rank not in process_group.ranks: + if process_group.id == 0 or rank not in process_group.ranks: continue print(process_group) process_group.instantiate() diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 3b870afe6f5c12..6b38bffc0bf3f1 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -122,9 +122,9 @@ def all_reduce( tensor, op, group, sync_op, use_calc_stream ) else: - assert ( - group is None - ), "Group can not be used in static graph mode for now." + # assert ( + # group is None + # ), "Group can not be used in static graph mode for now." 
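Relaxing this assertion allows group-scoped collectives inside a static Program, which the warm-up added to parallelizer.py above already uses. A minimal mirror of that pattern (hedged; `main_prog` and `group` stand for a previously built distributed program and an instantiated process group):

    import paddle

    with paddle.static.program_guard(main_prog):
        tmp = paddle.to_tensor([1], dtype="int32")
        paddle.distributed.all_reduce(tmp, sync_op=True, group=group)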
return _all_reduce_in_static_mode( tensor, op, group, sync_op, use_calc_stream ) diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 8550cb049b11ed..8ab110e60c3b97 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -23,6 +23,7 @@ from .auto_parallel_data_parallel_optimization import * # noqa: F403 from .auto_parallel_grad_clip import * # noqa: F403 from .auto_parallel_supplement_explicit_dependencies import * # noqa: F403 +from .auto_parallel_pipeline import * # noqa: F403 from .cpp_pass import * # noqa: F403 from .ps_trainer_pass import * # noqa: F403 from .ps_server_pass import * # noqa: F403 diff --git a/python/paddle/distributed/passes/auto_parallel_pipeline.py b/python/paddle/distributed/passes/auto_parallel_pipeline.py new file mode 100644 index 00000000000000..5b707d088bf8b2 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_pipeline.py @@ -0,0 +1,626 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle.distributed.fleet.fleet_executor_utils import TaskNode +from paddle.fluid import core +from paddle.fluid.framework import Parameter, Program + +from .pass_base import PassBase, register_pass + +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, + core.VarDesc.VarType.STEP_SCOPES, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + core.VarDesc.VarType.FEED_MINIBATCH, + core.VarDesc.VarType.FETCH_LIST, +] + + +@register_pass("auto_parallel_pipeline") +class PipelinePass(PassBase): + def __init__(self): + super().__init__() + self.set_attr("dist_context", None) + + def _check_self(self): + if self.get_attr("dist_context") is None: + return False + return True + + def _check_conflict(self, other_pass): + return True + + def _apply_single_impl(self, main_program, startup_program, context): + self._dist_context = self.get_attr("dist_context") + self._acc_steps = self.get_attr("accumulate_steps") + self._mode = self.get_attr("schedule_mode") + self._gen_bsz = self.get_attr("generation_batch_size") + self._program = main_program + + if self._mode == "1F1B": + raise NotImplementedError("1F1B has not been implemented") + elif self._mode == "F-Then-B": + raise NotImplementedError("F-Then-B has not been implemented") + elif self._mode == "stream": + self._insert_sync_ops_for_stream() + self._task_stream() + else: + raise ValueError( + "Now only 'F-then-B', '1F1B' and 'stream' are supported." 
+ "The given value is {}.".format(self._mode) + ) + + def _insert_sync_ops_for_stream(self): + + for block in self._program.blocks: + offset = 0 + send_vars = [] + # insert sync ops + for index, op in enumerate(list(block.ops)): + if op.type == 'send_v2': + # step1: set 'use_calc_stream' False + op._set_attr("use_calc_stream", False) + op_role = op.attr('op_role') + # step2: insert 'c_sync_calc_stream' op before 'send_v2' op + var_name = op.input_arg_names[0] + var = block.var(var_name) + block._insert_op_without_sync( + index=index + offset, + type="c_sync_calc_stream", + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={'op_role': op_role}, + ) + offset += 1 + send_vars.append(var_name) + + for var_name in send_vars: + nop_op = block.append_op(type='nop') + nop_op.desc.set_input('X', [var_name]) + nop_op.desc.set_output('Out', [var_name]) + + block._sync_with_cpp() + + def _create_param(self, dst_block, src_var): + copied_kwargs = {} + copied_kwargs['trainable'] = src_var.trainable + copied_kwargs['optimize_attr'] = src_var.optimize_attr + copied_kwargs['regularizer'] = src_var.regularizer + copied_kwargs['do_model_average'] = src_var.do_model_average + copied_kwargs['need_clip'] = src_var.need_clip + + Parameter( + block=dst_block, + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + **copied_kwargs + ) + + def _create_inter(self, dst_block, src_var): + dst_block.create_var( + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + + def _create_var( + self, src_block, dst_block, src_varname, force_create=False + ): + + if not force_create: + src_var = src_block.var(src_varname) + else: + src_var = src_block._var_recursive(src_varname) + if src_var.type in __not_shape_var_type__: + persist = getattr(src_var, 'persistable', False) + dst_block.create_var( + type=src_var.type, + name=src_var.name, + persistable=persist, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + else: + if isinstance(src_var, Parameter): + self._create_param(dst_block, src_var) + else: + self._create_inter(dst_block, src_var) + + def _create_program(self, src_block, dst_block, src_op, force_create=False): + dst_op_desc = dst_block.desc.append_op() + dst_op_desc.copy_from(src_op.desc) + for input_varname in src_op.input_arg_names: + if src_block.has_var(input_varname) or ( + force_create and src_block._find_var_recursive(input_varname) + ): + self._create_var( + src_block, dst_block, input_varname, force_create + ) + for output_varname in src_op.output_arg_names: + if src_block.has_var(output_varname) or ( + force_create and src_block._find_var_recursive(output_varname) + ): + self._create_var( + src_block, dst_block, output_varname, force_create + ) + + def _get_pp_stage(self, rank): + pp_idx = None + for idx, process_mesh in enumerate(self._dist_context.process_meshes): + if rank in process_mesh.processes: + pp_idx = idx + break + return pp_idx + + def _task_stream(self): + cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + trainer_endpoints = 
os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(',') + nrank = len(trainer_endpoints) + num_of_functionality = 5 + + # compute current pp stage + pp_stages = len(self._dist_context.process_meshes) + cur_pp_stage = self._get_pp_stage(cur_rank) + + start_prog = Program() + cond_prog = Program() + end_prog = Program() + send_prog = Program() + recv_prog = Program() + + cond_var_name = None + send_vars_name = set() + recv_vars_name = {} + for ib, src_block in enumerate(self._program.blocks): + if ib == 0: + strat_block = start_prog.block(0) + end_block = end_prog.block(0) + + is_after_while_op = False + for op in src_block.ops: + if op.type == "while": + assert len(op.input('Condition')) == 1 + cond_var_name = op.input('Condition')[0] + is_after_while_op = True + continue + + if not is_after_while_op: + self._create_program( + src_block, strat_block, op, force_create=True + ) + else: + self._create_program( + src_block, end_block, op, force_create=True + ) + elif ib == 1: + send_block = send_prog.block(0) + recv_block = recv_prog.block(0) + + is_after_send_op = False + is_after_recv_op = False + for op in src_block.ops: + if op.type == "send_v2" and not is_after_send_op: + is_after_send_op = True + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + continue + + if ( + is_after_send_op + and not is_after_recv_op + and op.type == "recv_v2" + ): + is_after_recv_op = True + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + self._create_program( + src_block, recv_block, op, force_create=True + ) + continue + + if not is_after_send_op or not is_after_recv_op: + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + + if is_after_send_op and is_after_recv_op: + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + 
old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + for in_name in op.desc.input_arg_names(): + if in_name in recv_vars_name: + op.desc._rename_input( + in_name, recv_vars_name[in_name] + ) + self._create_program( + src_block, recv_block, op, force_create=True + ) + else: + raise Exception("Only support generation condition.") + + start_prog._sync_with_cpp() + end_prog._sync_with_cpp() + send_prog._sync_with_cpp() + recv_prog._sync_with_cpp() + + assert cond_var_name is not None + + send_task_node_var_dtype = {} + send_task_node_var_shape = {} + recv_task_node_var_dtype = {} + recv_task_node_var_shape = {} + for var_name in list(send_vars_name): + var = send_prog.global_block().vars[var_name] + dtype = str(var.dtype) + send_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + send_task_node_var_shape[var_name] = var.shape + for var_name in list(set(recv_vars_name.values())): + var = recv_prog.global_block().vars[var_name] + dtype = str(var.dtype) + recv_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + recv_task_node_var_shape[var_name] = var.shape + + vars_to_dtype = [] + vars_to_shape = [] + if len(send_task_node_var_dtype) > 0: + assert len(recv_task_node_var_dtype) == 0 + vars_to_dtype = send_task_node_var_dtype + vars_to_shape = send_task_node_var_shape + if len(recv_task_node_var_dtype) > 0: + assert len(send_task_node_var_dtype) == 0 + vars_to_dtype = recv_task_node_var_dtype + vars_to_shape = recv_task_node_var_shape + + start_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Start", + task_id=int(cur_rank * num_of_functionality + 0), + program=start_prog, + lazy_initialize=True, + ) + cond_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Cond", + task_id=int(cur_rank * num_of_functionality + 1), + program=cond_prog, + cond_var_name=cond_var_name, + lazy_initialize=True, + ) + send_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 2), + program=send_prog, + lazy_initialize=True, + ) + recv_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 3), + program=recv_prog, + lazy_initialize=True, + vars_to_dtype=vars_to_dtype, + vars_to_shape=vars_to_shape, + ) + end_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 4), + program=end_prog, + lazy_initialize=True, + ) + + # add dependencies for task nodes intra stage + inf = -1 + pp_buff_size = int(pp_stages - cur_pp_stage) + start_task_node.add_downstream_task( + cond_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + start_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_upstream_task( + start_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + 
cond_task_node.task_id(), + "'s upstream is:", + start_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_downstream_task(send_task_node.task_id(), inf) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + send_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_upstream_task(cond_task_node.task_id(), inf) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_downstream_task( + recv_task_node.task_id(), pp_buff_size + ) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + recv_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + recv_task_node.add_upstream_task(send_task_node.task_id(), pp_buff_size) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + send_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + recv_task_node.add_downstream_task( + cond_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_upstream_task( + recv_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s upstream is:", + recv_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_downstream_task( + end_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + end_task_node.task_id(), + ", buffer size is:", + inf, + ) + end_task_node.add_upstream_task( + cond_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + end_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + + # add dependencies for task nodes inter stage + # get upstream ranks and downstream ranks of cur_rank + up_down_streams = self._dist_context.up_down_streams + pp_upstream_ranks = up_down_streams.ups(cur_rank) + pp_downstream_ranks = up_down_streams.downs(cur_rank) + + for upstream_rank in pp_upstream_ranks: + upstream_pp_stage = self._get_pp_stage(upstream_rank) + if upstream_pp_stage < pp_stages - 1: + upstream_task_id = int(upstream_rank * num_of_functionality + 2) + send_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + else: + upstream_task_id = int(upstream_rank * num_of_functionality + 3) + recv_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + for downstream_rank in pp_downstream_ranks: + if cur_pp_stage < pp_stages - 1: + downstream_task_id = int( + downstream_rank * num_of_functionality + 2 + ) + send_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + else: + downstream_task_id = int( + downstream_rank * num_of_functionality + 3 + ) + recv_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + + task_id_to_rank = {} + for i in range(nrank): + for j in range(num_of_functionality): + task_id_to_rank[int(i * num_of_functionality + j)] = i + self._program._pipeline_opt 
= {
+            "fleet_opt": {
+                'tasks': [
+                    start_task_node,
+                    cond_task_node,
+                    send_task_node,
+                    recv_task_node,
+                    end_task_node,
+                ],
+                'task_id_to_rank': task_id_to_rank,
+                'num_micro_batches': self._acc_steps,
+                'inference_generation': True,
+            }
+        }
diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py
new file mode 100644
index 00000000000000..4a54b99df0dbad
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/auto_parallel/generation_pipeline_pass_unittest.py
@@ -0,0 +1,176 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from paddle.distributed.fleet import auto
+
+_g_mesh = auto.ProcessMesh([0, 1])
+PP_MESH_0 = auto.ProcessMesh([0])
+PP_MESH_1 = auto.ProcessMesh([1])
+
+image_size = 1024
+class_num = 10
+
+
+class MyDataset(paddle.io.Dataset):
+    def __init__(self, num_samples):
+        super().__init__()
+        self.num_samples = num_samples
+
+    def __getitem__(self, index):
+        input = np.random.uniform(size=image_size).astype("float32")
+        return input, input
+
+    def __len__(self):
+        return self.num_samples
+
+
+class MLPLayer(nn.Layer):
+    def __init__(
+        self,
+        hidden_size=1024,
+        intermediate_size=4 * 1024,
+        dropout_ratio=0.1,
+        initializer_range=0.02,
+    ):
+        super().__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        weight_attr = paddle.ParamAttr(
+            initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)
+        )
+        bias_attr = None
+
+        self.linear0 = nn.Linear(
+            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr
+        )
+        self.linear1 = nn.Linear(
+            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr
+        )
+        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
+        self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
+
+    def forward(self, input):
+        out = auto.shard_op(self.norm, PP_MESH_0)(input)
+        out = self.linear0(out)
+        out = F.gelu(out, approximate=True)
+        out = auto.shard_op(self.linear1, PP_MESH_1)(out)
+        out = self.dropout(out)
+        out = self.linear2(out)
+        return out
+
+
+class GEN(nn.Layer):
+    def __init__(self, mlp):
+        super().__init__()
+        self.mlp = mlp
+
+    def forward(self, input):
+        model_kwargs = {}
+
+        output = self.mlp(input)
+
+        cur_step = paddle.full([1], 0, dtype='int64')
+        total_step = paddle.full([1], 10, dtype='int64')
+
+        model_kwargs['input'] = input
+        model_kwargs['output'] = output
+
+        while cur_step < total_step:
+
+            out = self.mlp(model_kwargs['input'])
+            model_kwargs['res'] = out
+            paddle.increment(cur_step)
+
+            auto.shard_op(paddle.assign, _g_mesh)(model_kwargs['input'], out)
+
+        output = F.gelu(model_kwargs['input'], approximate=True)
+
+        return output, cur_step
+
+
+def
get_model(): + + with paddle.LazyGuard(): + mlp = MLPLayer() + gen = GEN(mlp) + return gen + + +class TestGenerationPipeline(unittest.TestCase): + def test_pp2(self): + + model = get_model() + + strategy = auto.Strategy() + pipeline = strategy.pipeline + pipeline.enable = True + pipeline.schedule_mode = "stream" + pipeline.generation_batch_size = 4 + pipeline.accumulate_steps = 4 + engine = auto.Engine(model, strategy=strategy) + + engine.prepare( + inputs_spec=paddle.static.InputSpec( + shape=[2, 1024], name='input', dtype='float32' + ), + labels_spec=paddle.static.InputSpec( + shape=[2, 1024], name='label', dtype='float32' + ), + mode="eval", + ) + + train_data = MyDataset(50 * 2) + train_dataloader = engine._prepare_dataloader_from_generator( + dataset=train_data, + capacity=70, + iterable=False, + batch_size=2, + epochs=1, + steps_per_epoch=100, + ) + engine._prepare_reader() + + fleet_opt = engine.main_program._pipeline_opt['fleet_opt'] + assert len(fleet_opt['tasks']) == 5 + assert fleet_opt['inference_generation'] + assert fleet_opt['num_micro_batches'] == 4 + num_task_in_rank = 5 + for idx, (task_id, rank_id) in enumerate( + fleet_opt['task_id_to_rank'].items() + ): + assert ( + task_id == rank_id * num_task_in_rank + idx % num_task_in_rank + ) + + train_dataloader._inner_dataloader.start() + try: + engine._executor.run( + engine.main_program, use_program_cache=False, return_numpy=False + ) + except paddle.fluid.core.EOFException: + print("test done") + train_dataloader._inner_dataloader.reset() + train_dataloader._inner_dataloader.start() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py index 029f33f8c647ee..10f78aedd4fb97 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -247,6 +247,7 @@ def test_deepcopy(self): "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", "_pass_context", + "_tensor_nodes_with_same_name", ] for i in range(len(copy_list)): diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py index c83c098959c13e..411cee39eca545 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_bf16.py @@ -203,7 +203,7 @@ def test_bf16_pass(self): bf16_o1_engine.prepare( inputs_spec=inputs_spec, labels_spec=labels_spec, mode="train" ) - self.check_program(bf16_o1_engine._dist_main_progs["train"][0]) + self.check_program(bf16_o1_engine.main_program) print("BF16!check program successfully!") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py new file mode 100644 index 00000000000000..598359cd516859 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_generation_pipeline.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import sys +import tempfile +import unittest + + +class TestGenerationPipeline(unittest.TestCase): + def test_pp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join( + file_dir, "generation_pipeline_pass_unittest.py" + ) + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = ( + [sys.executable, "-u"] + + coverage_args + + [ + "-m", + "paddle.distributed.launch", + "--devices", + "0,1", + "--log_dir", + tmp_dir.name, + launch_model_path, + ] + ) + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index f7c4fb0e94e891..11c817b9baeea8 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -180,6 +180,9 @@ def check_send_recv_result(dist_main_prog, rank_id): return send_result and recv_result +@unittest.skipIf( + not paddle.is_compiled_with_cuda(), "core is not compiled with CUDA" +) class TestMLPReshard(unittest.TestCase): def test_mlp_serial(self): global _global_parallel_strategy From 3ece0ece6428f54e3e2060299e0a43dc005eb24f Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 12 Apr 2023 06:40:25 -0500 Subject: [PATCH 103/156] fix bug of mp (#52789) --- .../distributed/fleet/layers/mpu/mp_ops.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py index 08093710b3b891..884af3a4414318 100644 --- a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -46,7 +46,15 @@ def _c_identity(tensor, group=None): class c_identity_eager(PyLayer): @staticmethod def forward(ctx, tensor): - return tensor + return _legacy_C_ops.c_identity( + tensor, + 'use_calc_stream', + True, + 'ring_id', + group.id, + 'use_model_parallel', + True, + ) @staticmethod def backward(ctx, dy): @@ -249,7 +257,15 @@ def forward( @staticmethod def backward(ctx, dy): - return dy + return _legacy_C_ops.c_identity( + dy, + 'use_calc_stream', + True, + 'ring_id', + ctx.ring_id, + 'use_model_parallel', + True, + ) return mp_allreduce_eager.apply( tensor, group, use_calc_stream, use_model_parallel From e54e2bc8bcc7f4e75edf449320cbbf8e8047377e Mon Sep 17 00:00:00 2001 From: zqw_1997 <118182234+zhengqiwen1997@users.noreply.github.com> Date: Wed, 12 Apr 2023 20:15:24 +0800 Subject: [PATCH 104/156] Support cuda12+ arch and Hopper arch. Discard 30 and Kepler arch. 
(#52285)

* slight modify

* support cuda12+ arch, Hopper arch and discard 30 arch

* add arch 90 for each paddle_known_gpu_archs12

* for comments
---
 cmake/cuda.cmake | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 9c1d71914bc21c..82c4ec14d9ef89 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -7,28 +7,33 @@ if(WITH_NV_JETSON)
   set(paddle_known_gpu_archs "53 62 72")
   set(paddle_known_gpu_archs10 "53 62 72")
   set(paddle_known_gpu_archs11 "53 62 72 87")
+  set(paddle_known_gpu_archs12 "53 62 72 87 90")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Package")
   add_definitions(-DNEW_RELEASE_ALL)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "50 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 61 70 75 80 90")
 elseif(NEW_RELEASE_PYPI)
   message("Using New Release Strategy - Cubin Package")
   add_definitions(-DNEW_RELEASE_PYPI)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
   set(paddle_known_gpu_archs10 "")
   set(paddle_known_gpu_archs11 "61 70 75 80")
+  set(paddle_known_gpu_archs12 "61 70 75 80 90")
 elseif(NEW_RELEASE_JIT)
   message("Using New Release Strategy - JIT Package")
   add_definitions(-DNEW_RELEASE_JIT)
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
-  set(paddle_known_gpu_archs10 "35 50 60 70 75")
-  set(paddle_known_gpu_archs11 "35 50 60 70 75 80")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 86 90")
+  set(paddle_known_gpu_archs10 "50 60 70 75")
+  set(paddle_known_gpu_archs11 "50 60 70 75 80")
+  set(paddle_known_gpu_archs12 "50 60 70 75 80 90")
 else()
-  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
+  set(paddle_known_gpu_archs "50 52 60 61 70 75 80 90")
   set(paddle_known_gpu_archs10 "50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
+  set(paddle_known_gpu_archs12 "52 60 61 70 75 80 90")
 endif()

######################################################################################
@@ -100,12 +105,12 @@ endfunction()
 function(select_nvcc_arch_flags out_variable out_arch_bin)
   # List of arch names
   set(archs_names
-      "Kepler"
       "Maxwell"
       "Pascal"
       "Volta"
       "Turing"
       "Ampere"
+      "Hopper"
       "All"
       "Manual")
   set(archs_name_default "Auto")
@@ -144,9 +149,7 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
     unset(CUDA_ARCH_PTX CACHE)
   endif()

-  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
-    set(cuda_arch_bin "30 35")
-  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+  if(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     if(WITH_NV_JETSON)
       set(cuda_arch_bin "53")
     else()
@@ -176,6 +179,8 @@ function(select_nvcc_arch_flags out_variable out_arch_bin)
         set(cuda_arch_bin "80 86")
       endif()
     endif()
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
+    set(cuda_arch_bin "90")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
   elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
@@ -266,6 +271,11 @@ elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
+elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 13.0) # CUDA 12.0+
+  set(paddle_known_gpu_archs "${paddle_known_gpu_archs12}
86") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() if(NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) From 57201d9d2b5504f41d30c33781f4d8c9c0ff36df Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 12 Apr 2023 20:23:43 +0800 Subject: [PATCH 105/156] add autogen code for clip_by_norm op (#52743) * add autogen code for clip_by_norm op * bug fixed --- paddle/fluid/operators/clip_by_norm_op.cc | 29 ---------------------- paddle/phi/api/yaml/legacy_ops.yaml | 8 ------ paddle/phi/api/yaml/op_compat.yaml | 6 +++++ paddle/phi/api/yaml/ops.yaml | 9 +++++++ paddle/phi/ops/compat/clip_by_norm_sig.cc | 30 ----------------------- 5 files changed, 15 insertions(+), 67 deletions(-) delete mode 100644 paddle/fluid/operators/clip_by_norm_op.cc delete mode 100644 paddle/phi/ops/compat/clip_by_norm_sig.cc diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc deleted file mode 100644 index 3805e11d752e3e..00000000000000 --- a/paddle/fluid/operators/clip_by_norm_op.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(clip_by_norm, - ClipByNormInferShapeFunctor, - PD_INFER_META(phi::ClipByNormInferMeta)); - -REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, - ops::ClipByNormOp, - ops::ClipByNormOpMaker, - ClipByNormInferShapeFunctor); diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 100329f555bea0..abd42601a8fa00 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -222,14 +222,6 @@ kernel : func : class_center_sample -- op : clip_by_norm - args : (Tensor x, float max_norm) - output : Tensor(out) - infer_meta : - func : ClipByNormInferMeta - kernel : - func : clip_by_norm - - op : coalesce_tensor args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) output : Tensor[](output){input.size()}, Tensor(fused_output) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index bfbab2d52af4ea..44f065feb7d72d 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -356,6 +356,12 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] +- op : clip_by_norm + inputs : + x : X + outputs : + out : Out + - op : complex backward : complex_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 980505ddeb2f1d..20adbd31aca06d 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -355,6 +355,15 @@ data_type : x backward : clip_grad +- op : clip_by_norm + args : (Tensor x, float max_norm) + output : Tensor(out) + infer_meta : + func : ClipByNormInferMeta + kernel : + func : clip_by_norm {dense -> dense} + clip_by_norm_sr {selected_rows -> selected_rows} + - op : complex args : (Tensor real, Tensor imag) output : Tensor diff --git a/paddle/phi/ops/compat/clip_by_norm_sig.cc b/paddle/phi/ops/compat/clip_by_norm_sig.cc deleted file mode 100644 index 8a2cecc0293d3f..00000000000000 --- a/paddle/phi/ops/compat/clip_by_norm_sig.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature ClipByNormOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("X")) { - return KernelSignature("clip_by_norm", {"X"}, {"max_norm"}, {"Out"}); - } else if (ctx.IsSelectedRowsInput("X")) { - return KernelSignature("clip_by_norm_sr", {"X"}, {"max_norm"}, {"Out"}); - } - return KernelSignature("unregistered", {}, {}, {}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(clip_by_norm, phi::ClipByNormOpArgumentMapping); From 94cc1d6b4b833d17b17b1aeafae60682ee51398c Mon Sep 17 00:00:00 2001 From: gaoziyuan <88373061+gzy19990617@users.noreply.github.com> Date: Thu, 13 Apr 2023 10:13:56 +0800 Subject: [PATCH 106/156] =?UTF-8?q?[Hackathon=20NO.75]=20=E4=B8=BA=20Paddl?= =?UTF-8?q?e-TRT=20=E6=B7=BB=E5=8A=A0=20expend=5Fas=5Fv2=20=E7=AE=97?= =?UTF-8?q?=E5=AD=90=20(#51028)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --------- Co-authored-by: Zhang Jun --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../tensorrt/convert/expand_v2_op.cc | 78 ++++-- paddle/fluid/inference/tensorrt/op_teller.cc | 30 ++- .../test_trt_convert_expand_as_v2.py | 252 ++++++++++++++++++ 4 files changed, 334 insertions(+), 27 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_as_v2.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6523e5cfced3ea..790c32b31e1294 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2685,6 +2685,7 @@ USE_TRT_CONVERTER(tanh_shrink) USE_TRT_CONVERTER(logsigmoid) USE_TRT_CONVERTER(lookup_table) USE_TRT_CONVERTER(expand_v2) +USE_TRT_CONVERTER(expand_as_v2) USE_TRT_CONVERTER(take_along_axis) USE_TRT_CONVERTER(skip_groupnorm_act) USE_TRT_CONVERTER(preln_groupnorm_act) diff --git a/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc index 0c36811145a6d5..452f1f8b92057d 100644 --- a/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/expand_v2_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -18,12 +18,12 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-class ExpandV2OpConverter : public OpConverter {
+class ExpandOpConverter : public OpConverter {
  public:
   void operator()(const framework::proto::OpDesc& op,
                   const framework::Scope& scope,
                   bool test_mode) override {
-    VLOG(3) << "convert a expand_v2 op to trt expand layer.";
+    VLOG(3) << "convert a paddle " << op_type_ << " op to trt expand layer.";
     framework::OpDesc op_desc(op, nullptr);
     auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
     auto inputs = op_desc.Inputs();
@@ -33,25 +33,40 @@ class ExpandV2OpConverter : public OpConverter {

     nvinfer1::ITensor* shape_tensor = nullptr;
     int32_t shape_rank = 0;
-    if (inputs.find("Shape") != inputs.end() &&
-        op_desc.Input("Shape").size() >= 1) {
-      shape_tensor = engine_->GetITensor(op_desc.Input("Shape")[0]);
-      shape_rank = shape_tensor->getDimensions().d[0];
-    } else if (inputs.find("expand_shapes_tensor") != inputs.end() &&
-               op_desc.Input("expand_shapes_tensor").size() >= 1) {
-      int shape_size = op_desc.Input("expand_shapes_tensor").size();
-      std::vector<nvinfer1::ITensor*> shape_tensors;
-      for (int i = 0; i < shape_size; ++i) {
-        shape_tensors.push_back(
-            engine_->GetITensor(op_desc.Input("expand_shapes_tensor")[i]));
+
+    if (op_type_ == "expand_v2") {
+      if (inputs.find("Shape") != inputs.end() &&
+          op_desc.Input("Shape").size() >= 1) {
+        shape_tensor = engine_->GetITensor(op_desc.Input("Shape")[0]);
+        shape_rank = shape_tensor->getDimensions().nbDims;
+      } else if (inputs.find("expand_shapes_tensor") != inputs.end() &&
+                 op_desc.Input("expand_shapes_tensor").size() >= 1) {
+        int shape_size = op_desc.Input("expand_shapes_tensor").size();
+        std::vector<nvinfer1::ITensor*> shape_tensors;
+        for (int i = 0; i < shape_size; ++i) {
+          shape_tensors.push_back(
+              engine_->GetITensor(op_desc.Input("expand_shapes_tensor")[i]));
+        }
+        shape_tensor = Concat(shape_tensors);
+        shape_rank = shape_size;
+      } else {
+        std::vector<int> shape =
+            PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("shape"));
+        shape_tensor =
+            Add1DConstantLayer(shape, output_name + "_shape_tensor_");
+        shape_rank = shape.size();
+      }
+    } else if (op_type_ == "expand_as_v2") {
+      if (inputs.find("Y") != inputs.end()) {
+        shape_tensor = engine_->GetITensor(op_desc.Input("Y")[0]);
+        shape_rank = shape_tensor->getDimensions().nbDims;
+      } else {
+        std::vector<int> shape = PADDLE_GET_CONST(
+            std::vector<int>, op_desc.GetAttr("target_shape"));
+        shape_tensor =
+            Add1DConstantLayer(shape, output_name + "_target_shape_tensor_");
+        shape_rank = shape.size();
       }
-      shape_tensor = Concat(shape_tensors);
-      shape_rank = shape_size;
-    } else {
-      std::vector<int> shape =
-          PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("shape"));
-      shape_tensor = Add1DConstantLayer(shape, output_name + "_shape_tensor_");
-      shape_rank = shape.size();
     }

     nvinfer1::ITensor* input_shape_tensor;
@@ -68,8 +83,7 @@
       input_shape_tensor = Shape(input);
     }

-    auto* shuffle = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-    shuffle->setInput(1, *input_shape_tensor);
+    auto* newInputTensor = Reshape(input, input_shape_tensor);

     std::vector<int32_t> start_vec(shape_rank, 0);
     nvinfer1::Dims start;
@@ -91,13 +105,26 @@
     auto strides_tensor = Min(one_tensor, input_sub_tensor);

     auto layer = TRT_ENGINE_ADD_LAYER(
-        engine_, Slice, *shuffle->getOutput(0), start, size, stride);
+        engine_, Slice, *newInputTensor, start, size, stride);
     layer->setInput(1, *starts_tensor);
     layer->setInput(2, *sizes_tensor);
     layer->setInput(3, *strides_tensor);

-    RreplenishLayerAndOutput(layer, "expand_v2", {output_name}, test_mode);
+    RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
   }
+
+ protected:
+  std::string op_type_;
+};
+
+class ExpandV2OpConverter : public ExpandOpConverter {
+ public:
+  ExpandV2OpConverter() { op_type_ = "expand_v2"; }
+};
+
+class ExpandAsV2OpConverter : public ExpandOpConverter {
+ public:
+  ExpandAsV2OpConverter() { op_type_ = "expand_as_v2"; }
 };

 }  // namespace tensorrt
@@ -105,3 +132,4 @@ class ExpandV2OpConverter : public OpConverter {
 }  // namespace paddle

 REGISTER_TRT_OP_CONVERTER(expand_v2, ExpandV2OpConverter);
+REGISTER_TRT_OP_CONVERTER(expand_as_v2, ExpandAsV2OpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 85f5c003746c20..9ce57fe6aee912 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -2654,11 +2654,35 @@ struct SimpleOpTypeSetTeller : public Teller {
       }
     }

-    if (op_type == "expand_v2") {
+    if (op_type == "expand_as_v2" || op_type == "expand_v2") {
       if (!with_dynamic_shape) {
+        VLOG(3) << "the " << op_type
+                << " does not support "
+                   "static shape yet";
         return false;
       }
-      if (!desc.HasAttr("shape")) {
+
+      auto inputs = desc.Inputs();
+      if (op_type == "expand_as_v2") {
+        if (!desc.HasAttr("target_shape") && inputs.find("Y") == inputs.end()) {
+          VLOG(3)
+              << "expand_as_v2 op must have input(Y) or attr(target_shape).";
+          return false;
+        }
+      } else if (op_type == "expand_v2") {
+        if (!desc.HasAttr("shape") && inputs.find("Shape") == inputs.end() &&
+            inputs.find("expand_shapes_tensor") == inputs.end()) {
+          VLOG(3) << "expand_v2 op must have input(Shape) or "
+                     "input(expand_shapes_tensor) or attr(shape).";
+          return false;
+        }
+      }
+
+      auto* block = desc.Block();
+      if (block == nullptr) {
+        VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
+                   "Developers need to check whether block_desc is passed in "
+                   "the pass.";
         return false;
       }
     }
@@ -2921,6 +2945,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "skip_merge_layernorm",
      "lookup_table_v2",
      "expand_v2",
+      "expand_as_v2",
      "fuse_eleadd_transpose",
      "skip_groupnorm_act",
      "preln_groupnorm_act",
@@ -3080,6 +3105,7 @@ struct SimpleOpTypeSetTeller : public Teller {
      "lookup_table",
      "lookup_table_v2",
      "expand_v2",
+      "expand_as_v2",
      "fuse_eleadd_transpose",
      "skip_groupnorm_act",
      "preln_groupnorm_act",
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_as_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_as_v2.py
new file mode 100644
index 00000000000000..46b3a2232e4711
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_expand_as_v2.py
@@ -0,0 +1,252 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
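+
+# Two suites follow: TrtConvertExpandASV2Test drives expand_as_v2 through
+# attr(target_shape), while TrtConvertExpandV2Test2 feeds the target shape
+# in through input(Y) produced by a fill_constant op.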
+ +import unittest +from functools import partial +from typing import Any, Dict, List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertExpandASV2Test(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + if len(attrs[0]['target_shape']) < self.dims: + return False + if self.dims == 1: + if len(attrs[0]['target_shape']) == 4: + return False + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + if self.dims == 4: + self.input_shape = [1, 8, 1, 32] + return np.random.random([1, 8, 1, 32]).astype(np.float32) + elif self.dims == 3: + self.input_shape = [1, 32, 32] + return np.random.random([1, 32, 32]).astype(np.float32) + elif self.dims == 2: + self.input_shape = [1, 32] + return np.random.random([1, 32]).astype(np.float32) + elif self.dims == 1: + self.input_shape = [32] + return np.random.random([32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for shape in [ + [10, 8, 32, 32], + [2, 8, 32, 32], + [8, 32, 32], + [2, 32], + [32], + ]: + dics = [ + { + "target_shape": shape, + }, + ] + self.dims = dims + + ops_config = [ + { + "op_type": "expand_as_v2", + "op_inputs": {"X": ["expand_v2_input"]}, + "op_outputs": {"Out": ["expand_v2_out"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "expand_v2_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["expand_v2_out"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 4: + self.dynamic_shape.min_input_shape = { + "expand_v2_input": [1, 8, 1, 32] + } + self.dynamic_shape.max_input_shape = { + "expand_v2_input": [10, 8, 1, 32] + } + self.dynamic_shape.opt_input_shape = { + "expand_v2_input": [1, 8, 1, 32] + } + elif self.dims == 3: + self.dynamic_shape.min_input_shape = { + "expand_v2_input": [1, 32, 32] + } + self.dynamic_shape.max_input_shape = { + "expand_v2_input": [8, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "expand_v2_input": [1, 32, 32] + } + elif self.dims == 2: + self.dynamic_shape.min_input_shape = { + "expand_v2_input": [1, 32] + } + self.dynamic_shape.max_input_shape = { + "expand_v2_input": [4, 32] + } + self.dynamic_shape.opt_input_shape = { + "expand_v2_input": [1, 32] + } + elif self.dims == 1: + self.dynamic_shape.min_input_shape = {"expand_v2_input": [32]} + self.dynamic_shape.max_input_shape = {"expand_v2_input": [64]} + self.dynamic_shape.opt_input_shape = {"expand_v2_input": [32]} + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if dynamic_shape: + return 1, 2 + else: + return 0, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + clear_dynamic_shape() + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + 
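+        # FP16 engines accumulate larger numeric error, so the tolerance
+        # is relaxed to 1e-3 for the half-precision run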
yield self.create_inference_config(), generate_trt_nodes_num(
            attrs, True
        ), 1e-3
+
+    def add_skip_trt_case(self):
+        pass
+
+    def test(self):
+        self.add_skip_trt_case()
+        self.run_test()
+
+
+class TrtConvertExpandV2Test2(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        attrs = [
+            program_config.ops[i].attrs for i in range(len(program_config.ops))
+        ]
+        return True
+
+    def sample_program_configs(self):
+        def generate_input1(attrs: List[Dict[str, Any]]):
+            if self.dims == 1:
+                self.input_shape = [1]
+                return np.random.random([1]).astype(np.float32)
+
+        for dims in [1]:
+            for shape in [[10]]:
+                dics = [
+                    {
+                        "target_shape": shape,
+                    },
+                ]
+                self.dims = dims
+                dics_input = [
+                    {"X": ["expand_v2_input"], "Y": ["shapeT1_data"]},
+                ]
+                ops_config = [
+                    {
+                        "op_type": "fill_constant",
+                        "op_inputs": {},
+                        "op_outputs": {"Out": ["shapeT1_data"]},
+                        "op_attrs": {
+                            "dtype": 2,
+                            "str_value": "10",
+                            "shape": [1],
+                        },
+                    },
+                    {
+                        "op_type": "expand_as_v2",
+                        "op_inputs": dics_input[0],
+                        "op_outputs": {"Out": ["expand_v2_out"]},
+                        "op_attrs": dics[0],
+                    },
+                ]
+                ops = self.generate_op_config(ops_config)
+                program_config = ProgramConfig(
+                    ops=ops,
+                    weights={},
+                    inputs={
+                        "expand_v2_input": TensorConfig(
+                            data_gen=partial(generate_input1, dics)
+                        )
+                    },
+                    outputs=["expand_v2_out"],
+                )
+
+                yield program_config
+
+    def sample_predictor_configs(
+        self, program_config
+    ) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape():
+            if self.dims == 1:
+                self.dynamic_shape.min_input_shape = {"expand_v2_input": [1]}
+                self.dynamic_shape.max_input_shape = {"expand_v2_input": [1]}
+                self.dynamic_shape.opt_input_shape = {"expand_v2_input": [1]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        clear_dynamic_shape()
+        # for dynamic_shape
+        generate_dynamic_shape()
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        # fill_constant will be folded by the constant folding pass!
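+        # leaving a single TRT engine in the graph, which matches the
+        # (1, 2) node-count expectation below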
+ yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-3 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From c5e1987c97b16b3b66c2f6e036a7abb9c0b20dd1 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Thu, 13 Apr 2023 10:14:51 +0800 Subject: [PATCH 107/156] fix ninja error (#52796) * fix ninja error * fix_ninja_error_qa --- cmake/external/gtest.cmake | 97 ++++++++++++++++++++++++++------------ 1 file changed, 66 insertions(+), 31 deletions(-) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 3d79b154b80f45..be6d9cdde61e73 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -69,37 +69,72 @@ if(NOT WIN32 AND ${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 12.0) set(GTEST_PATCH_COMMAND patch -d ${GTEST_SOURCE_DIR}/googletest/src < ${native_src}) endif() -ExternalProject_Add( - extern_gtest - ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY ${GTEST_REPOSITORY} - GIT_TAG ${GTEST_TAG} - DEPENDS ${GTEST_DEPENDS} - PREFIX ${GTEST_PREFIX_DIR} - UPDATE_COMMAND "" - PATCH_COMMAND ${GTEST_PATCH_COMMAND} - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_GMOCK=ON - -Dgtest_disable_pthreads=ON - -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${GTEST_LIBRARIES} - BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} - BUILD_BYPRODUCTS ${GMOCK_LIBRARIES}) +if(WIN32) + ExternalProject_Add( + extern_gtest + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY ${GTEST_REPOSITORY} + GIT_TAG ${GTEST_TAG} + DEPENDS ${GTEST_DEPENDS} + PREFIX ${GTEST_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND ${GTEST_PATCH_COMMAND} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GTEST_LIBRARIES} + BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} + BUILD_BYPRODUCTS ${GMOCK_LIBRARIES}) +else() + ExternalProject_Add( + extern_gtest + ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} + GIT_REPOSITORY 
${GTEST_REPOSITORY} + GIT_TAG ${GTEST_TAG} + DEPENDS ${GTEST_DEPENDS} + PREFIX ${GTEST_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND ${GTEST_PATCH_COMMAND} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${GTEST_CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + CMAKE_GENERATOR "Unix Makefiles" + BUILD_BYPRODUCTS ${GTEST_LIBRARIES} + BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} + BUILD_BYPRODUCTS ${GMOCK_LIBRARIES}) +endif() add_library(gtest STATIC IMPORTED GLOBAL) set_property(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) From 710b664d4726cb168f99176dcff94883e75208f1 Mon Sep 17 00:00:00 2001 From: RedContritio Date: Thu, 13 Apr 2023 10:15:05 +0800 Subject: [PATCH 108/156] support auto generate for op adamax optimizer (#52702) --- .../fluid/operators/optimizers/adamax_op.cc | 112 ------------------ .../optimizers/unity_build_rule.cmake | 2 - paddle/phi/api/yaml/legacy_ops.yaml | 11 -- paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/ops.yaml | 11 ++ paddle/phi/ops/compat/adamax_sig.cc | 49 -------- 6 files changed, 17 insertions(+), 174 deletions(-) delete mode 100644 paddle/fluid/operators/optimizers/adamax_op.cc delete mode 100644 paddle/phi/ops/compat/adamax_sig.cc diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc deleted file mode 100644 index 881b2eee6afed3..00000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/multiary.h"
-
-namespace paddle {
-namespace operators {
-
-class AdamaxOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "Param"),
-                          ctx.GetPlace());
-  }
-};
-
-class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param", "(Tensor) Input parameter");
-    AddInput("Grad", "(Tensor) Input gradient");
-    AddInput("LearningRate", "(Tensor) Learning rate");
-    AddInput("Moment", "(Tensor) First moment");
-    AddInput("InfNorm",
-             "(Tensor) "
-             "Input exponentially weighted infinity norm");
-    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
-    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
-    AddOutput("ParamOut", "(Tensor) Output parameter");
-    AddOutput("MomentOut", "(Tensor) Output first moment");
-    AddOutput("InfNormOut",
-              "(Tensor) "
-              "Output exponentially weighted infinity norm");
-    AddOutput("MasterParamOut",
-              "The updated FP32 master weight for AMP. "
-              "It shared memory with Input(MasterParam).")
-        .AsDispensable();
-
-    AddAttr<float>("beta1",
-                   "(float, default 0.9) "
-                   "Exponential decay rate for the "
-                   "1st moment estimates.")
-        .SetDefault(0.9f);
-    AddAttr<float>("beta2",
-                   "(float, default 0.999) "
-                   "exponential decay rate for the weighted "
-                   "infinity norm estimates.")
-        .SetDefault(0.999f);
-    AddAttr<float>("epsilon",
-                   "(float, default 1.0e-8) "
-                   "Constant for numerical stability")
-        .SetDefault(1.0e-8f);
-    AddAttr<bool>("multi_precision",
-                  "(bool, default false) "
-                  "Whether to use multi-precision during weight updating.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Adamax Optimizer.
-
-We implement the Adamax optimizer from Section 7 of the Adam
-paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
-Adam algorithm based on the infinity norm.
-
-Adamax updates:
-
-$$
-moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\
-inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\
-learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\
-param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
-$$
-
-The original paper does not have an epsilon attribute.
-However, it is added here for numerical stability to prevent the
-division by 0 error.
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(adamax,
-                            AdamaxInferMetaFunctor,
-                            PD_INFER_META(phi::AdamaxInferMeta));
-
-REGISTER_OPERATOR(
-    adamax,
-    ops::AdamaxOp,
-    ops::AdamaxOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
-    AdamaxInferMetaFunctor);
diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
index 676d554bc00733..6936175d8743b8 100644
--- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake
+++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake
@@ -10,7 +10,6 @@ register_unity_group(
   lars_momentum_op.cc
   proximal_adagrad_op.cc
   adam_op.cc
-  adamax_op.cc
   dgc_momentum_op.cc
   proximal_gd_op.cc
   decayed_adagrad_op.cc
@@ -26,7 +25,6 @@ register_unity_group(
   proximal_adagrad_op.cu
   adagrad_op.cu
   adam_op.cu
-  adamax_op.cu
   decayed_adagrad_op.cu
   adadelta_op.cu
   lamb_op.cu
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index abd42601a8fa00..cd499a2d049e72 100755
--- a/paddle/phi/api/yaml/legacy_ops.yaml
+++ b/paddle/phi/api/yaml/legacy_ops.yaml
@@ -33,17 +33,6 @@
   optional : master_param, skip_update
   inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)

-- op : adamax_
-  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1, float beta2, float epsilon, bool multi_precision)
-  output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out), Tensor(master_param_outs)
-  infer_meta :
-    func : AdamaxInferMeta
-  kernel :
-    func : adamax
-    data_type : param
-  optional : master_param
-  inplace : (param -> param_out), (moment -> avg_squared_grad_out), (inf_norm -> avg_squared_update_out), (master_param ->master_param_outs)
-
 - op : adamw_
   args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
   output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml
index 44f065feb7d72d..cf5453a3846ee2 100644
--- a/paddle/phi/api/yaml/op_compat.yaml
+++ b/paddle/phi/api/yaml/op_compat.yaml
@@ -50,6 +50,12 @@
   outputs :
     { param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut }

+- op : adamax_
+  inputs :
+    {param : Param, grad: Grad, learning_rate : LearningRate, moment : Moment, inf_norm : InfNorm, beta1_pow : Beta1Pow, master_param : MasterParam}
+  outputs :
+    {param_out : ParamOut, moment_out : MomentOut, inf_norm_out : InfNormOut, master_param_out : MasterParamOut}
+
 - op : add (elementwise_add)
   backward : add_grad (elementwise_add_grad)
   extra :
diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml
index 20adbd31aca06d..2ad4d563d9f04f 100644
--- a/paddle/phi/api/yaml/ops.yaml
+++ b/paddle/phi/api/yaml/ops.yaml
@@ -44,6 +44,17 @@
   optional : master_param, master_param_out
   inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out)

+- op : adamax_
+  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1 = 0.9f, float beta2 = 0.999f, float epsilon = 1.0e-8f, bool multi_precision = false)
+  output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out)
+  infer_meta :
+    func : AdamaxInferMeta
+  kernel :
+    func : adamax
+    data_type : param
+  optional : master_param, master_param_out
+  inplace : (param -> param_out), (moment -> moment_out), (inf_norm -> inf_norm_out), (master_param -> master_param_out)
+
 - op : addmm
   args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0)
   output : Tensor
diff --git a/paddle/phi/ops/compat/adamax_sig.cc b/paddle/phi/ops/compat/adamax_sig.cc
deleted file mode 100644
index 9c012de3771fba..00000000000000
--- a/paddle/phi/ops/compat/adamax_sig.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <string>
-
-#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/utils/small_vector.h"
-
-namespace phi {
-
-KernelSignature AdamaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  paddle::small_vector<const char*> in_names = {"Param",
-                                                "Grad",
-                                                "LearningRate",
-                                                "Moment",
-                                                "InfNorm",
-                                                "Beta1Pow",
-                                                "MasterParam"};
-  paddle::small_vector<const char*> out_names = {
-      "ParamOut", "MomentOut", "InfNormOut", "MasterParamOut"};
-  paddle::small_vector<const char*> attr_names;
-  attr_names.emplace_back("beta1");
-  attr_names.emplace_back("beta2");
-  attr_names.emplace_back("epsilon");
-  attr_names.emplace_back("multi_precision");
-
-  if (ctx.IsDenseTensorInput("Grad")) {
-    return KernelSignature("adamax",
-                           std::move(in_names),
-                           std::move(attr_names),
-                           std::move(out_names));
-  } else {
-    return KernelSignature("unregistered", {}, {}, {});
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(adamax, phi::AdamaxOpArgumentMapping);
From fa8abeec1645a45eefb7681bb1cf1dc7f2332b30 Mon Sep 17 00:00:00 2001
From: csy0225 <78470701+csy0225@users.noreply.github.com>
Date: Thu, 13 Apr 2023 10:15:55 +0800
Subject: [PATCH 109/156] =?UTF-8?q?[XPU]=20Fix=20instance=5Fnorm=E3=80=81c?=
 =?UTF-8?q?onv2d=5Fxpu=E3=80=81inplace=20optimizer=20bugs.=20(#52627)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../framework/ir/auto_mixed_precision_pass.cc |  9 +++++
 .../fluid/framework/ir/inplace_op_var_pass.cc |  6 +++
 paddle/phi/backends/xpu/xpu2_op_list.cc       |  3 +-
 paddle/phi/kernels/assign_kernel.cc           |  3 +-
 .../phi/kernels/xpu/instance_norm_kernel.cc   | 39 ++++++++++++++++---
 5 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
index 061e2432eed1a3..fa570394d80f3b 100644
--- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -620,6 +620,15 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert(
     if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
       return true;
     }
+  } else if (GetOpOriginalType(op_desc->Type()) == "instance_norm") {
+    auto vecs = op_desc->Input("Bias");
+    if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+      return true;
+    }
+    vecs = op_desc->Input("Scale");
+    if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+      return true;
+    }
   }
 }

diff --git a/paddle/fluid/framework/ir/inplace_op_var_pass.cc b/paddle/fluid/framework/ir/inplace_op_var_pass.cc
index 0ccac637be36c7..5bbe980daaba7e 100644
--- a/paddle/fluid/framework/ir/inplace_op_var_pass.cc
+++ b/paddle/fluid/framework/ir/inplace_op_var_pass.cc
@@ -36,6 +36,12 @@ bool InplaceOpVarPass::IsValidInplaceOp(
     if (var_node->Name() != x_name) continue;
     if (var_node->Var()->Persistable() || var_node->outputs.size() != 1)
       return false;
+    // in_var_node must not be produced by a feed op.
+    for (auto* pre_op : var_node->inputs) {
+      if (pre_op->Op()->Type() == "feed") {
+        return false;
+      }
+    }
   }

   // in/out_var_node should be not used in multi graphs.
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 419bba4da7417a..ceab85cf551d6b 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -416,7 +416,8 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({phi::DataType::FLOAT32,
                    phi::DataType::INT32,
                    phi::DataType::INT64})},
-     {"instance_norm", XPUKernelSet({phi::DataType::FLOAT32})},
+     {"instance_norm",
+      XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
      {"instance_norm_grad", XPUKernelSet({phi::DataType::FLOAT32})},
      {"iou_similarity", XPUKernelSet({phi::DataType::FLOAT32})},
      {"label_smooth", XPUKernelSet({phi::DataType::FLOAT32})},
diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc
index 09046ef45565e3..9aba3bcb78faf1 100644
--- a/paddle/phi/kernels/assign_kernel.cc
+++ b/paddle/phi/kernels/assign_kernel.cc
@@ -186,6 +186,5 @@ PD_REGISTER_KERNEL(assign_value,
                    int,
                    float,
                    double,
-                   int64_t,
-                   phi::dtype::float16) {}
+                   int64_t) {}
 #endif
diff --git a/paddle/phi/kernels/xpu/instance_norm_kernel.cc b/paddle/phi/kernels/xpu/instance_norm_kernel.cc
--- a/paddle/phi/kernels/xpu/instance_norm_kernel.cc
+++ b/paddle/phi/kernels/xpu/instance_norm_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/instance_norm_kernel.h"
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"

 namespace phi {

@@ -37,9 +38,31 @@ void InstanceNormKernel(const Context& dev_ctx,
   dev_ctx.template Alloc<T>(y);
   dev_ctx.template Alloc<float>(saved_mean);
   dev_ctx.template Alloc<float>(saved_var);
-
+  // scale
   const auto scale_ptr = scale.get_ptr();
-  const auto bias_ptr = bias.get_ptr();
+  const float* scale_data_fp32 = nullptr;
+  DenseTensor scale_data;
+  if (scale_ptr == nullptr) {
+    scale_data.Resize({c});
+    dev_ctx.template Alloc<float>(&scale_data);
+    phi::funcs::set_constant(dev_ctx, &scale_data, static_cast<float>(1));
+    scale_data_fp32 = scale_data.data<float>();
+  } else {
+    // no need to cast
+    scale_data_fp32 = scale_ptr->data<float>();
+  }
+  // bias
+  const float* bias_data_fp32 = nullptr;
+  const auto* bias_ptr = bias.get_ptr();
+  DenseTensor bias_data;
+  if (bias_ptr == nullptr) {
+    bias_data.Resize({c});
+    dev_ctx.template Alloc<float>(&bias_data);
+    phi::funcs::set_constant(dev_ctx, &bias_data, static_cast<float>(0));
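+    // reuse the zero-filled buffer created above as the default bias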
bias_data_fp32 = bias_data.data(); + } else { + bias_data_fp32 = bias_ptr->data(); + } int r = xpu::instance_norm(dev_ctx.x_context(), reinterpret_cast(x.data()), @@ -49,8 +72,8 @@ void InstanceNormKernel(const Context& dev_ctx, h, w, epsilon, - scale_ptr->data(), - bias_ptr->data(), + scale_data_fp32, + bias_data_fp32, saved_mean->data(), saved_var->data(), true); @@ -60,5 +83,9 @@ void InstanceNormKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL( - instance_norm, XPU, ALL_LAYOUT, phi::InstanceNormKernel, float) {} +PD_REGISTER_KERNEL(instance_norm, + XPU, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + phi::dtype::float16) {} From 4c7d50459657ba60cd77cc7efdd788f50fde5c0e Mon Sep 17 00:00:00 2001 From: Kim Yann Date: Thu, 13 Apr 2023 10:29:01 +0800 Subject: [PATCH 110/156] rem cncl in ut & build sh (#52811) * rem cncl in new test * rem cncl in build sh * rem cncl in old test --- paddle/scripts/paddle_build.sh | 18 +++++++----------- .../fluid/tests/unittests/test_dist_base.py | 11 ----------- test/cpp/imperative/CMakeLists.txt | 3 +-- test/cpp/imperative/test_group.cc | 3 +-- 4 files changed, 9 insertions(+), 26 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4693d78e2dc327..a6477a62edeb6e 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -232,7 +232,6 @@ function cmake_base() { -DWITH_PSLIB=${pslib_flag} -DWITH_GLOO=${gloo_flag} -DWITH_LITE=${WITH_LITE:-OFF} - -DWITH_CNCL=${WITH_CNCL:-OFF} -DWITH_XPU=${WITH_XPU:-OFF} -DWITH_IPU=${WITH_IPU:-OFF} -DLITE_GIT_TAG=release/v2.10 @@ -288,7 +287,6 @@ EOF -DLITE_GIT_TAG=release/v2.10 \ -DWITH_XPU=${WITH_XPU:-OFF} \ -DWITH_IPU=${WITH_IPU:-OFF} \ - -DWITH_CNCL=${WITH_CNCL:-OFF} \ -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} \ @@ -2188,7 +2186,7 @@ EOF grep -F -f all_ut_run_by_protobuf3 ut_gpups > ut_run_by_protobuf3_in_gpups #get the difference set of ut_gpups and ut_run_by_protobuf3_in_gpups grep -F -x -v -f ut_run_by_protobuf3_in_gpups ut_gpups > ut_run_in_gpups - + ctest -R ${ut_run_in_gpups} --timeout 120 pip install protobuf==3.20.2 ctest -R ${ut_run_by_protobuf3_in_gpups} --timeout 120 @@ -2908,13 +2906,13 @@ EOF echo "if you use setup.py to compile,please export envs as following in /paddle ..." 
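A quick aside on the instance_norm fix above: the XPU kernel now materializes a ones-filled scale and a zeros-filled bias when the optional parameters are absent, instead of dereferencing null pointers. A minimal NumPy reference of that defaulting, assuming NCHW layout and an illustrative epsilon (this is a sketch of the math, not the kernel itself):

import numpy as np

def instance_norm_ref(x, scale=None, bias=None, eps=1e-5):
    # x is [N, C, H, W]; statistics are computed per (sample, channel) pair.
    c = x.shape[1]
    # Mirror the kernel's defaulting: missing scale -> ones, missing bias -> zeros.
    scale = np.ones(c, dtype=x.dtype) if scale is None else scale
    bias = np.zeros(c, dtype=x.dtype) if bias is None else bias
    mean = x.mean(axis=(2, 3), keepdims=True)
    var = x.var(axis=(2, 3), keepdims=True)
    y = (x - mean) / np.sqrt(var + eps)
    return y * scale.reshape(1, c, 1, 1) + bias.reshape(1, c, 1, 1)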
cat << EOF ======================================== - export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} WITH_GPU=${WITH_GPU:-OFF} WITH_CUDNN_DSO=${WITH_CUDNN_DSO:-OFF} WITH_TENSORRT=${WITH_TENSORRT:-ON} WITH_ROCM=${WITH_ROCM:-OFF} WITH_CINN=${WITH_CINN:-OFF} WITH_DISTRIBUTE=${distibuted_flag} WITH_MKL=${WITH_MKL:-ON} WITH_AVX=${WITH_AVX:-OFF} CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} NEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} NEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} NEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} WITH_PYTHON=${WITH_PYTHON:-ON} CUDNN_ROOT=/usr/ WITH_TESTING=${WITH_TESTING:-ON} WITH_COVERAGE=${WITH_COVERAGE:-OFF} WITH_INCREMENTAL_COVERAGE=${WITH_INCREMENTAL_COVERAGE:-OFF} CMAKE_MODULE_PATH=/opt/rocm/hip/cmake CMAKE_EXPORT_COMPILE_COMMANDS=ON WITH_CONTRIB=${WITH_CONTRIB:-ON} WITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} PY_VERSION=${PY_VERSION:-3.7} CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} WITH_PSCORE=${pscore_flag} WITH_PSLIB=${pslib_flag} WITH_GLOO=${gloo_flag} LITE_GIT_TAG=release/v2.10 WITH_XPU=${WITH_XPU:-OFF} WITH_IPU=${WITH_IPU:-OFF} WITH_CNCL=${WITH_CNCL:-OFF} XPU_SDK_ROOT=${XPU_SDK_ROOT:-""} WITH_LITE=${WITH_LITE:-OFF} WITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} WITH_ARM=${WITH_ARM:-OFF} WITH_STRIP=${WITH_STRIP:-ON} ON_INFER=${ON_INFER:-OFF} WITH_HETERPS=${WITH_HETERPS:-OFF} WITH_GPU_GRAPH=${WITH_GPU_GRAPH:-OFF} WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} CUDA_ARCH_BIN=${CUDA_ARCH_BIN} WITH_RECORD_BUILDTIME=${WITH_RECORD_BUILDTIME:-OFF} WITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} WITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF} WITH_CUDNN_FRONTEND=${WITH_CUDNN_FRONTEND:-OFF} + export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} WITH_GPU=${WITH_GPU:-OFF} WITH_CUDNN_DSO=${WITH_CUDNN_DSO:-OFF} WITH_TENSORRT=${WITH_TENSORRT:-ON} WITH_ROCM=${WITH_ROCM:-OFF} WITH_CINN=${WITH_CINN:-OFF} WITH_DISTRIBUTE=${distibuted_flag} WITH_MKL=${WITH_MKL:-ON} WITH_AVX=${WITH_AVX:-OFF} CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} NEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} NEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} NEW_RELEASE_JIT=${NEW_RELEASE_JIT:-OFF} WITH_PYTHON=${WITH_PYTHON:-ON} CUDNN_ROOT=/usr/ WITH_TESTING=${WITH_TESTING:-ON} WITH_COVERAGE=${WITH_COVERAGE:-OFF} WITH_INCREMENTAL_COVERAGE=${WITH_INCREMENTAL_COVERAGE:-OFF} CMAKE_MODULE_PATH=/opt/rocm/hip/cmake CMAKE_EXPORT_COMPILE_COMMANDS=ON WITH_CONTRIB=${WITH_CONTRIB:-ON} WITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} PY_VERSION=${PY_VERSION:-3.7} CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} WITH_PSCORE=${pscore_flag} WITH_PSLIB=${pslib_flag} WITH_GLOO=${gloo_flag} LITE_GIT_TAG=release/v2.10 WITH_XPU=${WITH_XPU:-OFF} WITH_IPU=${WITH_IPU:-OFF} XPU_SDK_ROOT=${XPU_SDK_ROOT:-""} WITH_LITE=${WITH_LITE:-OFF} WITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} WITH_ARM=${WITH_ARM:-OFF} WITH_STRIP=${WITH_STRIP:-ON} ON_INFER=${ON_INFER:-OFF} WITH_HETERPS=${WITH_HETERPS:-OFF} WITH_GPU_GRAPH=${WITH_GPU_GRAPH:-OFF} WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} CUDA_ARCH_BIN=${CUDA_ARCH_BIN} WITH_RECORD_BUILDTIME=${WITH_RECORD_BUILDTIME:-OFF} WITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} WITH_ONNXRUNTIME=${WITH_ONNXRUNTIME:-OFF} WITH_CUDNN_FRONTEND=${WITH_CUDNN_FRONTEND:-OFF} ======================================== EOF echo "if you use cmake to compile,please Configuring cmake in /paddle/build ..." 
cat <::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_CNCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) paddle::memory::Copy( place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif From 84d34ddd49f0e14e44a8215e43e2486406f29161 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 13 Apr 2023 10:36:26 +0800 Subject: [PATCH 111/156] fix bug only on win (#52839) --- paddle/fluid/inference/api/analysis_predictor.cc | 3 +-- paddle/fluid/pybind/eager_utils.cc | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 790c32b31e1294..d718ab936b5302 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1241,8 +1241,7 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, for (size_t i = 0; i < fetches_.size(); ++i) { auto const &name = idx2fetches_[i]; auto &t = framework::GetVariableTensor(*scope, name); - (*outputs)[i] = - std::move(paddle::Tensor(std::make_shared(t), name)); + (*outputs)[i] = paddle::Tensor(std::make_shared(t), name); } return true; } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 93030c9138fa84..b7ecd196ca2a87 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -770,11 +770,15 @@ PyObject* ToPyObject(const std::vector>& value) { PyObject* ToPyObject(const std::vector& value, bool return_py_none_if_not_initialize) { - // NOTE(liuyuanle): I encountered a bug(access violation) in windows. ref to - // https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin +// NOTE(liuyuanle): I encountered a bug(access violation) in windows. 
ref to +// https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin +#ifdef _WIN32 PyGILState_STATE gstate = PyGILState_Ensure(); +#endif PyObject* result = PyList_New((Py_ssize_t)value.size()); +#ifdef _WIN32 PyGILState_Release(gstate); +#endif for (size_t i = 0; i < value.size(); i++) { if (!value[i].initialized() && return_py_none_if_not_initialize) { From 16c36465fdbe20f7aeda7992ce0f279eb2bb5854 Mon Sep 17 00:00:00 2001 From: lzydev Date: Thu, 13 Apr 2023 10:45:08 +0800 Subject: [PATCH 112/156] Fix bug in cross_entropy in static mode (#52771) * fix bug in cross_entropy in static mode * fix ci-coverage --- paddle/phi/infermeta/binary.cc | 7 ---- .../tests/unittests/test_cross_entropy_op.py | 15 +++++++++ .../test_softmax_with_cross_entropy_op.py | 33 +++++++++++++++++-- python/paddle/nn/functional/loss.py | 22 +++++++++++++ 4 files changed, 68 insertions(+), 9 deletions(-) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 0a3e31054b0f5b..86b44c9a3c531a 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -886,7 +886,6 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, auto logits_dims = logits.dims(); auto labels_dims = label.dims(); auto logits_rank = logits_dims.size(); - auto labels_rank = labels_dims.size(); PADDLE_ENFORCE_GE(axis, -logits_rank, phi::errors::InvalidArgument( @@ -919,12 +918,6 @@ void CrossEntropyWithSoftmaxInferMeta(const MetaTensor& logits, "when not in numeric_stable_mode.")); } - PADDLE_ENFORCE_EQ( - (logits_rank - 1 != labels_rank) && (logits_rank != labels_rank), - false, - phi::errors::InvalidArgument("Expected input_dims - 1 == label_dims " - "or input_dims == label_dims.")); - if (soft_label) { if (config.is_runtime || (logits_dims[axis] > 0 && labels_dims[axis] > 0)) { PADDLE_ENFORCE_EQ(logits_dims[axis], diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py index 8f1a8f8c815a4d..f7bc9f62d0c352 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py @@ -441,6 +441,21 @@ def test_dtype(): self.assertRaises(TypeError, test_dtype) + def test_input_dims(): + with paddle_static_guard(): + # "input_dims - 1 != label_dims and input_dims != label_dims" must be false. 
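The contract this new test (continued just below) pins down is easiest to read from the caller's side. A usage sketch of the rank rule the Python layer now enforces, with illustrative shapes:

import paddle
import paddle.nn.functional as F

logits = paddle.rand([4, 10])            # input_dims == 2
hard = paddle.randint(0, 10, shape=[4])  # label_dims == 1, so input_dims - 1 == label_dims
soft = paddle.rand([4, 10])              # label_dims == 2, so input_dims == label_dims

F.cross_entropy(logits, hard)                   # accepted; hard labels are unsqueezed internally
F.cross_entropy(logits, soft, soft_label=True)  # accepted as-is
# Any other rank combination now raises ValueError up front in Python rather
# than tripping the C++ InferMeta check removed in the binary.cc hunk above.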
+ x3 = paddle.static.data( + name='x3', shape=[-1, 3, 4, 5], dtype="int32" + ) + lab3 = paddle.static.data( + name='lab3', shape=[-1, 3, 4, 5, 6], dtype="int32" + ) + paddle.nn.functional.cross_entropy( + x3, lab3, reduction='none', use_softmax=False + ) + + self.assertRaises(ValueError, test_input_dims) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index 51aa23306481bd..5ea346aec0362c 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest, convert_float_to_uint16, paddle_static_guard from test_softmax_op import stable_softmax import paddle -from paddle.fluid import core +from paddle.fluid import Program, core, program_guard def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1): @@ -974,6 +974,35 @@ def test_check_grad(self): ) +class TestSoftmaxWithCrossEntropyOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + + def test_input_dims1(): + with paddle_static_guard(): + # the input dims of cross_entropy can't be 0, + x1 = paddle.static.data(name='x1', shape=[], dtype="int32") + lab1 = paddle.static.data( + name='lab1', shape=[-1, 3, 4, 5, 6], dtype="int32" + ) + paddle.nn.functional.softmax_with_cross_entropy(x1, lab1) + + self.assertRaises(ValueError, test_input_dims1) + + def test_input_dims2(): + with paddle_static_guard(): + # "input_dims - 1 != label_dims and input_dims != label_dims" must be false. 
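Both softmax_with_cross_entropy and cross_entropy now gate on the same rank predicate before unsqueezing hard labels. Distilled into a standalone sketch (the helper name here is hypothetical; the real checks are inlined in loss.py, as the next hunks show):

def label_rank_ok(input_dims, label_dims):
    # The condition both Python entry points check up front.
    return input_dims - 1 == label_dims or input_dims == label_dims

assert label_rank_ok(2, 1)       # [N, C] logits with [N] hard labels
assert label_rank_ok(2, 2)       # [N, C] logits with [N, C] soft labels
assert not label_rank_ok(4, 5)   # the rank-4 logits / rank-5 labels case these tests use
# When input_dims - 1 == label_dims, the label is unsqueezed along `axis`
# so downstream kernels always see equal-rank tensors.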
+            def test_input_dims2():
+                with paddle_static_guard():
+                    # "input_dims - 1 != label_dims and input_dims != label_dims" must be false.
+                    x2 = paddle.static.data(
+                        name='x2', shape=[-1, 3, 4, 5], dtype="int32"
+                    )
+                    lab2 = paddle.static.data(
+                        name='lab2', shape=[-1, 3, 4, 5, 6], dtype="int32"
+                    )
+                    paddle.nn.functional.softmax_with_cross_entropy(x2, lab2)
+
+            self.assertRaises(ValueError, test_input_dims2)
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index c2c98361c75e75..ab11d302930a2f 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -253,6 +253,20 @@ def fluid_softmax_with_cross_entropy(
     #        Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True,
     #               [1.15328646])
     """
+    input_dims = len(list(logits.shape))
+    if input_dims == 0:
+        raise ValueError('The dimension of input should be larger than zero!')
+
+    label_dims = len(list(label.shape))
+    if input_dims - 1 != label_dims and input_dims != label_dims:
+        raise ValueError(
+            'Expected input_dims - 1 == label_dims or input_dims == label_dims\
+             (got input_dims{}, label_dims{})'.format(
+                input_dims, label_dims
+            )
+        )
+    if input_dims - 1 == label_dims:
+        label = paddle.unsqueeze(label, axis=axis)
     if in_dygraph_mode():
         if core.is_compiled_with_custom_device("npu"):
             if not soft_label:
@@ -2700,6 +2714,14 @@ def cross_entropy(
         if input_dims - 1 == label_dims:
             label = paddle.unsqueeze(label, axis=axis)
 
+    if input_dims - 1 != label_dims and input_dims != label_dims:
+        raise ValueError(
+            'Expected input_dims - 1 == label_dims or input_dims == label_dims\
+             (got input_dims{}, label_dims{})'.format(
+                input_dims, label_dims
+            )
+        )
+
     if in_dygraph_mode():
         if not soft_label:
             valid_label = (

From 5ab07e0470e874715630ba669feb283dab5f3445 Mon Sep 17 00:00:00 2001
From: HongyuJia
Date: Thu, 13 Apr 2023 10:54:36 +0800
Subject: [PATCH 113/156] [Unittest Stream] Fix unittest, change raw_stream->GetStream() (#52810)

---
 test/custom_runtime/custom_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/custom_runtime/custom_op.cc b/test/custom_runtime/custom_op.cc
index 68c06dcd94be26..5ae35d8bda4489 100644
--- a/test/custom_runtime/custom_op.cc
+++ b/test/custom_runtime/custom_op.cc
@@ -199,7 +199,7 @@ std::vector<paddle::Tensor> StreamForward(const paddle::Tensor& x) {
   auto dev_ctx =
       paddle::experimental::DeviceContextPool::Instance().Get(x.place());
   auto custom_ctx = static_cast<const phi::CustomContext*>(dev_ctx);
-  void* stream = custom_ctx->stream();
+  std::shared_ptr<phi::stream::Stream> stream = custom_ctx->GetStream();
   PD_CHECK(stream != nullptr);
   std::cout << "Check stream != nullptr successfully" << std::endl;

From eb93b5c90e5ad62283ed8e4de2d8fff9cec38448 Mon Sep 17 00:00:00 2001
From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com>
Date: Thu, 13 Apr 2023 10:55:29 +0800
Subject: [PATCH 114/156] rename_bilinear_tensor_op (#52745)

---
 paddle/phi/api/yaml/backward.yaml             | 18 ++++++++---------
 paddle/phi/api/yaml/op_compat.yaml            | 12 +++++------
 paddle/phi/api/yaml/ops.yaml                  | 20 +++++++++----------
 paddle/phi/infermeta/backward.cc              | 16 +++++++--------
 paddle/phi/infermeta/backward.h               | 16 +++++++--------
 paddle/phi/kernels/cpu/slice_grad_kernel.cc   |  2 +-
 .../phi/kernels/gpu/slice_grad_kernel.cu.cc   |  2 +-
 .../phi/kernels/impl/slice_grad_kernel_impl.h | 18 ++++++++---------
 .../phi/kernels/onednn/slice_grad_kernel.cc   | 20 +++++++++----------
 paddle/phi/kernels/slice_grad_kernel.h        | 18 ++++++++---------
 paddle/phi/kernels/xpu/slice_grad_kernel.cc   | 20 +++++++++----------
 python/paddle/nn/functional/common.py         |  2 +-
 12 files changed, 82 insertions(+),
82 deletions(-) diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index d288f0bf18f6a8..cf8fd4de67774f 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -158,6 +158,15 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- backward_op : bilinear_grad + forward : bilinear (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) + args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) + output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) + infer_meta : + func : BilinearGradInferMeta + kernel : + func : bilinear_grad + - backward_op : bilinear_interp_grad forward : bilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) @@ -173,15 +182,6 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor -- backward_op : bilinear_tensor_product_grad - forward : bilinear_tensor_product (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) - args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) - output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) - infer_meta : - func : BilinearTensorProductGradInferMeta - kernel : - func : bilinear_grad - - backward_op : bmm_grad forward : bmm (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index cf5453a3846ee2..655bfe546b6d39 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -265,6 +265,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : bilinear (bilinear_tensor_product) + inputs : + {x : X, y : Y,weight: Weight, bias: Bias} + outputs : + {out : Out} + - op : bilinear_interp (bilinear_interp_v2) backward : bilinear_interp_grad (bilinear_interp_v2_grad) inputs : @@ -274,12 +280,6 @@ extra : attrs : [bool use_mkldnn = false] -- op : bilinear_tensor_product - inputs : - {x : X, y : Y,weight: Weight, bias: Bias} - outputs : - {out : Out} - - op : bitwise_and inputs : {x : X, y : Y} diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 2ad4d563d9f04f..2b2f30f1c63516 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -238,6 +238,16 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor +- op : bilinear + args : (Tensor x, Tensor y, Tensor weight, Tensor bias) + output : Tensor + infer_meta : + func : BilinearInferMeta + kernel : + func : bilinear + optional : bias + backward : bilinear_grad + - op : bilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout="NCHW", int out_d=0, int out_h=0, int out_w=0, float[] scale={}, str interp_method="bilinear", bool align_corners=true, int align_mode=1) output : Tensor(output) @@ -251,16 +261,6 @@ data_transform : skip_transform : out_size, size_tensor, scale_tensor -- op : bilinear_tensor_product - args : (Tensor x, Tensor y, Tensor weight, Tensor bias) - output : Tensor - infer_meta : - func : BilinearInferMeta - kernel : - func : bilinear - 
optional : bias - backward : bilinear_tensor_product_grad - - op : bitwise_and args : (Tensor x, Tensor y) output : Tensor(out) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 427bc51ab0dd8f..bd9fae6bd155b5 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -39,14 +39,14 @@ void AngleGradInferMeta(const MetaTensor& x, UnchangedInferMeta(x, x_grad); } -void BilinearTensorProductGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - const MetaTensor& dout, - MetaTensor* dx, - MetaTensor* dy, - MetaTensor* dweight, - MetaTensor* dbias) { +void BilinearGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias) { auto x_dims = x.dims(); auto y_dims = y.dims(); auto weight_dims = weight.dims(); diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index b95d164e748514..8d3edf2a40a944 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -36,14 +36,14 @@ void AngleGradInferMeta(const MetaTensor& x, const MetaTensor& out_grad, MetaTensor* x_grad); -void BilinearTensorProductGradInferMeta(const MetaTensor& x, - const MetaTensor& y, - const MetaTensor& weight, - const MetaTensor& dout, - MetaTensor* dx, - MetaTensor* dy, - MetaTensor* dweight, - MetaTensor* dbias); +void BilinearGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias); void BmmGradInferMeta(const MetaTensor& x, const MetaTensor& y, diff --git a/paddle/phi/kernels/cpu/slice_grad_kernel.cc b/paddle/phi/kernels/cpu/slice_grad_kernel.cc index 54834cb6c5cbee..0ecb3940fb2754 100644 --- a/paddle/phi/kernels/cpu/slice_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/slice_grad_kernel.cc @@ -21,7 +21,7 @@ PD_REGISTER_KERNEL(slice_grad, CPU, ALL_LAYOUT, - phi::SliceGradRawKernel, + phi::SliceGradKernel, bool, uint8_t, int, diff --git a/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc index b7de6c9d94135b..89a6fad5df02a3 100644 --- a/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/slice_grad_kernel.cu.cc @@ -21,7 +21,7 @@ PD_REGISTER_KERNEL(slice_grad, GPU, ALL_LAYOUT, - phi::SliceGradRawKernel, + phi::SliceGradKernel, bool, uint8_t, int, diff --git a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h index 2fad8d7a59cd83..152a2c7ff21d5a 100644 --- a/paddle/phi/kernels/impl/slice_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/slice_grad_kernel_impl.h @@ -271,15 +271,15 @@ void SliceGradCompute(const Context& ctx, } template -void SliceGradRawKernel(const Context& ctx, - const DenseTensor& input, - const DenseTensor& out_grad, - const std::vector& axes, - const IntArray& starts_arr, - const IntArray& ends_arr, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* input_grad) { +void SliceGradKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts_arr, + const IntArray& ends_arr, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* input_grad) { size_t rank = input.dims().size(); auto& starts = starts_arr.GetData(); diff --git 
a/paddle/phi/kernels/onednn/slice_grad_kernel.cc b/paddle/phi/kernels/onednn/slice_grad_kernel.cc index 45a152fd694134..115f1b22ca52bd 100644 --- a/paddle/phi/kernels/onednn/slice_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/slice_grad_kernel.cc @@ -20,15 +20,15 @@ namespace phi { template -void SliceGradRawKernel(const Context& dev_ctx, - const DenseTensor& input, - const DenseTensor& out_grad, - const std::vector& axes, - const IntArray& starts, - const IntArray& ends, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* input_grad) { +void SliceGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* input_grad) { const auto& onednn_engine = dev_ctx.GetEngine(); auto dx_dims = vectorize(input_grad->dims()); @@ -81,6 +81,6 @@ void SliceGradRawKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(slice_grad, OneDNN, ONEDNN, - phi::SliceGradRawKernel, + phi::SliceGradKernel, float, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/slice_grad_kernel.h b/paddle/phi/kernels/slice_grad_kernel.h index 5c01631a93dbff..3c8b36142ef61a 100644 --- a/paddle/phi/kernels/slice_grad_kernel.h +++ b/paddle/phi/kernels/slice_grad_kernel.h @@ -21,15 +21,15 @@ namespace phi { template -void SliceGradRawKernel(const Context& ctx, - const DenseTensor& input, - const DenseTensor& out_grad, - const std::vector& axes, - const IntArray& starts, - const IntArray& ends, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* input_grad); +void SliceGradKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* input_grad); template void SliceArrayGradKernel(const Context& dev_ctx, diff --git a/paddle/phi/kernels/xpu/slice_grad_kernel.cc b/paddle/phi/kernels/xpu/slice_grad_kernel.cc index 86891776179564..3e054f3d8f3424 100644 --- a/paddle/phi/kernels/xpu/slice_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/slice_grad_kernel.cc @@ -21,15 +21,15 @@ namespace phi { template -void SliceGradRawKernel(const Context& ctx, - const DenseTensor& input, - const DenseTensor& out_grad, - const std::vector& axes, - const IntArray& starts_t, - const IntArray& ends_t, - const std::vector& infer_flags, - const std::vector& decrease_axis, - DenseTensor* input_grad) { +void SliceGradKernel(const Context& ctx, + const DenseTensor& input, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts_t, + const IntArray& ends_t, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* input_grad) { using XPUType = typename XPUTypeTrait::Type; ctx.template Alloc(input_grad); @@ -82,7 +82,7 @@ void SliceGradRawKernel(const Context& ctx, PD_REGISTER_KERNEL(slice_grad, XPU, ALL_LAYOUT, - phi::SliceGradRawKernel, + phi::SliceGradKernel, float, int, phi::dtype::float16) {} diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 86cc69b92f9ee9..9817b9bea26e6d 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -942,7 +942,7 @@ def bilinear(x1, x2, weight, bias=None, name=None): """ if in_dygraph_mode(): - return _C_ops.bilinear_tensor_product(x1, x2, weight, bias) + 
return _C_ops.bilinear(x1, x2, weight, bias)
     else:
         check_variable_and_dtype(x1, 'x1', ['float32', 'float64'], 'bilinear')
         check_variable_and_dtype(x2, 'x2', ['float32', 'float64'], 'bilinear')

From edd578a15c76ed27c5ed6913ba240ecc3ff6374c Mon Sep 17 00:00:00 2001
From: jiangcheng
Date: Thu, 13 Apr 2023 11:33:36 +0800
Subject: [PATCH 115/156] [CINN] optest add cinn check test (#52205)

* [CINN] optest add cinn check test

* replace set self.check_cinn to pass check_cinn by function parameter

* fix ci bug

* add cinn atol/rtol
---
 .../fluid/tests/unittests/CMakeLists.txt   |  2 +
 .../fluid/tests/unittests/eager_op_test.py | 72 +++++++++++++++++--
 .../fluid/tests/unittests/test_clip_op.py  |  5 +-
 .../fluid/tests/unittests/test_scale_op.py |  6 +-
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 909b658c0983ca..e0d89932a29213 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1115,6 +1115,8 @@ set(TEST_CINN_OPS
     test_mean_op
     test_unsqueeze2_op
     test_meshgrid_op
+    test_scale_op
+    test_clip_op
     test_scatter_op
     test_gather_op
     test_layer_norm_op
diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py
index 9b0868edfa7ecb..0bcb34f956b238 100644
--- a/python/paddle/fluid/tests/unittests/eager_op_test.py
+++ b/python/paddle/fluid/tests/unittests/eager_op_test.py
@@ -494,6 +494,28 @@ def enable_cal_ref_output(self):
     def disable_cal_ref_output(self):
         self.is_calc_ref = False
 
+    def _enable_check_cinn_test(self, place, inputs, outputs):
+        # if the test does not run on CUDA, or Paddle is not compiled with
+        # CINN, skip the CINN test
+        if (
+            not core.is_compiled_with_cinn()
+            or not core.is_compiled_with_cuda()
+            or not isinstance(place, fluid.CUDAPlace)
+        ):
+            return False
+        # CINN does not support bfloat16 yet, skip the CINN test
+        if self.is_bfloat16_op():
+            return False
+        # CINN does not support 0-D tensors yet, skip the CINN test
+        for var in inputs.values():
+            if len(var.shape()) == 0:
+                return False
+        for var in outputs.values():
+            if len(var.shape) == 0:
+                return False
+        # CINN does not support dynamic shapes yet, skip the CINN test
+        # TODO(thisjiang): cannot check dynamic-shape ops automatically yet, do it manually for now
+        return True
+
     # set the self.output_dtype.
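For context before the _calc_output changes that follow: when the guard above returns True, the harness simply switches build_cinn_pass on in the build strategy it already uses to compile the test program. A hedged sketch of that wiring outside the harness (program, feed_map and fetch_list are placeholders):

from paddle import fluid

build_strategy = fluid.BuildStrategy()
build_strategy.build_cinn_pass = True  # what check_cinn=True ultimately toggles
compiled = fluid.CompiledProgram(program, build_strategy=build_strategy)
exe = fluid.Executor(fluid.CUDAPlace(0))
outs = exe.run(compiled, feed=feed_map, fetch_list=fetch_list)

The optional cinn_atol/cinn_rtol attributes mentioned in the commit notes let an individual test loosen tolerances only when the CINN pass is actually in effect.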
def infer_dtype_from_inputs_outputs(self, inputs, outputs): def is_np_data(input): @@ -1044,6 +1066,7 @@ def _calc_output( loss=None, enable_inplace=None, for_inplace_test=None, + check_cinn=False, ): with paddle.fluid.framework._static_guard(): program = Program() @@ -1087,9 +1110,21 @@ def _calc_output( for out_name, out_dup in Operator.get_op_outputs(self.op_type): fetch_list.append(str(out_name)) - if enable_inplace is not None: + enable_cinn_test = check_cinn and self._enable_check_cinn_test( + place, feed_map, outputs + ) + if enable_cinn_test: + if hasattr(self, 'cinn_atol'): + self.atol = self.cinn_atol + if hasattr(self, 'cinn_rtol'): + self.rtol = self.cinn_rtol + + if (enable_inplace is not None) or enable_cinn_test: build_strategy = fluid.BuildStrategy() - build_strategy.enable_inplace = enable_inplace + if enable_inplace is not None: + build_strategy.enable_inplace = enable_inplace + if enable_cinn_test: + build_strategy.build_cinn_pass = check_cinn compiled_prog = fluid.CompiledProgram( program, build_strategy=build_strategy @@ -1518,6 +1553,7 @@ def check_output_with_place( check_dygraph=True, check_prim=False, inplace_atol=None, + check_cinn=False, ): core._set_prim_all_enabled(False) core.set_prim_eager_enabled(False) @@ -1626,7 +1662,7 @@ def _compare_numpy(self, name, actual_np, expect_np): np.testing.assert_allclose( actual_np, expect_np, - atol=atol, + atol=self.atol if hasattr(self, 'atol') else atol, rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, err_msg=( @@ -1645,7 +1681,7 @@ def _compare_numpy(self, name, actual_np, expect_np): np.allclose( actual_np, expect_np, - atol=atol, + atol=self.atol if hasattr(self, 'atol') else atol, rtol=self.rtol if hasattr(self, 'rtol') else rtol, equal_nan=equal_nan, ), @@ -1721,7 +1757,7 @@ def init(self): def calculate_output(self): outs, fetch_list = self.op_test._calc_output( - place, no_check_set=no_check_set + place, no_check_set=no_check_set, check_cinn=check_cinn ) self.outputs = outs self.fetch_list = fetch_list @@ -2077,6 +2113,7 @@ def check_output( check_dygraph=True, check_prim=False, inplace_atol=None, + check_cinn=False, ): self.__class__.op_type = self.op_type @@ -2100,6 +2137,7 @@ def check_output( check_dygraph=check_dygraph, check_prim=check_prim, inplace_atol=inplace_atol, + check_cinn=check_cinn, ) if check_dygraph: outs, dygraph_dygraph_outs, fetch_list = res @@ -2257,6 +2295,7 @@ def check_grad( check_prim=False, only_check_prim=False, atol=1e-5, + check_cinn=False, ): if hasattr(self, "use_custom_device") and self.use_custom_device: check_dygraph = False @@ -2278,6 +2317,7 @@ def check_grad( check_prim=check_prim, only_check_prim=only_check_prim, atol=atol, + check_cinn=check_cinn, ) def check_grad_with_place( @@ -2296,6 +2336,7 @@ def check_grad_with_place( only_check_prim=False, numeric_place=None, atol=1e-5, + check_cinn=False, ): if hasattr(self, "use_custom_device") and self.use_custom_device: check_dygraph = False @@ -2427,6 +2468,7 @@ def check_grad_with_place( output_names, no_grad_set, user_defined_grad_outputs, + check_cinn=check_cinn, ) # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 @@ -2655,6 +2697,7 @@ def _get_gradient( no_grad_set, user_defined_grad_outputs=None, parallel=False, + check_cinn=False, ): with paddle.fluid.framework._static_guard(): prog = Program() @@ -2721,11 +2764,26 @@ def _get_gradient( ) fetch_list = grad_inputs - if parallel: + enable_cinn_test = check_cinn and self._enable_check_cinn_test( + place, 
feed_dict, outputs + ) + if enable_cinn_test: + if hasattr(self, 'cinn_atol'): + self.atol = self.cinn_atol + if hasattr(self, 'cinn_rtol'): + self.rtol = self.cinn_rtol + + if parallel or enable_cinn_test: use_cuda = False if isinstance(place, fluid.CUDAPlace): use_cuda = True - compiled_prog = fluid.CompiledProgram(prog) + + build_strategy = None + if enable_cinn_test: + build_strategy = fluid.BuildStrategy() + build_strategy.build_cinn_pass = check_cinn + + compiled_prog = fluid.CompiledProgram(prog, build_strategy) prog = compiled_prog executor = fluid.Executor(place) res = list( diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index eca97092a6419d..b807d01ada068b 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -49,10 +49,13 @@ def setUp(self): input[np.abs(input - max_v) < self.max_relative_error] = 0.5 self.inputs['X'] = input self.outputs = {'Out': np.clip(self.inputs['X'], min_v, max_v)} + self.check_cinn = ('Min' not in self.inputs) and ( + 'Max' not in self.inputs + ) def test_check_output(self): paddle.enable_static() - self.check_output() + self.check_output(check_cinn=self.check_cinn) paddle.disable_static() def test_check_grad_normal(self): diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index c28ba1d1725d50..137a31c89fb41e 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -42,7 +42,7 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -66,7 +66,7 @@ def init_dtype_type(self): pass def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): self.check_grad(['X'], 'Out') @@ -148,7 +148,7 @@ def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): self.check_grad(["X"], "Out") From 5a8642700423c38f0dfb76d310b8cdec9d68f88e Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Thu, 13 Apr 2023 11:38:19 +0800 Subject: [PATCH 116/156] add uint16 for bfloat16 dtype check in layer_norm under static mode (#52845) --- python/paddle/nn/functional/norm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 95e1ca2504cd97..8078cf237ce037 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -337,7 +337,7 @@ def layer_norm( else: check_variable_and_dtype( - x, 'input', ['float16', 'float32', 'float64'], 'LayerNorm' + x, 'input', ['uint16', 'float16', 'float32', 'float64'], 'LayerNorm' ) inputs = {} From b9ccf0e6e97b3b274b8b7561c0966c6428ec1214 Mon Sep 17 00:00:00 2001 From: liuruyan <44316842+liuruyan@users.noreply.github.com> Date: Thu, 13 Apr 2023 12:19:35 +0800 Subject: [PATCH 117/156] adjust unittest. 
(#52847) --- test/amp/test_layer_convert_dtype.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/amp/test_layer_convert_dtype.py b/test/amp/test_layer_convert_dtype.py index 86ee5c5fc25035..31332e1d47f0a9 100644 --- a/test/amp/test_layer_convert_dtype.py +++ b/test/amp/test_layer_convert_dtype.py @@ -127,7 +127,7 @@ def test_cpu(self): res = paddle.amp.is_float16_supported('cpu') self.assertEqual(res, False) res = paddle.amp.is_bfloat16_supported('cpu') - self.assertEqual(res, True) + self.assertEqual(res, core.supports_bfloat16()) def test_gpu_fp16_supported(self): res = paddle.amp.is_float16_supported() From e7652a37e3201450476dc834201105051ce66939 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 13 Apr 2023 12:52:53 +0800 Subject: [PATCH 118/156] Support print stack when place=CUDAPlace (#52841) --- .../framework/details/nan_inf_utils_detail.cu | 42 +++++++++++++++++-- .../framework/details/nan_inf_utils_detail.h | 28 ++++++------- 2 files changed, 51 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 49e07944c83b67..dd99adfecfcd94 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" DECLARE_int32(check_nan_inf_level); @@ -294,7 +295,8 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, const char* debug_info, int64_t numel, int64_t numel_max_min, - int check_nan_inf_level) { + int check_nan_inf_level, + int64_t* nan_inf_zero) { if (blockIdx.x == 0 && threadIdx.x == 0) { int64_t num_nan = 0; int64_t num_inf = 0; @@ -325,8 +327,12 @@ __global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, min_value = tmp_min_value < min_value ? 
tmp_min_value : min_value; mean_value += tmp_mean_value; } + if (check_nan_inf_level == 0) { + nan_inf_zero[0] = num_nan; + nan_inf_zero[1] = num_inf; + nan_inf_zero[2] = num_zero; + } } - PrintForDifferentLevel(debug_info, numel, num_nan, @@ -493,6 +499,10 @@ void TensorCheckerVisitor::apply( tensor_block_mean_ptr); int check_nan_inf_level = FLAGS_check_nan_inf_level; + phi::DenseTensor nan_inf_zero_tensor; + nan_inf_zero_tensor.Resize({static_cast(3)}); + int64_t* nan_inf_zero = + dev_ctx->template Alloc(&nan_inf_zero_tensor); FindGlobalMaxMinAndPrint <<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr, block_num_inf_ptr, @@ -503,7 +513,33 @@ void TensorCheckerVisitor::apply( gpu_str_ptr, tensor.numel(), numel_max_min, - check_nan_inf_level); + check_nan_inf_level, + nan_inf_zero_tensor.data()); + + if (check_nan_inf_level == 0) { + auto nan_cpu = + phi::memory_utils::Alloc(phi::CPUPlace(), sizeof(int64_t) * 3); + int64_t* nan_cpu_ptr = reinterpret_cast(nan_cpu->ptr()); + phi::memory_utils::Copy(phi::CPUPlace(), + nan_cpu_ptr, + place, + nan_inf_zero, + 3 * sizeof(int64_t), + dev_ctx->stream()); + + dev_ctx->Wait(); + if (nan_cpu_ptr[0] > 0 || nan_cpu_ptr[1] > 0) { + const std::string debug_info = + GetHintString(op_type, var_name, place, dev_id); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " + "%s.", + static_cast(nan_cpu_ptr[0]), // NOLINT + static_cast(nan_cpu_ptr[1]), // NOLINT + static_cast(nan_cpu_ptr[2]), // NOLINT + debug_info)); + } + } #endif } diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index e83fc6c2dd18e5..8f5eb5352ac7bc 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -87,15 +87,7 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, static_cast(min_value), static_cast(mean_value)); if (check_nan_inf_level == 0) { -#if defined(__NVCC__) || defined(__HIPCC__) - PADDLE_ENFORCE(false, - "There are NAN or INF (num_nan=%ld, num_inf=%lld, " - "num_zero=%lld) in %s.", - static_cast(num_nan), // NOLINT - static_cast(num_inf), // NOLINT - static_cast(num_zero), // NOLINT - debug_info); -#else +#if !(defined(__NVCC__) || defined(__HIPCC__)) PADDLE_THROW(platform::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", @@ -106,12 +98,15 @@ HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, #endif } } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { - printf("[PRECISION] in %s, numel=%lld, max=%e, min=%e, mean=%e\n", - debug_info, - static_cast(numel), // NOLINT - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); + printf( + "[PRECISION] in %s, numel=%lld, num_zero=%lld, max=%e, min=%e, " + "mean=%e\n", + debug_info, + static_cast(numel), // NOLINT + static_cast(num_zero), // NOLINT + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); } } @@ -152,7 +147,8 @@ void PrintForDifferentLevelFile(const char* debug_info, << ", mean=" << static_cast(mean_value) << std::endl; } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { outfile << "[PRECISION] in " << debug_info - << ", numel=" << static_cast(numel) // NOLINT + << ", numel=" << static_cast(numel) // NOLINT + << ", num_zero=" << static_cast(num_zero) // NOLINT << ", max=" << static_cast(max_value) << ", min=" << static_cast(min_value) << ", mean=" 
<< static_cast(mean_value) << std::endl; From 1acb845a8cd7d9c9dec66cabe38896df65da0748 Mon Sep 17 00:00:00 2001 From: wentao yu Date: Thu, 13 Apr 2023 13:16:45 +0800 Subject: [PATCH 119/156] fix distributed comm context (#52787) --- .../interpreter/stream_analyzer.cc | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 416534b64a6b50..6401248a2ff542 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -148,8 +148,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( const int stream_priority = op_func_node.stream_priority_; ContextManager& ctx_manager = ContextManager::Instance(); - auto dev_ctx = ctx_manager.Get(op_type, place_, stream_priority).get().get(); - SetDeviceCommContext(op.get(), dev_ctx); + DeviceContext* dev_ctx = nullptr; // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. @@ -158,22 +157,30 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( VLOG(6) << "Parse DeviceContext for " << op_type << ", execution stream = " << execution_stream; if (execution_stream != kDefaultStream) { - return ctx_manager - .Get(std::string(kCustomStream) + "-" + execution_stream, - place_, - stream_priority) - .get() - .get(); + dev_ctx = ctx_manager + .Get(std::string(kCustomStream) + "-" + execution_stream, + place_, + stream_priority) + .get() + .get(); + SetDeviceCommContext(op.get(), dev_ctx); + return dev_ctx; } if (op_type == interpreter::kMemcpyD2H) { - return ctx_manager.Get(std::string(kD2HStream), place_, stream_priority) - .get() - .get(); + dev_ctx = + ctx_manager.Get(std::string(kD2HStream), place_, stream_priority) + .get() + .get(); + SetDeviceCommContext(op.get(), dev_ctx); + return dev_ctx; } else if (op_type == interpreter::kMemcpyH2D) { - return ctx_manager.Get(std::string(kH2DStream), place_, stream_priority) - .get() - .get(); + dev_ctx = + ctx_manager.Get(std::string(kH2DStream), place_, stream_priority) + .get() + .get(); + SetDeviceCommContext(op.get(), dev_ctx); + return dev_ctx; } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -195,6 +202,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( #endif } + SetDeviceCommContext(op.get(), op_func_node.dev_ctx_); return op_func_node.dev_ctx_; } From 0b98d1aa3d63bf2007e288f643caf7891d9d6065 Mon Sep 17 00:00:00 2001 From: umiswing Date: Thu, 13 Apr 2023 13:48:18 +0800 Subject: [PATCH 120/156] [cutlass] Sparse conv3d backward fusion (#52361) --- paddle/phi/kernels/autotune/auto_tune_base.h | 47 +++- paddle/phi/kernels/autotune/cache.h | 53 +++-- .../kernels/sparse/gpu/conv_grad_kernel.cu | 193 ++++++++++++----- paddle/phi/kernels/sparse/gpu/conv_kernel.cu | 29 +-- .../sparse/gpu/cutlass_generator/common.h | 204 +++++++++++++----- .../gather_gemm_scatter_generator.py | 53 ++++- .../gather_gemm_scatter_manifest.py | 38 ++-- .../gather_gemm_scatter_operation.py | 16 +- .../kernels/sparse/gpu/gather_gemm_scatter.h | 100 +++++---- 9 files changed, 507 insertions(+), 226 deletions(-) diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index 606ecc3c59ca72..fa96ed67a29fd5 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -177,18 +177,34 @@ class MatmulAutoTuner } }; -template +template 
class GatherGemmScatterAutoTuner : public AutoTuneBase> { public: - static GatherGemmScatterAutoTuner* Instance( - ReturnType (*func)(T, T, Args...)) { + static GatherGemmScatterAutoTuner* + Instance(ReturnType (*func)(T, T, Args...)) { static std::once_flag gather_gemm_scatter_init_flag; - static std::unique_ptr> + static std::unique_ptr> instance; std::call_once(gather_gemm_scatter_init_flag, [&] { auto obj = MakeCallback(func); - instance.reset(new GatherGemmScatterAutoTuner); + instance.reset(new GatherGemmScatterAutoTuner); instance->AddCallBack(func); }); return instance.get(); @@ -201,7 +217,8 @@ class GatherGemmScatterAutoTuner Args... args) { this->is_init_ = true; this->CheckKernelSize(); - auto& cache = AutoTuneCache::Instance().GetGatherGemmScatter(); + auto& cache = AutoTuneCache::Instance() + .GetGatherGemmScatter(); if (cache.Find(key)) { auto best_idx = cache.Get(key); @@ -250,10 +267,22 @@ class GatherGemmScatterAutoTuner return best_idx; } }; -template -static GatherGemmScatterAutoTuner* +template +static GatherGemmScatterAutoTuner* MakeGatherGemmScatterTuner(ReturnType (*func)(T, T, Args...)) { - return GatherGemmScatterAutoTuner::Instance(func); + return GatherGemmScatterAutoTuner::Instance(func); } // Define the auto_tuner inital object. diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index c5122f0260cb2a..188faaed71be3a 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -47,13 +47,15 @@ enum class AlgorithmType { kMatmul = 5, kGatherGemmScatterFP16NN = 6, kGatherGemmScatterFP32NN = 7, + kGatherGemmScatterFP32TN = 8, + kGatherGemmScatterFP32NT = 9, #if !defined(PADDLE_WITH_CUDNN_FRONTEND) - kAlgorithmCount = 8 + kAlgorithmCount = 10 #else - kConvForwardV8 = 8, - kConvBackwardDataV8 = 9, - kConvBackwardFilterV8 = 10, - kAlgorithmCount = 11 + kConvForwardV8 = 10, + kConvBackwardDataV8 = 11, + kConvBackwardFilterV8 = 12, + kAlgorithmCount = 13 #endif }; @@ -73,6 +75,17 @@ using CudnnV8AlgorithmsTypeMap = std::unordered_map; #endif +#define DEFINE_GET_GATHER_GEMM_SCATTER( \ + dtype, transpose_a, transpose_b, algo_type) \ + template \ + typename std::enable_if::value && \ + TransposeA == transpose_a && \ + TransposeB == transpose_b, \ + AlgorithmsCacheMap&>::type \ + GetGatherGemmScatter() { \ + return Get(algo_type); \ + } + class AutoTuneCache { public: static AutoTuneCache& Instance() { @@ -89,20 +102,22 @@ class AutoTuneCache { ConvAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) { return conv_auto_tune_map_[static_cast(algo_type)]; } - - template - typename std::enable_if::value, - AlgorithmsCacheMap&>::type - GetGatherGemmScatter() { - return Get(AlgorithmType::kGatherGemmScatterFP32NN); - } - - template - typename std::enable_if::value, - AlgorithmsCacheMap&>::type - GetGatherGemmScatter() { - return Get(AlgorithmType::kGatherGemmScatterFP16NN); - } + DEFINE_GET_GATHER_GEMM_SCATTER(phi::dtype::float16, + false, + false, + AlgorithmType::kGatherGemmScatterFP16NN); + DEFINE_GET_GATHER_GEMM_SCATTER(float, + false, + false, + AlgorithmType::kGatherGemmScatterFP32NN); + DEFINE_GET_GATHER_GEMM_SCATTER(float, + true, + false, + AlgorithmType::kGatherGemmScatterFP32TN); + DEFINE_GET_GATHER_GEMM_SCATTER(float, + false, + true, + AlgorithmType::kGatherGemmScatterFP32NT); #ifdef PADDLE_WITH_CUDNN_FRONTEND CudnnFrontendPlanCache& GetConvV8(const AlgorithmType& algo_type) { diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu 
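Before the conv_grad_kernel.cu hunk below, it helps to see what the cache changes above buy: each (dtype, transpose_a, transpose_b) combination the sparse conv paths need gets its own algorithm cache, resolved at compile time through the DEFINE_GET_GATHER_GEMM_SCATTER specializations. A rough Python model of that lookup (the dict stands in for the real per-type caches; keys follow the new AlgorithmType entries):

# Slots mirror AlgorithmType::kGatherGemmScatter{FP16NN, FP32NN, FP32TN, FP32NT}.
caches = {
    ("fp16", False, False): {},  # forward: x @ kernel
    ("fp32", False, False): {},  # forward, fp32
    ("fp32", True, False): {},   # backward weight: transpose(x) @ dout
    ("fp32", False, True): {},   # backward data: dout @ transpose(kernel)
}

def lookup(dtype, transpose_a, transpose_b, key):
    # `key` is GenKey(M / features_num_range, N, K), as in the kernels below.
    return caches[(dtype, transpose_a, transpose_b)].get(key)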
index adfdb09968cbb6..c29dd6ee86e810 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -24,9 +24,13 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/sparse/gpu/conv.cu.h" +#ifdef PADDLE_WITH_CUTLASS +#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h" +#endif namespace phi { namespace sparse { +extern size_t workspace_size; // rulebook[3, rulebook_len]: //[ @@ -130,34 +134,52 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, phi::backends::gpu::GpuMemsetAsync( out_index_ptr, 0, sizeof(int) * x.nnz() * 2, dev_ctx.stream()); - GroupIndexsV2<<>>(rulebook_len, - x.nnz(), - kernel_size, - offsets[kernel_size / 2], - rulebook_ptr, - out_index_ptr, - unique_value_ptr); +#ifdef PADDLE_WITH_CUTLASS + bool cutlass = true; + if (dev_ctx.GetComputeCapability() < 80) cutlass = false; - GatherV2(dev_ctx, - x.values().data(), - out_index_ptr, - unique_value_ptr, - x.nnz(), - kernel_size, - in_channels, - 2, - in_features_ptr); + if (in_channels % 4 != 0 || out_channels % 4 != 0) cutlass = false; - Gather(dev_ctx, - out_grad.values().data(), - rulebook_ptr + rulebook_len, - rulebook_len, - out_channels, - out_grad_features_ptr); + if (std::is_same::value || + std::is_same::value) + cutlass = false; + if (!std::is_same::value) cutlass = false; + + if (!cutlass) { +#endif + + GroupIndexsV2<<>>(rulebook_len, + x.nnz(), + kernel_size, + offsets[kernel_size / 2], + rulebook_ptr, + out_index_ptr, + unique_value_ptr); + + GatherV2(dev_ctx, + x.values().data(), + out_index_ptr, + unique_value_ptr, + x.nnz(), + kernel_size, + in_channels, + 2, + in_features_ptr); + + Gather(dev_ctx, + out_grad.values().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + out_channels, + out_grad_features_ptr); + +#ifdef PADDLE_WITH_CUTLASS + } +#endif const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { if (counter_ptr[i] <= 0 || (subm && i == half_kernel_size)) { @@ -173,43 +195,98 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * in_channels; T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; - // call gemm: d_kernel = transpose(x) * out_grad - // (in_channels, n) * (n, out_channels) - blas.GEMM(CblasTrans, - CblasNoTrans, - K, - N, - M, - static_cast(1), - tmp_in_ptr, - tmp_out_grad_ptr, - static_cast(0), - tmp_d_kernel_ptr); +#ifdef PADDLE_WITH_CUTLASS + if (cutlass) { + const IntT* gather_x_indices = rulebook_ptr + offsets[i]; + const IntT* scatter_x_indices = rulebook_ptr + offsets[i]; + const IntT* gather_out_indices = rulebook_ptr + rulebook_len + offsets[i]; + const size_t key = autotune::GenKey(M / features_num_range, N, K); + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + static cutlass::device_memory::allocation workspace( + workspace_size); + GatherGemmScatterDriver( + dev_ctx, + key, + x.values().data(), + out_grad.values().data(), + tmp_d_kernel_ptr, + tmp_d_kernel_ptr, + in_channels, + out_channels, + counter_ptr[i], + gather_x_indices, + gather_out_indices, + static_cast(nullptr), + static_cast(1.0), + static_cast(0.0), + &workspace); + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + GatherGemmScatterDriver( + dev_ctx, + key, + out_grad.values().data(), + tmp_kernel_ptr, + x_grad_values_ptr, + 
x_grad_values_ptr, + counter_ptr[i], + in_channels, + out_channels, + gather_out_indices, + static_cast(nullptr), + scatter_x_indices, + static_cast(1.0), + static_cast(1.0), + nullptr); + } else { +#endif + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + K, + N, + M, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), + tmp_d_kernel_ptr); - // call gemm: d_x = out_grad * transpose(kernel) - // (n, out_channels) * (out_channels, in_channels) - blas.GEMM(CblasNoTrans, - CblasTrans, - M, - K, - N, - static_cast(1), - tmp_out_grad_ptr, - tmp_kernel_ptr, - static_cast(0), - tmp_d_x_ptr); + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); +#ifdef PADDLE_WITH_CUTLASS + } +#endif } // 4. scatter - phi::funcs::sparse::ScatterV2(dev_ctx, - d_x_features_ptr, - out_index.data(), - unique_value.data(), - x_grad->nnz(), - kernel_size, - in_channels, - 2, - x_grad_values_ptr); +#ifdef PADDLE_WITH_CUTLASS + if (!cutlass) { +#endif + phi::funcs::sparse::ScatterV2(dev_ctx, + d_x_features_ptr, + out_index.data(), + unique_value.data(), + x_grad->nnz(), + kernel_size, + in_channels, + 2, + x_grad_values_ptr); +#ifdef PADDLE_WITH_CUTLASS + } +#endif } template diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index aa3ae43397ab5f..43e2b8c01cff64 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -154,18 +154,23 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i]; const IntT* scatter_indices = rulebook_ptr + rulebook_len + h_offsets_ptr[i]; - GatherGemmScatterDriver(dev_ctx, - x.non_zero_elements().data(), - tmp_kernel_ptr, - out_values_ptr, - out_values_ptr, - M, - N, - K, - gather_indices, - scatter_indices, - static_cast(1.0), - static_cast(1.0)); + const size_t key = autotune::GenKey(M / features_num_range, N, K); + GatherGemmScatterDriver( + dev_ctx, + key, + x.non_zero_elements().data(), + tmp_kernel_ptr, + out_values_ptr, + out_values_ptr, + M, + N, + K, + gather_indices, + static_cast(nullptr), + scatter_indices, + static_cast(1.0), + static_cast(1.0), + nullptr); } } else { #endif diff --git a/paddle/phi/kernels/sparse/gpu/cutlass_generator/common.h b/paddle/phi/kernels/sparse/gpu/cutlass_generator/common.h index 9732cb89ae899a..5a94344e8f9110 100644 --- a/paddle/phi/kernels/sparse/gpu/cutlass_generator/common.h +++ b/paddle/phi/kernels/sparse/gpu/cutlass_generator/common.h @@ -16,28 +16,41 @@ #ifdef PADDLE_WITH_CUTLASS #include "cutlass/arch/mma.h" +#include "cutlass/device_kernel.h" #include "cutlass/epilogue/thread/linear_combination.h" #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" #include "cutlass/half.h" +#include "cutlass/reduction/device/reduce_split_k.h" +#include "cutlass/reduction/thread/reduction_operators.h" +#include "cutlass/tensor_ref.h" #include "cutlass/util/device_memory.h" #include "examples/common/helper.h" #include "paddle/phi/backends/gpu/gpu_context.h" namespace phi { namespace sparse { -#define TYPEDEF_KERNEL_POINTER(kernel, dtype) \ - typedef void (*kernel)(dtype const alpha, \ - dtype const beta, \ - const 
GPUContext& dev_ctx, \ - const dtype* const a, \ - const dtype* const b, \ - const dtype* const c, \ - dtype* const d, \ - const int m, \ - const int n, \ - const int k, \ - const int32_t* a_indices, \ - const int32_t* c_d_indices); +size_t constexpr max_splitk_slices = 256; +size_t constexpr max_in_channels = 256; +size_t constexpr max_out_channels = 256; +static size_t workspace_size = + sizeof(float) * max_splitk_slices * max_in_channels * max_out_channels; + +#define TYPEDEF_KERNEL_POINTER(kernel, dtype) \ + typedef void (*kernel)(dtype const alpha, \ + dtype const beta, \ + const GPUContext& dev_ctx, \ + const dtype* const a, \ + const dtype* const b, \ + const dtype* const c, \ + dtype* const d, \ + const int m, \ + const int n, \ + const int k, \ + const int32_t* a_indices, \ + const int32_t* b_indices, \ + const int32_t* c_d_indices, \ + void* const workspace_ptr); #define GATHER_GEMM_SCATTER_CHECK(status) \ { \ cutlass::Status error = status; \ @@ -45,51 +58,126 @@ namespace sparse { throw std::runtime_error(cutlassGetStatusString(error)); \ } \ } -#define DEFINE_LAUNCH_KERNEL(dtype, cutlass_type) \ - template \ - void launchKernel(dtype const alpha, \ - dtype const beta, \ - const GPUContext& dev_ctx, \ - const dtype* const a, \ - const dtype* const b, \ - const dtype* const c, \ - dtype* const d, \ - const int m, \ - const int n, \ - const int k, \ - const int32_t* a_indices, \ - const int32_t* c_d_indices) { \ - cutlass::gemm::GemmCoord problem_size_real({m, n, k}); \ - int split_k_slices = 1; \ - typename Gemm::Arguments arguments{ \ - cutlass::gemm::GemmUniversalMode::kGemm, \ - problem_size_real, \ - split_k_slices, \ - {static_cast(static_cast(alpha)), \ - static_cast(static_cast(beta))}, \ - reinterpret_cast(a), \ - reinterpret_cast(b), \ - reinterpret_cast(c), \ - reinterpret_cast(d), \ - cutlass::layout::RowMajor().capacity(problem_size_real.mk()), \ - cutlass::layout::RowMajor().capacity(problem_size_real.kn()), \ - cutlass::layout::RowMajor().capacity(problem_size_real.mn()), \ - cutlass::layout::RowMajor().capacity(problem_size_real.mn()), \ - problem_size_real.k(), \ - problem_size_real.n(), \ - problem_size_real.n(), \ - problem_size_real.n(), \ - a_indices, \ - nullptr, \ - c_d_indices}; \ - size_t workspace_size = Gemm::get_workspace_size(arguments); \ - cutlass::device_memory::allocation workspace(workspace_size); \ - Gemm gemm_op; \ - cutlass::Status status = gemm_op.can_implement(arguments); \ - GATHER_GEMM_SCATTER_CHECK(status); \ - status = gemm_op.initialize(arguments, workspace.get()); \ - GATHER_GEMM_SCATTER_CHECK(status); \ - gemm_op(dev_ctx.stream()); \ +#define DEFINE_LAUNCH_KERNEL(dtype, cutlass_type) \ + template \ + void launchKernel(dtype const alpha, \ + dtype const beta, \ + const GPUContext& dev_ctx, \ + const dtype* const a, \ + const dtype* const b, \ + const dtype* const c, \ + dtype* const d, \ + const int m, \ + const int n, \ + const int k, \ + const int32_t* a_indices, \ + const int32_t* b_indices, \ + const int32_t* c_d_indices, \ + void* const workspace_ptr) { \ + cutlass::gemm::GemmCoord problem_size_real({m, n, k}); \ + using Gemm = typename Config::Gemm; \ + int split_k_slices = std::max(std::min(64, k / 128), 1); \ + typename Gemm::Arguments arguments{ \ + Config::Mode, \ + problem_size_real, \ + split_k_slices, \ + {static_cast(static_cast(alpha)), \ + static_cast(static_cast(beta))}, \ + reinterpret_cast(a), \ + reinterpret_cast(b), \ + reinterpret_cast(c), \ + reinterpret_cast(d), \ + m * k, \ + k * n, \ + m * n, \ + m * n, \ + 
std::is_same::value \ + ? problem_size_real.k() \ + : problem_size_real.m(), \ + std::is_same::value \ + ? problem_size_real.n() \ + : problem_size_real.k(), \ + std::is_same::value \ + ? problem_size_real.n() \ + : problem_size_real.m(), \ + std::is_same::value \ + ? problem_size_real.n() \ + : problem_size_real.m(), \ + a_indices, \ + b_indices, \ + c_d_indices}; \ + cutlass::device_memory::allocation* const real_workspace_ptr = \ + static_cast* const>( \ + workspace_ptr); \ + if (Config::Mode == \ + cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) { \ + size_t current_workspace_size = Gemm::get_workspace_size(arguments); \ + if (current_workspace_size > workspace_size) { \ + workspace_size = current_workspace_size; \ + real_workspace_ptr->reset(workspace_size); \ + } \ + \ + arguments.ptr_D = real_workspace_ptr->get(); \ + } \ + Gemm gemm_op; \ + cutlass::Status status = gemm_op.can_implement(arguments); \ + GATHER_GEMM_SCATTER_CHECK(status); \ + if (Config::Mode == \ + cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) { \ + status = gemm_op.initialize(arguments, real_workspace_ptr->get()); \ + } else { \ + cutlass::device_memory::allocation empty_workspace(0); \ + status = gemm_op.initialize(arguments, empty_workspace.get()); \ + } \ + GATHER_GEMM_SCATTER_CHECK(status); \ + gemm_op(dev_ctx.stream()); \ + if (Config::Mode == \ + cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel) { \ + using ReductionOp = cutlass::reduction::thread::ReduceAdd< \ + typename Gemm::ElementAccumulator, \ + typename Gemm::EpilogueOutputOp::ElementAccumulator, \ + Gemm::EpilogueOutputOp::kCount>; \ + \ + using ReductionKernel = cutlass::reduction::kernel::ReduceSplitK< \ + cutlass::MatrixShape<4, 32 * Gemm::EpilogueOutputOp::kCount>, \ + typename Gemm::EpilogueOutputOp, \ + ReductionOp>; \ + using ReductionDevice = \ + typename cutlass::reduction::device::ReduceSplitK; \ + ReductionDevice reduction_op; \ + int splitk_gemm_stride = n; \ + cutlass::layout::RowMajor splitk_gemm_layout(splitk_gemm_stride); \ + void* workspace_gemm_ptr = real_workspace_ptr->get(); \ + cutlass::TensorRef \ + ref_workspace(reinterpret_cast( \ + workspace_gemm_ptr), \ + splitk_gemm_layout); \ + cutlass::TensorRef \ + ref_c(reinterpret_cast(d), \ + splitk_gemm_layout); \ + cutlass::TensorRef \ + ref_d(reinterpret_cast(d), \ + splitk_gemm_layout); \ + typename ReductionDevice::Arguments reduction_args( \ + problem_size_real.mn(), \ + split_k_slices, \ + static_cast(problem_size_real.m() * problem_size_real.n()), \ + ref_workspace, \ + ref_d, \ + ref_c, \ + {static_cast(static_cast(alpha)), \ + static_cast(static_cast(beta))}); \ + status = reduction_op.initialize(reduction_args); \ + GATHER_GEMM_SCATTER_CHECK(status); \ + reduction_op(dev_ctx.stream()); \ + } \ } TYPEDEF_KERNEL_POINTER(fp16_gather_gemm_scatter, phi::dtype::float16) diff --git a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_generator.py b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_generator.py index 21b59f067a6090..a2d837347b4b06 100644 --- a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_generator.py +++ b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_generator.py @@ -97,7 +97,7 @@ def CreateGatherGemmScatterOperator( return operations -def GenerateSM80_TensorOp_16816(manifest, cuda_version): +def GenerateSM80_TensorOp_16816(manifest, cuda_version, debug=False): if not CudaToolkitVersionSatisfies(cuda_version, 11, 0): return @@ -191,6 +191,12 @@ def 
GenerateSM80_TensorOp_16816(manifest, cuda_version): [64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc ), ] + if debug: + tile_descriptions = [ + TileDescription( + [256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc + ), + ] data_type = [ math_inst.element_a, @@ -218,13 +224,15 @@ def GenerateSM80_TensorOp_16816(manifest, cuda_version): ) -def GenerateSM80_TensorOp_1688(manifest, cuda_version): +def GenerateSM80_TensorOp_1688(manifest, cuda_version, debug=False): if not CudaToolkitVersionSatisfies(cuda_version, 11, 0): return layouts = [ (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor), ] math_instructions = [ @@ -302,6 +310,13 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version): ), ] + if debug: + tile_descriptions = [ + TileDescription( + [256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc + ), + ] + data_type = [ math_inst.element_a, math_inst.element_b, @@ -325,13 +340,15 @@ def GenerateSM80_TensorOp_1688(manifest, cuda_version): ) -def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version): +def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version, debug=False): if not CudaToolkitVersionSatisfies(cuda_version, 11, 0): return layouts = [ (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor), ] math_instructions = [ @@ -409,6 +426,13 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version): ), ] + if debug: + tile_descriptions = [ + TileDescription( + [256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc + ), + ] + data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32] CreateGatherGemmScatterOperator( @@ -416,13 +440,17 @@ def GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version): ) -def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version): +def GenerateSM80_TensorOp_1688_fast_fp32_math( + manifest, cuda_version, debug=False +): if not CudaToolkitVersionSatisfies(cuda_version, 11, 0): return layouts = [ (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.RowMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.RowMajor), ] math_instructions = [ @@ -482,6 +510,13 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version): ), ] + if debug: + tile_descriptions = [ + TileDescription( + [128, 128, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc + ), + ] + data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32] CreateGatherGemmScatterOperator( @@ -489,11 +524,11 @@ def GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version): ) -def GenerateSM80(manifest, cuda_version): - GenerateSM80_TensorOp_16816(manifest, cuda_version) - GenerateSM80_TensorOp_1688(manifest, cuda_version) - GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version) - GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version) +def GenerateSM80(manifest, cuda_version, debug=False): + GenerateSM80_TensorOp_16816(manifest, cuda_version, debug) + GenerateSM80_TensorOp_1688(manifest, cuda_version, debug) + GenerateSM80_TensorOp_1688_fast_math(manifest, cuda_version, debug) + GenerateSM80_TensorOp_1688_fast_fp32_math(manifest, cuda_version, debug) class KernelCfg: diff --git 
a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_manifest.py b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_manifest.py index a9ac554ede6e42..280cd082d2f4db 100644 --- a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_manifest.py +++ b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_manifest.py @@ -41,12 +41,12 @@ def __init__(self, generated_path, kind, args): } // namespace phi #endif """ - self.fp16_kernels_list = ( - "static std::vector fp16_kernels = {\n" - ) - self.fp32_kernels_list = ( - "static std::vector fp32_kernels = {\n" - ) + self.kernels_lists = { + "hnn": "static std::vector fp16_nn_kernels = {", + "snn": "static std::vector fp32_nn_kernels = {", + "snt": "static std::vector fp32_nt_kernels = {", + "stn": "static std::vector fp32_tn_kernels = {", + } def __enter__(self): self.operation_path = os.path.join( @@ -78,19 +78,25 @@ def emit(self, configuration_name, operations): self.source_files.append(configuration_emitter.configuration_path) self.configurations.append(configuration_name) - if 'h' == operations[0].short_math_name(): - self.fp16_kernels_list += ( + + if operations[0].layout_name() == 'tn': + self.kernels_lists[ + operations[0].short_math_name() + operations[0].layout_name() + ] += ( """ launchKernel<""" + configuration_name - + "::Gemm>," + + ">," ) - if 's' == operations[0].short_math_name(): - self.fp32_kernels_list += ( + else: + self.kernels_lists[ + operations[0].short_math_name() + operations[0].layout_name() + ] += ( """ launchKernel<""" + configuration_name - + "::Gemm>," + + "<>>," ) self.top_level_file.write( @@ -117,11 +123,11 @@ def __exit__(self, exception_type, exception_value, traceback): ) ) - self.fp16_kernels_list += "\n};\n" - self.fp32_kernels_list += "\n};\n" + for k, v in self.kernels_lists.items(): + self.kernels_lists[k] += "\n};\n" self.top_level_file.write(self.namespace_template) - self.top_level_file.write(self.fp16_kernels_list) - self.top_level_file.write(self.fp32_kernels_list) + for k, v in self.kernels_lists.items(): + self.top_level_file.write(v) self.top_level_file.write(self.epilogue_template) self.top_level_file.close() diff --git a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_operation.py b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_operation.py index 32e244dc140618..41b5eeab6a91de 100644 --- a/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_operation.py +++ b/paddle/phi/kernels/sparse/gpu/cutlass_generator/gather_gemm_scatter_operation.py @@ -52,6 +52,8 @@ def __init__(self, operation_suffix=''): """ self.gemm_template = """ // Gemm operator ${operation_name} +template struct ${operation_name} { using Gemm = cutlass::gemm::device::GemmUniversal< @@ -75,10 +77,11 @@ def __init__(self, operation_suffix=''): ${math_operation}, ${transform_a}, ${transform_b}, - true, // gather a - false, // gather b - true // scatter d + ${gather_a}, // gather a + ${gather_b}, // gather b + ${scatter_d} // scatter d >; + static const cutlass::gemm::GemmUniversalMode Mode = Mode_; }; """ @@ -192,6 +195,9 @@ def emit(self, operation): 'math_operation': MathOperationTag[ operation.tile_description.math_instruction.math_operation ], + 'gather_a': 'true', + 'gather_b': str(operation.layout_name() == 'tn').lower(), + 'scatter_d': str(operation.layout_name() != 'tn').lower(), } return SubstituteTemplate(gemm_template, values) @@ -295,8 +301,8 @@ def __init__( B, C, element_epilogue, - 
epilogue_functor=EpilogueFunctor.LinearCombination, - swizzling_functor=SwizzlingFunctor.Identity8, + epilogue_functor, + swizzling_functor, ) self.ShortLayoutTypeNames = { LayoutType.ColumnMajor: 't', diff --git a/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h index 158a875b5403c0..60ffd99c7f1a5b 100644 --- a/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h +++ b/paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h @@ -24,29 +24,50 @@ namespace phi { namespace sparse { // To reduce tuning time, map shape (m,n,k) to (m/features_num_range,n,k) so -// that shapes in this range share the same key. +// that shapes within this range share the same key. constexpr int features_num_range = 10000; -#define DEFINE_GATHER_GEMM_SCATTER_DRIVER(dtype, kernels) \ - template \ - typename std::enable_if::value && \ - std::is_same::value, \ - void>::type \ - GatherGemmScatterDriver(const phi::GPUContext& ctx, \ - const T* const a, \ - const T* const b, \ - const T* const c, \ - T* const d, \ - const int& m, \ - const int& n, \ - const int& k, \ - const IntT* a_indices, \ - const IntT* c_d_indices, \ - T alpha, \ - T beta) { \ - auto* tuner = autotune::MakeGatherGemmScatterTuner(kernels[0]); \ +template +void GatherGemmScatterDriver( + const phi::GPUContext& ctx, + const size_t key, + const T* const a, + const T* const b, + const T* const c, + T* const d, + const int& m, + const int& n, + const int& k, + const IntT* a_indices, + const IntT* b_indices, + const IntT* c_d_indices, + T alpha, + T beta, + cutlass::device_memory::allocation* const workspace_ptr) {} + +#define EXPLICIT_SPECIALIZE_GATHER_GEMM_SCATTER_DRIVER( \ + T, kernels, transpose_a, transpose_b) \ + template <> \ + inline void GatherGemmScatterDriver( \ + const phi::GPUContext& ctx, \ + const size_t key, \ + const T* const a, \ + const T* const b, \ + const T* const c, \ + T* const d, \ + const int& m, \ + const int& n, \ + const int& k, \ + const int32_t* a_indices, \ + const int32_t* b_indices, \ + const int32_t* c_d_indices, \ + T alpha, \ + T beta, \ + cutlass::device_memory::allocation* const workspace_ptr) { \ + auto* tuner = \ + autotune::MakeGatherGemmScatterTuner( \ + kernels[0]); \ for (auto i = 1; i < kernels.size(); i++) tuner->AddCallBack(kernels[i]); \ - size_t key = autotune::GenKey(m / features_num_range, n, k); \ tuner->Run(ctx, \ key, \ alpha, \ @@ -60,28 +81,27 @@ constexpr int features_num_range = 10000; n, \ k, \ a_indices, \ - c_d_indices); \ + b_indices, \ + c_d_indices, \ + workspace_ptr); \ } -template -typename std::enable_if::value || - !std::is_same::value, - void>::type -GatherGemmScatterDriver(const phi::GPUContext& ctx, - const T* const a, - const T* const b, - const T* const c, - T* const d, - const int& m, - const int& n, - const int& k, - const IntT* a_indices, - const IntT* c_d_indices, - T alpha, - T beta) {} - -DEFINE_GATHER_GEMM_SCATTER_DRIVER(phi::dtype::float16, fp16_kernels) -DEFINE_GATHER_GEMM_SCATTER_DRIVER(float, fp32_kernels) +EXPLICIT_SPECIALIZE_GATHER_GEMM_SCATTER_DRIVER(phi::dtype::float16, + fp16_nn_kernels, + false, + false) +EXPLICIT_SPECIALIZE_GATHER_GEMM_SCATTER_DRIVER(float, + fp32_nn_kernels, + false, + false) +EXPLICIT_SPECIALIZE_GATHER_GEMM_SCATTER_DRIVER(float, + fp32_nt_kernels, + false, + true) +EXPLICIT_SPECIALIZE_GATHER_GEMM_SCATTER_DRIVER(float, + fp32_tn_kernels, + true, + false) } // namespace sparse } // namespace phi From f4ae373770c4482d3cd26f1be8ee4d97ac3794f7 Mon Sep 17 00:00:00 2001 From: WangZhen 
<23097963+0x45f@users.noreply.github.com>
Date: Thu, 13 Apr 2023 14:01:54 +0800
Subject: [PATCH 121/156] [Dy2St]Fix _param_grad_names when grad name likes 'param@GRAD@GRAD' (#52821)

* Fix _param_grad_names when like 'param@GRAD@GRAD'

---
 python/paddle/jit/dy2static/utils.py          | 25 ++++++++++++++-----
 .../dygraph_to_static/test_gradname_parse.py  |  0
 2 files changed, 19 insertions(+), 6 deletions(-)
 rename {python/paddle/fluid/tests/unittests => test}/dygraph_to_static/test_gradname_parse.py (100%)

diff --git a/python/paddle/jit/dy2static/utils.py b/python/paddle/jit/dy2static/utils.py
index 28c8c739f2efca..bd90f6089fe95b 100644
--- a/python/paddle/jit/dy2static/utils.py
+++ b/python/paddle/jit/dy2static/utils.py
@@ -87,6 +87,9 @@
 FOR_CONDITION_PREFIX = 'for_loop_condition'
 FOR_BODY_PREFIX = 'for_loop_body'
 
+GRAD_PREFIX = 'grad/'
+GRAD_SUFFIX = '@GRAD'
+
 NO_SHAPE_VAR_TYPE = [
     core.VarDesc.VarType.READER,
     core.VarDesc.VarType.STEP_SCOPES,
@@ -1463,18 +1466,28 @@ def _param_grad_names(program_desc, params):
     # the param grad name can be set correctly in the run_program.
     for param in params:
         candidate = []
-        suffix = param.name + '@GRAD'
         for var in program_desc.block(0).all_vars():
             var_name = var.name()
-            if var_name.endswith(suffix):
-                prefix_count = var_name.count('grad/')
-                if 'grad/' * prefix_count + suffix == var_name:
+            if param.name not in var_name:
+                continue
+            suf_count = var_name.count(GRAD_SUFFIX)
+            if suf_count > 0:
+                suffix = param.name + GRAD_SUFFIX * suf_count
+                pre_count = var_name.count(GRAD_PREFIX)
+                if GRAD_PREFIX * pre_count + suffix == var_name:
                     candidate.append(var_name)
         if candidate:
-            names.append(max(candidate, key=lambda name: name.count('grad/')))
+            names.append(
+                max(
+                    candidate,
+                    key=lambda name: name.count(GRAD_PREFIX)
+                    if GRAD_PREFIX in name
+                    else name.count(GRAD_SUFFIX),
+                )
+            )
         else:
-            names.append(suffix)
+            names.append(param.name + GRAD_SUFFIX)
 
     return names

diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradname_parse.py b/test/dygraph_to_static/test_gradname_parse.py
similarity index 100%
rename from python/paddle/fluid/tests/unittests/dygraph_to_static/test_gradname_parse.py
rename to test/dygraph_to_static/test_gradname_parse.py

From 0695fb884ac643c5a4cd5facc8635ec5f2030341 Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Thu, 13 Apr 2023 14:08:01 +0800
Subject: [PATCH 122/156] delete useless cast, elementwise_mul (#52831)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   1 +
 .../fluid/framework/ir/delete_cast_op_pass.cc | 125 +++++++++++++++++
 .../fluid/framework/ir/delete_cast_op_pass.h  |  15 ++-
 .../framework/ir/delete_cast_op_pass_test.cc  |  29 ++++
 .../ir/delete_elementwise_mul_op_pass.cc      | 127 ++++++++++++++++++
 paddle/fluid/framework/ir/pass.cc             |   1 +
 .../inference/api/paddle_pass_builder.cc      |   1 +
 paddle/phi/kernels/xpu/scatter_kernel.cc      |  19 ++-
 ...test_xpu_delete_elementwise_mul_op_pass.py |  83 ++++++++++++
 9 files changed, 394 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index b1db3dd0a43cb6..df7454bf2cf569 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -127,6 +127,7 @@ pass_library(gpu_cpu_map_matmul_to_mul_pass inference)
 pass_library(dense_fc_to_sparse_pass inference)
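For readers following the _param_grad_names change in the patch above, the new matching rule can be exercised standalone. The sketch below is illustrative only: the helper name match_param_grad and the plain string lists standing in for the program desc are invented here, not Paddle APIs. It shows why a double-backward name such as 'w@GRAD@GRAD' now resolves, while the old endswith-based rule missed it.

GRAD_PREFIX = 'grad/'
GRAD_SUFFIX = '@GRAD'

def match_param_grad(param_name, var_names):
    # Mirror of the rule: a variable matches if it has the form
    # GRAD_PREFIX * i + param_name + GRAD_SUFFIX * j, for i >= 0, j >= 1.
    candidate = []
    for var_name in var_names:
        if param_name not in var_name:
            continue
        suf_count = var_name.count(GRAD_SUFFIX)
        if suf_count > 0:
            suffix = param_name + GRAD_SUFFIX * suf_count
            pre_count = var_name.count(GRAD_PREFIX)
            if GRAD_PREFIX * pre_count + suffix == var_name:
                candidate.append(var_name)
    if candidate:
        return max(
            candidate,
            key=lambda n: n.count(GRAD_PREFIX)
            if GRAD_PREFIX in n
            else n.count(GRAD_SUFFIX),
        )
    return param_name + GRAD_SUFFIX

print(match_param_grad('w', ['w', 'w@GRAD', 'w@GRAD@GRAD']))  # w@GRAD@GRAD
print(match_param_grad('w', ['w', 'grad/grad/w@GRAD@GRAD']))  # grad/grad/w@GRAD@GRAD
# The old endswith(param.name + '@GRAD') test rejected 'w@GRAD@GRAD',
# because that string ends in '@GRAD@GRAD', not in 'w@GRAD'.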
pass_library(dense_multihead_matmul_to_sparse_pass inference) pass_library(delete_cast_op_pass inference) +pass_library(delete_elementwise_mul_op_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc index bfda0f32380102..5df0128f482eb2 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc @@ -505,6 +505,122 @@ int DeleteCastOpPass::ApplyCastIndexSamplePass(ir::Graph* graph) const { return found_subgraph_count; } +namespace patterns { +struct CastScatterPattern : public PatternBase { + CastScatterPattern(PDPattern* pattern, const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(scatter); + PATTERN_DECL_NODE(cast0); + PATTERN_DECL_NODE(cast1); + PATTERN_DECL_NODE(cast2); + // declare variable node's name + PATTERN_DECL_NODE(cast0_in); + PATTERN_DECL_NODE(cast0_out); + PATTERN_DECL_NODE(cast1_in); + PATTERN_DECL_NODE(cast1_out); + PATTERN_DECL_NODE(scatter_out); + PATTERN_DECL_NODE(cast2_out); +}; + +CastScatterPattern::CastScatterPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* cast0_in = pattern->NewNode(cast0_in_repr()) + ->assert_is_op_input("cast", "X") + ->assert_has_n_outputs(1); + auto* cast0 = + pattern->NewNode(cast0_repr()) + ->assert_is_op("cast") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto in_dtype = op_desc->GetAttrIfExists("in_dtype"); + auto out_dtype = op_desc->GetAttrIfExists("out_dtype"); + return in_dtype == static_cast(proto::VarType::FP16) && + out_dtype == static_cast(proto::VarType::FP32); + }); + auto* cast0_out = pattern->NewNode(cast0_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("scatter", "X") + ->assert_has_n_outputs(1); + auto* cast1_in = pattern->NewNode(cast1_in_repr()) + ->assert_is_op_input("cast", "X") + ->assert_has_n_outputs(1); + auto* cast1 = + pattern->NewNode(cast1_repr()) + ->assert_is_op("cast") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto in_dtype = op_desc->GetAttrIfExists("in_dtype"); + auto out_dtype = op_desc->GetAttrIfExists("out_dtype"); + return in_dtype == static_cast(proto::VarType::FP16) && + out_dtype == static_cast(proto::VarType::FP32); + }); + auto* cast1_out = pattern->NewNode(cast1_out_repr()) + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("scatter", "Updates") + ->assert_has_n_outputs(1); + auto* scatter = pattern->NewNode(scatter_repr())->assert_is_op("scatter"); + auto* scatter_out = pattern->NewNode(scatter_out_repr()) + ->assert_is_op_output("scatter", "Out") + ->assert_is_op_input("cast", "X") + ->assert_has_n_outputs(1); + auto* cast2 = + pattern->NewNode(cast2_repr()) + ->assert_is_op("cast") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto in_dtype = op_desc->GetAttrIfExists("in_dtype"); + auto out_dtype = op_desc->GetAttrIfExists("out_dtype"); + return in_dtype == static_cast(proto::VarType::FP32) && + out_dtype == static_cast(proto::VarType::FP16); + }); + auto* cast2_out = + pattern->NewNode(cast2_out_repr())->assert_is_op_output("cast", "Out"); + + cast0->LinksFrom({cast0_in}).LinksTo({cast0_out}); + cast1->LinksFrom({cast1_in}).LinksTo({cast1_out}); + scatter->LinksFrom({cast0_out, cast1_out}).LinksTo({scatter_out}); + 
cast2->LinksFrom({scatter_out}).LinksTo({cast2_out}); +} +} // namespace patterns + +int DeleteCastOpPass::ApplyCastScatterPass(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::CastScatterPattern pattern(gpd.mutable_pattern(), name_scope_); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle ApplyCastScatterPass fuse"; + GET_IR_NODE_FROM_SUBGRAPH(scatter, scatter, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0, cast0, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1, cast1, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast2, cast2, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_in, cast0_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast0_out, cast0_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1_in, cast1_in, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast1_out, cast1_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(scatter_out, scatter_out, pattern); + GET_IR_NODE_FROM_SUBGRAPH(cast2_out, cast2_out, pattern); + + scatter->Op()->RenameInput(cast0_out->Name(), cast0_in->Name()); + scatter->Op()->RenameInput(cast1_out->Name(), cast1_in->Name()); + scatter->Op()->RenameOutput(scatter_out->Name(), cast2_out->Name()); + IR_NODE_LINK_TO(cast0_in, scatter); + IR_NODE_LINK_TO(cast1_in, scatter); + IR_NODE_LINK_TO(scatter, cast2_out); + + std::unordered_set delete_nodes{ + cast0, cast1, cast2, cast0_out, cast1_out, scatter_out}; + GraphSafeRemoveNodes(graph, delete_nodes); + found_subgraph_count++; + }; + + gpd(graph, handler); + return found_subgraph_count; +} + namespace patterns { struct CastPattern : public PatternBase { CastPattern(PDPattern* pattern, const std::string& name_scope); @@ -591,6 +707,15 @@ void DeleteCastOpPass::ApplyImpl(ir::Graph* graph) const { << " cast_index_sample_cast subgraph"; } + found_subgraph_count = 0; + for (size_t i = 0; i < graph->SubGraphsSize(); i++) { + found_subgraph_count += ApplyCastScatterPass(graph->GetSubGraph(i)); + } + if (found_subgraph_count > 0) { + LOG(INFO) << "--- delete " << found_subgraph_count + << " cast_scatter_cast subgraph"; + } + found_subgraph_count = 0; for (size_t i = 0; i < graph->SubGraphsSize(); i++) { found_subgraph_count += ApplyCastPass(graph->GetSubGraph(i)); diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass.h b/paddle/fluid/framework/ir/delete_cast_op_pass.h index f0010a851f7225..37132af07e17fd 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass.h +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.h @@ -111,7 +111,20 @@ class DeleteCastOpPass : public FusePassBase { */ int ApplyCastIndexSamplePass(ir::Graph* graph) const; - // Delete cast if its "in_dtype" is the same with "out_dtype" + /* + Origin subgraph: + cast(fp16->fp32) cast(fp16->fp32) + \ / + scatter + | + cast(fp32->fp16) + + Optimized subgraph: + scatter + */ + int ApplyCastScatterPass(ir::Graph* graph) const; + + // Delete cast if its "in_dtype" is the same as "out_dtype" int ApplyCastPass(ir::Graph* graph) const; const std::string name_scope_{"delete_cast_op_pass"}; diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc index 570eae7825e35c..1885f945840332 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass_test.cc @@ -226,6 +226,35 @@ TEST(ApplyCastIndexSamplePass, basic) { cast_num_in_graph)); } +TEST(ApplyCastScatterPass, basic) { + paddle::framework::ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* cast0_in = 
Data(block, "cast0_in", {1}); + auto* cast0_out = AddCast(block, cast0_in, 4, 5); + auto* cast1_in = Data(block, "cast1_in", {1}); + auto* cast1_out = AddCast(block, cast1_in, 4, 5); + auto* scatter_out = Data(block, "scatter_out", {1}); + OpDesc* scatter = block->AppendOp(); + scatter->SetType("scatter"); + scatter->SetInput("X", {cast0_out->Name()}); + scatter->SetInput("Updates", {cast1_out->Name()}); + scatter->SetOutput("Out", {scatter_out->Name()}); + AddCast(block, scatter_out, 5, 4); + + std::unique_ptr graph(new ir::Graph(program)); + auto scope = new Scope(); + graph->Set("__param_scope__", scope); + auto pass = PassRegistry::Instance().Get("delete_cast_op_pass"); + pass->Apply(graph.get()); + int cast_num_in_graph = GetOpNum(graph->GetSubGraph(0), "cast"); + PADDLE_ENFORCE_EQ(GetOpNum(graph->GetSubGraph(0), "cast"), + 0, + platform::errors::PreconditionNotMet( + "graph should have 0 cast after delete_cast_op_pass, " + "but actually has %d.", + cast_num_in_graph)); +} + TEST(ApplyCastPass, basic) { paddle::framework::ProgramDesc program; auto* block = program.MutableBlock(0); diff --git a/paddle/fluid/framework/ir/delete_elementwise_mul_op_pass.cc b/paddle/fluid/framework/ir/delete_elementwise_mul_op_pass.cc new file mode 100644 index 00000000000000..5f8cd8b8ec6070 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_elementwise_mul_op_pass.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
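The rationale behind ApplyCastScatterPass, exercised by the test above, can be checked numerically: scatter only moves rows, and fp16 to fp32 conversion is exact, so casting the inputs up, scattering, and casting back is bit-for-bit identical to scattering in fp16 directly. A minimal NumPy sketch of that invariant (overwrite-mode scatter assumed; this is not the Paddle pass itself):

import numpy as np

def scatter(x, index, updates):
    # Overwrite-mode scatter: rows of `x` at `index` replaced by `updates`.
    out = x.copy()
    out[index] = updates
    return out

x = np.random.rand(8, 4).astype(np.float16)
updates = np.random.rand(3, 4).astype(np.float16)
index = np.array([1, 4, 6])

# Original subgraph: cast(fp16->fp32) -> scatter -> cast(fp32->fp16).
via_fp32 = scatter(
    x.astype(np.float32), index, updates.astype(np.float32)
).astype(np.float16)

# Optimized subgraph: a single fp16 scatter.
direct = scatter(x, index, updates)

# fp16 -> fp32 is exact and scatter is pure data movement, so the round
# trip through fp32 cannot change any value.
assert (via_fp32 == direct).all()

This is also why the same commit registers an fp16 XPU scatter kernel below: after the fold, the scatter op must be able to run on fp16 tensors.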
+ +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FillMulPattern : public PatternBase { + FillMulPattern(PDPattern* pattern, const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(fill); + PATTERN_DECL_NODE(mul); + // declare variable node's name + PATTERN_DECL_NODE(fill_out); + PATTERN_DECL_NODE(mul_in); + PATTERN_DECL_NODE(mul_out); +}; + +FillMulPattern::FillMulPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* fill = pattern->NewNode(fill_repr()) + ->assert_is_op("fill_constant_batch_size_like") + ->assert_more([](Node* node) { + float value = node->Op()->GetAttrIfExists("value"); + return fabs(value - 1.f) < 1e-5; + }); + auto* fill_out = + pattern->NewNode(fill_out_repr()) + ->assert_is_op_output("fill_constant_batch_size_like", "Out") + ->assert_has_n_outputs(1); + auto* mul_in = pattern->NewNode(mul_in_repr()); + auto* mul = pattern->NewNode(mul_repr())->assert_is_op("elementwise_mul"); + auto* mul_out = pattern->NewNode(mul_out_repr()) + ->assert_is_op_output("elementwise_mul", "Out"); + + fill->LinksTo({fill_out}); + mul->LinksFrom({fill_out, mul_in}).LinksTo({mul_out}); +} + +} // namespace patterns + +/* +Delete "elementwise" if one of inputs is "1". +*/ +class DeleteElementwiseMulOpPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + const std::string name_scope_{"delete_elementwise_mul_op_pass"}; +}; + +void DeleteElementwiseMulOpPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::FillMulPattern pattern(gpd.mutable_pattern(), name_scope_); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle DeleteElementwiseMulOpPass fuse"; +#define GET_IR_NODE(node_) GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern) + GET_IR_NODE(fill); + GET_IR_NODE(mul); + GET_IR_NODE(fill_out); + GET_IR_NODE(mul_in); + GET_IR_NODE(mul_out); +#undef GET_IR_NODE + + for (auto* next_op : mul_out->outputs) { + next_op->Op()->RenameInput(mul_out->Name(), mul_in->Name()); + IR_NODE_LINK_TO(mul_in, next_op); + } + + std::unordered_set delete_nodes{fill, mul, fill_out, mul_out}; + GraphSafeRemoveNodes(graph, delete_nodes); + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_elementwise_mul_op_pass, + paddle::framework::ir::DeleteElementwiseMulOpPass); + +REGISTER_PASS_CAPABILITY(delete_elementwise_mul_op_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "fill_constant_batch_size_like", 0)); diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 548d94360003ac..cc4033f7f5a545 100644 --- a/paddle/fluid/framework/ir/pass.cc 
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -57,6 +57,7 @@ static const std::vector<std::string> xpu_support_subgraph_passes = {
     "identity_scale_op_clean_pass",
     "delete_op_device_pass",
     "constant_folding_pass",
+    "delete_elementwise_mul_op_pass",
     "generate_sequence_xpu_fuse_pass",
     "embedding_with_eltwise_add_xpu_fuse_pass",
     "multi_encoder_xpu_fuse_pass",
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index a1fe08b081eebd..cd46398c66fcc4 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -524,6 +524,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "identity_scale_op_clean_pass",
       "delete_op_device_pass",
       "constant_folding_pass",
+      "delete_elementwise_mul_op_pass",
       "generate_sequence_xpu_fuse_pass",
       "embedding_with_eltwise_add_xpu_fuse_pass",
       "multi_encoder_xpu_fuse_pass",
diff --git a/paddle/phi/kernels/xpu/scatter_kernel.cc b/paddle/phi/kernels/xpu/scatter_kernel.cc
index de48682c1ecd55..4856c05ebf7403 100644
--- a/paddle/phi/kernels/xpu/scatter_kernel.cc
+++ b/paddle/phi/kernels/xpu/scatter_kernel.cc
@@ -27,9 +27,12 @@ void ScatterKernel(const Context &ctx,
                    const DenseTensor &updates,
                    bool overwrite,
                    DenseTensor *out) {
+  using XPUTypeT = typename XPUTypeTrait<T>::Type;
   out->Resize(x.dims());
-  ctx.template Alloc<T>(out);
-  int ret = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
+  auto *x_data = reinterpret_cast<const XPUTypeT *>(x.data<T>());
+  auto *updates_data = reinterpret_cast<const XPUTypeT *>(updates.data<T>());
+  auto *out_data = reinterpret_cast<XPUTypeT *>(ctx.template Alloc<T>(out));
+  int ret = xpu::copy(ctx.x_context(), x_data, out_data, x.numel());
   PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
   // Apply ScatterUpdate: Out[index] = Updates[:]
   const auto &index_type = index.dtype();
@@ -78,8 +81,6 @@ void ScatterKernel(const Context &ctx,
   int dim0 = static_cast<int>(x.dims()[0]);
   int dim1 = static_cast<int>(
       phi::product(phi::slice_ddim(x_dims, 1, x_dims.size())));
-  T *out_data = out->data<T>();
-  const T *updates_data = updates.data<T>();
   DenseTensor indices_cpu(index.type());
   phi::Copy(ctx, index, phi::CPUPlace(), false, &indices_cpu);
@@ -113,5 +114,11 @@ void ScatterKernel(const Context &ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    scatter, XPU, ALL_LAYOUT, phi::ScatterKernel, float, int, int64_t) {}
+PD_REGISTER_KERNEL(scatter,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::ScatterKernel,
+                   float,
+                   int,
+                   int64_t,
+                   phi::dtype::float16) {}
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_delete_elementwise_mul_op_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_delete_elementwise_mul_op_pass.py
new file mode 100644
index 00000000000000..b49e3652c33956
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_delete_elementwise_mul_op_pass.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
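The pass registered above, and exercised by the test file that follows, rests on a simple identity: elementwise_mul by a tensor of ones is the identity map, so both the multiply and the fill_constant_batch_size_like feeding it can be removed, with the consumer rewired to the other input. Note the pass only fires when the fill value is (numerically) 1, per the fabs(value - 1.f) < 1e-5 check. A NumPy sketch of the identity (illustrative, not the pass itself):

import numpy as np

def fill_constant_batch_size_like(ref, value):
    # Stand-in for the fill op: a [batch, 1] tensor of `value`, with the
    # batch dimension taken from `ref` (illustrative, not the Paddle op).
    return np.full((ref.shape[0], 1), value, dtype=np.float32)

def relu(t):
    return np.maximum(t, 0.0)

x = np.random.rand(4, 16).astype(np.float32)
ones = fill_constant_batch_size_like(x, 1.0)

before = relu(ones * x)  # fill -> elementwise_mul -> relu
after = relu(x)          # the pass rewires relu straight to x

# Multiplying by exactly 1.0 is exact in IEEE floats, so the two
# subgraphs are bitwise equivalent.
assert np.array_equal(before, after)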
+ +import unittest + +import hypothesis.strategies as st +from auto_scan_test import PassAutoScanTest +from program_config import OpConfig, ProgramConfig, TensorConfig + + +class TestDeleteElementwiseMulOpPass(PassAutoScanTest): + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_xpu=True) + yield config, ["relu"], (1e-5, 1e-5) + + def sample_program_config(self, draw): + x_shape = draw( + st.lists( + st.integers(min_value=1, max_value=20), min_size=2, max_size=2 + ) + ) + + fill_op = OpConfig( + "fill_constant_batch_size_like", + inputs={ + "Input": ["fill_x"], + }, + shape=[-1, 1], + input_dim_idx=0, + output_dim_idx=0, + dtype=5, + value=1.0, + str_value="1", + force_cpu=False, + outputs={"Out": ["fill_out"]}, + ) + mul_op = OpConfig( + "elementwise_mul", + inputs={"X": ["fill_out"], "Y": ["mul_in"]}, + axis=0, + outputs={"Out": ["mul_out"]}, + ) + relu_op = OpConfig( + "relu", + inputs={ + "X": ["mul_out"], + }, + outputs={"Out": ["relu_out"]}, + ) + ops = [fill_op, mul_op, relu_op] + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "fill_x": TensorConfig(shape=x_shape), + "mul_in": TensorConfig(shape=x_shape), + }, + outputs=ops[-1].outputs["Out"], + ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=25, + passes=["delete_elementwise_mul_op_pass"], + ) + + +if __name__ == "__main__": + unittest.main() From 5664ea26a0c2ed61bca5857877a3bc6ef0a1d01c Mon Sep 17 00:00:00 2001 From: HongyuJia Date: Thu, 13 Apr 2023 14:17:44 +0800 Subject: [PATCH 123/156] [enforce.h Decouple logging.h] Delete glog/logging.h from enforce.h (#52651) * [enforce.h Decouple logging.h] Delete glog/logging.h from enforce.h * Add logging.h for profiler.cc * Add logging.h for gloo_utils.h * Add logging.h for addmm_kernel_impl.h * Add logging.h for addmm_grad_kernel_impl.h * Add logging.h for p_send_kernel.cu * Add logging.h for determinant_grad_kernel_impl.h * Add logging.h for p_recv_kernel.cu * Add logging.h for elementwise_grad_base.h * Add logging.h for transfer_layout_kernel.cc * Add logging.h for eigvals_kernel.cc and index_select_impl.h * Add logging.h for all files in kernel directory * Add logging.h for xpu_info.cc * Add logging.h for xpu --- paddle/fluid/framework/ir/delete_cast_op_pass.cc | 2 ++ paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 3 +++ paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc | 3 +++ paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc | 3 +++ .../ir/xpu/fused_multi_transformer_xpu_quant_pass.cc | 3 +++ .../ir/xpu/fused_multi_transformer_xpu_quant_pass_tester.cc | 2 ++ .../framework/ir/xpu/generate_sequence_xpu_fuse_pass.cc | 3 +++ paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc | 3 +++ .../framework/ir/xpu/multi_encoder_xpu_slice_fuse_pass.cc | 3 +++ paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.cc | 3 +++ paddle/fluid/framework/ir/xpu/stack_fuse_pass.cc | 3 +++ .../fluid/memory/allocation/cuda_device_context_allocator.h | 2 ++ paddle/fluid/platform/device/xpu/bkcl_helper.h | 2 ++ paddle/fluid/platform/device/xpu/xpu_op_list.cc | 2 ++ paddle/phi/api/lib/data_transform.cc | 2 ++ paddle/phi/api/lib/tensor_copy.cc | 2 ++ paddle/phi/api/lib/tensor_method.cc | 2 ++ paddle/phi/api/profiler/profiler.cc | 2 ++ paddle/phi/api/yaml/generator/strings_api_gen.py | 2 ++ paddle/phi/backends/context_pool.cc | 3 +++ paddle/phi/backends/custom/custom_device.cc | 2 ++ paddle/phi/backends/event.cc | 2 ++ paddle/phi/backends/gpu/cuda/cuda_graph.h | 2 ++ 
paddle/phi/backends/gpu/cuda/cuda_info.cc | 2 ++ paddle/phi/backends/gpu/gpu_resources.cc | 1 + paddle/phi/backends/onednn/onednn_context.cc | 2 ++ paddle/phi/backends/stream.cc | 2 ++ paddle/phi/backends/xpu/xpu_context.cc | 2 ++ paddle/phi/backends/xpu/xpu_info.cc | 2 ++ paddle/phi/core/cuda_stream.h | 2 ++ paddle/phi/core/dense_tensor.cc | 2 ++ paddle/phi/core/distributed/check/nccl_dynamic_check.cc | 2 ++ paddle/phi/core/distributed/gloo_utils.h | 2 ++ paddle/phi/core/distributed/nccl_comm_context.cc | 3 +++ paddle/phi/core/distributed/store/tcp_store.cc | 2 ++ paddle/phi/core/distributed/store/tcp_utils.cc | 2 ++ paddle/phi/core/enforce.h | 1 - paddle/phi/core/selected_rows_impl.cc | 3 +++ paddle/phi/core/string_tensor.cc | 2 ++ paddle/phi/core/tensor_utils.cc | 2 ++ paddle/phi/infermeta/multiary.cc | 2 ++ paddle/phi/kernels/coalesce_tensor_kernel.cc | 2 ++ paddle/phi/kernels/cpu/adam_kernel.cc | 2 ++ paddle/phi/kernels/cpu/adamw_kernel.cc | 2 ++ paddle/phi/kernels/cpu/add_n_kernel.cc | 2 ++ paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc | 2 ++ paddle/phi/kernels/cpu/eigvals_kernel.cc | 2 ++ paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 ++ paddle/phi/kernels/cpu/gelu_kernel.cc | 2 ++ paddle/phi/kernels/cpu/index_add_impl.h | 2 ++ paddle/phi/kernels/cpu/index_sample_kernel.cc | 2 ++ paddle/phi/kernels/cpu/index_select_impl.h | 2 ++ paddle/phi/kernels/cpu/roll_kernel_impl.h | 2 ++ paddle/phi/kernels/funcs/batch_norm_utils.h | 2 ++ paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 2 ++ paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h | 2 ++ paddle/phi/kernels/funcs/concat_and_split_functor.cu | 2 ++ paddle/phi/kernels/funcs/dims_simplifier.h | 2 ++ paddle/phi/kernels/funcs/elementwise_grad_base.h | 2 ++ paddle/phi/kernels/funcs/fused_gemm_epilogue.h | 1 + paddle/phi/kernels/funcs/gather_scatter_functor.cc | 2 ++ paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 2 ++ paddle/phi/kernels/funcs/selected_rows_functor.cc | 2 ++ paddle/phi/kernels/funcs/selected_rows_functor.cu | 2 ++ paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h | 2 ++ paddle/phi/kernels/funcs/stack_and_unstack.h | 2 ++ paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu | 2 ++ paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h | 2 ++ paddle/phi/kernels/fusion/gpu/attn_gemm.h | 2 ++ paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu | 2 ++ paddle/phi/kernels/fusion/gpu/fmha_ref.h | 2 ++ paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h | 2 ++ .../fusion/gpu/fused_layernorm_residual_dropout_bias.h | 2 ++ .../kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu | 2 ++ paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc | 2 ++ .../kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc | 3 +++ paddle/phi/kernels/gpu/activation_grad_kernel.cu | 2 ++ paddle/phi/kernels/gpu/adam_kernel.cu | 2 ++ paddle/phi/kernels/gpu/adamw_kernel.cu | 2 ++ paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu | 2 ++ paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu | 2 ++ paddle/phi/kernels/gpu/batch_norm_kernel.cu | 2 ++ paddle/phi/kernels/gpu/elementwise_grad.h | 2 ++ paddle/phi/kernels/gpu/embedding_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/gelu_funcs.h | 2 ++ paddle/phi/kernels/gpu/grid_sample_kernel.cu | 2 ++ paddle/phi/kernels/gpu/index_add_kernel.cu | 1 + paddle/phi/kernels/gpu/index_select_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu | 3 +++ paddle/phi/kernels/gpu/instance_norm_kernel.cu | 2 ++ paddle/phi/kernels/gpu/p_recv_kernel.cu | 2 ++ paddle/phi/kernels/gpu/p_send_kernel.cu | 2 ++ 
paddle/phi/kernels/gpu/prelu_kernel.cu | 2 ++ paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 ++ paddle/phi/kernels/gpu/top_k_kernel.cu | 2 ++ paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h | 2 ++ paddle/phi/kernels/gpudnn/conv_cudnn_v7.h | 2 ++ paddle/phi/kernels/gpudnn/conv_gpudnn_base.h | 2 ++ paddle/phi/kernels/gpudnn/conv_grad_kernel.cu | 2 ++ paddle/phi/kernels/gpudnn/conv_kernel.cu | 2 ++ paddle/phi/kernels/gpudnn/conv_miopen_helper.h | 2 ++ paddle/phi/kernels/impl/activation_grad_impl.h | 2 ++ paddle/phi/kernels/impl/addmm_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/addmm_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/determinant_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/determinant_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/dot_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/einsum_grad_impl.h | 2 ++ paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/lamb_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/matmul_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/matmul_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/merged_momentum_impl.h | 2 ++ paddle/phi/kernels/impl/momentum_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/set_value_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h | 2 ++ paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h | 2 ++ paddle/phi/kernels/npu_identity_kernel.cc | 2 ++ paddle/phi/kernels/onednn/transpose_kernel.cc | 3 +++ paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc | 1 + paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc | 2 ++ paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu | 2 ++ paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu | 2 ++ paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h | 2 ++ paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc | 2 ++ paddle/phi/kernels/share_buffer_kernel.cc | 2 ++ paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu | 2 ++ paddle/phi/kernels/transfer_layout_kernel.cc | 2 ++ paddle/phi/kernels/xpu/adam_kernel.cc | 2 ++ paddle/phi/kernels/xpu/adamw_kernel.cc | 2 ++ paddle/phi/kernels/xpu/amp_kernel.cc | 2 ++ paddle/phi/kernels/xpu/set_value_grad_kernel.cc | 5 +++-- test/cpp/phi/core/test_sparse_csr_tensor.cc | 1 + test/cpp/phi/core/test_string_tensor.cc | 1 + test/cpp/phi/core/test_tensor_array.cc | 4 ++-- test/cpp/phi/kernels/test_strings_lower_upper_dev_api.cc | 2 ++ 136 files changed, 278 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_cast_op_pass.cc b/paddle/fluid/framework/ir/delete_cast_op_pass.cc index 5df0128f482eb2..3bf2e53e40533b 100644 --- a/paddle/fluid/framework/ir/delete_cast_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_cast_op_pass.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/framework/ir/delete_cast_op_pass.h" +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 0b591120014e35..af5ec2c651c8a8 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc b/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc index c1137c319d2f52..b96fc8c31bb02b 100644 --- a/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc +++ b/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index cd9febd8ec830d..74b021bf1af271 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc index e79c37c127ba2b..fab466a50637e2 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass_tester.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass_tester.cc index d3181dbde4632b..cdcc20cb2f7657 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass_tester.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass_tester.cc @@ -11,6 +11,8 @@ limitations under the License. */ #include +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" diff --git a/paddle/fluid/framework/ir/xpu/generate_sequence_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/generate_sequence_xpu_fuse_pass.cc index 7b40b67824d16f..5be359c0cd1c0c 100644 --- a/paddle/fluid/framework/ir/xpu/generate_sequence_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/generate_sequence_xpu_fuse_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 932d4ca7b88646..e100db36c2735a 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_slice_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_slice_fuse_pass.cc index 722ac525d41762..59754f9d58146b 100644 --- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_slice_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_slice_fuse_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.cc index c6adf390fad715..2944f9c7a51749 100644 --- a/paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/framework/ir/xpu/one_beam_size_fuse_pass.h" #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/xpu/stack_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/stack_fuse_pass.cc index a128011159db7d..ff7a0b30237d1d 100644 --- a/paddle/fluid/framework/ir/xpu/stack_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/stack_fuse_pass.cc @@ -13,6 +13,9 @@ // limitations under the License. #include + +#include "glog/logging.h" + #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index c8c1f7a61e6df1..7286f84160c6ad 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -19,6 +19,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/platform/device/xpu/bkcl_helper.h b/paddle/fluid/platform/device/xpu/bkcl_helper.h index e7c968bd34f1e6..1f44bc0a8c98b2 100644 --- a/paddle/fluid/platform/device/xpu/bkcl_helper.h +++ b/paddle/fluid/platform/device/xpu/bkcl_helper.h @@ -25,6 +25,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/collective_helper.h" diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 1b12b9ebf94683..abe5bcd8c6c852 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/phi/backends/xpu/xpu_op_kpfirst_list.h" diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index db848413d9fe8c..d5e09c052c00c8 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" +#include "glog/logging.h" + #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/context_pool.h" diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 5bbb178f50424a..ac3f80bed0c03c 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" +#include "glog/logging.h" + #include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 5efd89d34e1bb4..2b5269f5882f37 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/api/include/tensor.h" +#include "glog/logging.h" + #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/tensor_base.h" diff --git a/paddle/phi/api/profiler/profiler.cc b/paddle/phi/api/profiler/profiler.cc index 3c7f405a011a68..906ca12c7c581a 100644 --- a/paddle/phi/api/profiler/profiler.cc +++ b/paddle/phi/api/profiler/profiler.cc @@ -20,6 +20,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/api/profiler/common_event.h" #include "paddle/phi/api/profiler/device_tracer.h" #include "paddle/phi/api/profiler/host_event_recorder.h" diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py index 53f4cd6e0dff3b..b66342d7815298 100644 --- a/paddle/phi/api/yaml/generator/strings_api_gen.py +++ b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -330,6 +330,8 @@ def source_include(header_file_path): #include "{header_file_path}" #include "gflags/gflags.h" +#include "glog/logging.h" + #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/string_tensor.h" diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index fdc156d5617eb6..b05aa51205ac00 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/backends/context_pool.h" + +#include "glog/logging.h" + #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index cee4f56d63c24c..46535ee84777a3 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
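The bulk of this commit is mechanical: every file that calls LOG or VLOG but previously received glog only transitively through enforce.h now includes glog/logging.h itself. A change of this shape is easy to script. The sketch below is an assumption-laden illustration (the regexes and the search root are heuristics invented here, not project tooling); it lists sources that use glog macros without including the header directly:

import pathlib
import re

USES_GLOG = re.compile(r'\bV?LOG\s*\(')          # matches LOG(...) and VLOG(...)
HAS_GLOG = re.compile(r'#include\s+"glog/logging\.h"')

def missing_glog_includes(root):
    # Files that invoke glog macros but never include glog/logging.h
    # directly, i.e. files relying on a transitive include like enforce.h.
    for pattern in ('*.cc', '*.cu', '*.h'):
        for path in pathlib.Path(root).rglob(pattern):
            text = path.read_text(errors='ignore')
            if USES_GLOG.search(text) and not HAS_GLOG.search(text):
                yield path

for path in missing_glog_includes('paddle/phi'):
    print(path)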
+#include "glog/logging.h" + #include "paddle/phi/api/profiler/trace_event_collector.h" #include "paddle/phi/backends/callback_manager.h" #include "paddle/phi/backends/context_pool.h" diff --git a/paddle/phi/backends/event.cc b/paddle/phi/backends/event.cc index 7e64bc58fdbbf8..371e858a3fe4fa 100644 --- a/paddle/phi/backends/event.cc +++ b/paddle/phi/backends/event.cc @@ -14,6 +14,8 @@ #include "paddle/phi/backends/event.h" +#include "glog/logging.h" + #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/stream.h" diff --git a/paddle/phi/backends/gpu/cuda/cuda_graph.h b/paddle/phi/backends/gpu/cuda/cuda_graph.h index abf9dac21db8fd..2f61e031f1128b 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_graph.h +++ b/paddle/phi/backends/gpu/cuda/cuda_graph.h @@ -26,6 +26,8 @@ #include "cuda.h" // NOLINT #include "cuda_runtime.h" // NOLINT +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index 1ef1327cd6d039..297b0a5e132315 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -14,6 +14,8 @@ #include "paddle/phi/backends/gpu/gpu_info.h" +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" static std::once_flag g_device_props_size_init_flag; diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 1b484b503365fd..622891c93bbe74 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -33,6 +33,7 @@ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) #endif // PADDLE_WITH_CUDA +#include "glog/logging.h" #include "unsupported/Eigen/CXX11/Tensor" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/backends/onednn/onednn_context.cc b/paddle/phi/backends/onednn/onednn_context.cc index a536651cbab4fe..5095b5c234b9a6 100644 --- a/paddle/phi/backends/onednn/onednn_context.cc +++ b/paddle/phi/backends/onednn/onednn_context.cc @@ -21,6 +21,8 @@ #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/expect.h" +#include "glog/logging.h" + namespace phi { OneDNNContextThreadLocals::Body::Body() diff --git a/paddle/phi/backends/stream.cc b/paddle/phi/backends/stream.cc index 097f44264580c3..52bb1d2e549b52 100644 --- a/paddle/phi/backends/stream.cc +++ b/paddle/phi/backends/stream.cc @@ -14,6 +14,8 @@ #include "paddle/phi/backends/stream.h" +#include "glog/logging.h" + #include "paddle/phi/backends/device_guard.h" #include "paddle/phi/backends/event.h" diff --git a/paddle/phi/backends/xpu/xpu_context.cc b/paddle/phi/backends/xpu/xpu_context.cc index c1fc20761dc0de..b1d768f8445b89 100644 --- a/paddle/phi/backends/xpu/xpu_context.cc +++ b/paddle/phi/backends/xpu/xpu_context.cc @@ -16,6 +16,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 997d3cd9d11bb4..a8fbf7e7e6b7b3 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/backends/xpu/xpu_header.h" diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index ff2f4846b17187..b27770b0814339 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -28,6 +28,8 @@ using gpuStream_t = cudaStream_t; using gpuStream_t = hipStream_t; #endif +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" namespace phi { diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index fc84ec1b33da5d..3116093a7884d7 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" +#include "glog/logging.h" + #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index da8fb5d98a82fb..57bdf12bce7992 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -14,6 +14,8 @@ #include "paddle/phi/core/distributed/check/nccl_dynamic_check.h" +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" diff --git a/paddle/phi/core/distributed/gloo_utils.h b/paddle/phi/core/distributed/gloo_utils.h index 80e5fca49af35f..57e029c17ac9b6 100644 --- a/paddle/phi/core/distributed/gloo_utils.h +++ b/paddle/phi/core/distributed/gloo_utils.h @@ -22,6 +22,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/reduce_helper.h" diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index df80bf38742984..49a58f8f10f9cc 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include "paddle/phi/core/distributed/nccl_comm_context.h" + +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/check/nccl_dynamic_check.h" #include "paddle/phi/core/distributed/check/static_check.h" diff --git a/paddle/phi/core/distributed/store/tcp_store.cc b/paddle/phi/core/distributed/store/tcp_store.cc index dcf75042c104df..98b1ad3f850f30 100644 --- a/paddle/phi/core/distributed/store/tcp_store.cc +++ b/paddle/phi/core/distributed/store/tcp_store.cc @@ -18,6 +18,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/core/distributed/store/tcp_utils.h" #include "paddle/phi/core/flags.h" diff --git a/paddle/phi/core/distributed/store/tcp_utils.cc b/paddle/phi/core/distributed/store/tcp_utils.cc index 516c6437e0f568..aaf00cb8000853 100644 --- a/paddle/phi/core/distributed/store/tcp_utils.cc +++ b/paddle/phi/core/distributed/store/tcp_utils.cc @@ -18,6 +18,8 @@ #include #include +#include "glog/logging.h" + namespace phi { namespace distributed { namespace tcputils { diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 70cd7af19cdbe9..96006fe83a42b8 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -59,7 +59,6 @@ limitations under the License. */ #endif #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" #include "paddle/phi/core/errors.h" #include "paddle/phi/backends/dynload/port.h" diff --git a/paddle/phi/core/selected_rows_impl.cc b/paddle/phi/core/selected_rows_impl.cc index f0fbefe2fc5fa4..e64474ea0d4e20 100644 --- a/paddle/phi/core/selected_rows_impl.cc +++ b/paddle/phi/core/selected_rows_impl.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/selected_rows_impl.h" + +#include "glog/logging.h" + #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/utils/data_type.h" diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 6957d1fc1f9675..e82915a38abe5b 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/string_tensor.h" +#include "glog/logging.h" + #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/pstring.h" diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 4fd11df211f9b7..79f8138222a505 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/core/tensor_utils.h" +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/data_type.h" diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 71fe149e7c0c0f..bd38a2ec521d9b 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include +#include "glog/logging.h" + #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/scalar.h" diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 5b9c1b4a3de21d..559c88eebbc481 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -17,6 +17,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/device_memory_aligment.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc index bf8296f8418fcc..083c9dab74001c 100644 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/cpu/adam_kernel.cc @@ -16,6 +16,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc index c21c04e6826210..f8b8ea67e23bb6 100644 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/cpu/adamw_kernel.cc @@ -16,6 +16,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/add_n_kernel.cc b/paddle/phi/kernels/cpu/add_n_kernel.cc index 54506ccd54f5b9..650d33755dca95 100644 --- a/paddle/phi/kernels/cpu/add_n_kernel.cc +++ b/paddle/phi/kernels/cpu/add_n_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/impl/add_n_kernel_impl.h" +#include "glog/logging.h" + namespace phi { template diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc index 9eec65e92a38f3..4740dddab3350c 100644 --- a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/batch_norm_kernel.h" diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index 555dbfb71dfb77..48da00b8a70060 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/eigvals_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 254c4ea5716d19..65ee3c1851003e 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -17,6 +17,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index 4d23470aa4e9e2..dbab3bd3266649 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -17,6 +17,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/index_add_impl.h b/paddle/phi/kernels/cpu/index_add_impl.h index d9a1b93d7217de..0a0671951b357a 100644 --- a/paddle/phi/kernels/cpu/index_add_impl.h +++ b/paddle/phi/kernels/cpu/index_add_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc index 4ab51161350f29..f9e1721a6968b5 100644 --- a/paddle/phi/kernels/cpu/index_sample_kernel.cc +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -21,6 +21,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h index 02cc2b4bdb91e8..1b2842ee666348 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/cpu/roll_kernel_impl.h b/paddle/phi/kernels/cpu/roll_kernel_impl.h index e2d96b896ac6a6..f177afec120ff8 100644 --- a/paddle/phi/kernels/cpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/cpu/roll_kernel_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/funcs/batch_norm_utils.h b/paddle/phi/kernels/funcs/batch_norm_utils.h index 513266cb423b59..64a20ee5d2e098 100644 --- a/paddle/phi/kernels/funcs/batch_norm_utils.h +++ b/paddle/phi/kernels/funcs/batch_norm_utils.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include 
"paddle/phi/kernels/funcs/math_function.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 2568f88274f0ea..35769d0e6e3386 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -15,6 +15,8 @@ #pragma once #include "gflags/gflags.h" +#include "glog/logging.h" + #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index 86a8b7eae0ecd0..284d866d08fbdb 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -16,6 +16,8 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 +#include "glog/logging.h" + #include // NOLINT #include "cuda.h" // NOLINT #include "paddle/phi/backends/dynload/cublasLt.h" diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 2f75f42c2a7100..5a7574b56a8917 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/kernels/funcs/dims_simplifier.h b/paddle/phi/kernels/funcs/dims_simplifier.h index a52373c117e3ee..fa0953ec46aab3 100644 --- a/paddle/phi/kernels/funcs/dims_simplifier.h +++ b/paddle/phi/kernels/funcs/dims_simplifier.h @@ -17,6 +17,8 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/dense_tensor.h" +#include "glog/logging.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 0b24031de3556b..858fc6766afdcb 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/memory_utils.h" diff --git a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h index 8edfea5f0789cb..b9b6854fd3841d 100644 --- a/paddle/phi/kernels/funcs/fused_gemm_epilogue.h +++ b/paddle/phi/kernels/funcs/fused_gemm_epilogue.h @@ -27,6 +27,7 @@ limitations under the License. */ #if CUDA_VERSION >= 11060 #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cc b/paddle/phi/kernels/funcs/gather_scatter_functor.cc index 67af6a3322d9a6..db9e8a8247f30a 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.cc +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gather_scatter_functor.h" +#include "glog/logging.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 0b78b074e81f45..d8ade4612c85e9 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -24,6 +24,8 @@ namespace cub = hipcub; #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index a7e8b1b4fdb5ec..9fc69ddab4a9c7 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -30,6 +30,8 @@ limitations under the License. */ #include "paddle/phi/backends/onednn/axpy_handler.h" #endif +#include "glog/logging.h" + namespace phi { namespace funcs { template diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu index ccd0875dd9d4f6..2947701befcc70 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cu +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu @@ -15,6 +15,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 7604008c4aec5e..fde5cb1768d47c 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/dynload/cusparse.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h index d82cbd523f8fbd..af45cfebc276f9 100644 --- a/paddle/phi/kernels/funcs/stack_and_unstack.h +++ b/paddle/phi/kernels/funcs/stack_and_unstack.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/fast_divmod.h" #include "paddle/phi/kernels/funcs/segmented_array.h" diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu index 63d9ffae3a85c8..c6480f6dbe54ba 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.cu @@ -16,6 +16,8 @@ #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h" +#include "glog/logging.h" + namespace phi { namespace fusion { namespace cutlass_internal { diff --git a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h index 710de13a5167d9..5a215a72369849 100644 --- a/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h +++ b/paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_util.h @@ -17,6 +17,8 @@ #include #include "paddle/phi/kernels/fusion/cutlass/conv2d/conv2d_decl.h" +#include "glog/logging.h" + #include "cutlass/cutlass.h" #include "cutlass/gemm/device/gemm.h" diff --git a/paddle/phi/kernels/fusion/gpu/attn_gemm.h 
b/paddle/phi/kernels/fusion/gpu/attn_gemm.h index 4945a8a9a6dc7e..01544436e4d7eb 100644 --- a/paddle/phi/kernels/fusion/gpu/attn_gemm.h +++ b/paddle/phi/kernels/fusion/gpu/attn_gemm.h @@ -18,6 +18,8 @@ #include "paddle/phi/backends/dynload/cublasLt.h" #endif +#include "glog/logging.h" + #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" diff --git a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu index a1957eab67f820..da71c0bf7d364d 100644 --- a/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/conv_fusion_kernel.cu @@ -22,6 +22,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" diff --git a/paddle/phi/kernels/fusion/gpu/fmha_ref.h b/paddle/phi/kernels/fusion/gpu/fmha_ref.h index ff09d05a7286cb..207be7b5d7bda4 100644 --- a/paddle/phi/kernels/fusion/gpu/fmha_ref.h +++ b/paddle/phi/kernels/fusion/gpu/fmha_ref.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h index f381dcfde1cfad..57cbf678b92b3e 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -18,6 +18,8 @@ #include "paddle/phi/backends/dynload/cublasLt.h" #endif +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/dropout_impl_util.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index defc0b8112e7b0..7df7a79fb68667 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h" #include "paddle/phi/kernels/fusion/gpu/fused_residual_dropout_bias.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu index c562720ebee858..f39cc8acdc3102 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc index 10252cb94165a2..e4684f23f54dc4 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_transpose_kernel.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "glog/logging.h" + #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc index 5f49c24ea711ad..0d27a398e449be 100644 --- a/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fused_multi_transformer_xpu_kernel.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/phi/backends/xpu/enforce_xpu.h" + +#include "glog/logging.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/memcpy_kernel.h" #ifdef PADDLE_WITH_XPU_XFT diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index e56c3cf4f42000..04a414fd5848e2 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/activation_grad_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/common/bfloat16.h" diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index 53d31b153d5f8a..23dc6cdfd3398c 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -18,6 +18,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu index 448153ed2196d9..d40fdf392b1a28 100644 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/gpu/adamw_kernel.cu @@ -18,6 +18,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/bfloat16.h" diff --git a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu index 7619c73b3ccb71..f42d13df86a7c0 100644 --- a/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/affine_grid_grad_kernel.cu @@ -16,6 +16,8 @@ #include "paddle/phi/kernels/affine_grid_grad_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index db7f3c3224a03e..c5ddd28e0da788 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index fb1bca3daba86d..1a07e5f0d49098 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -20,6 +20,8 @@ namespace cub = hipcub; #endif +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 84047f14739b55..8440ed2b1222b2 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/place.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu index 130dc570e33986..e2bcfa4d19eb04 100644 --- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/embedding_grad_kernel.h" #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/common/data_type.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 42b5322426f70f..a8f685c0ab0d6e 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h +++ b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/place.h" diff --git a/paddle/phi/kernels/gpu/grid_sample_kernel.cu b/paddle/phi/kernels/gpu/grid_sample_kernel.cu index ff657d9dc46f72..3809ae7d5c3388 100644 --- a/paddle/phi/kernels/gpu/grid_sample_kernel.cu +++ b/paddle/phi/kernels/gpu/grid_sample_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/grid_sample_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index e6d12e5c78414b..f4894fc6d9308c 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/index_add_kernel.h" #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu index 9c99a12fc285ef..2fce40dbe7306a 100644 --- a/paddle/phi/kernels/gpu/index_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_select_grad_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/index_select_grad_kernel.h" #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include 
"paddle/phi/backends/gpu/gpu_primitives.h" diff --git a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu index a121f9fb95b063..0f17a1bcc318a7 100644 --- a/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_grad_kernel.cu @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/instance_norm_kernel.cu b/paddle/phi/kernels/gpu/instance_norm_kernel.cu index d4f421e62ddb92..7f10eac67c67c8 100644 --- a/paddle/phi/kernels/gpu/instance_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/instance_norm_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/instance_norm_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/p_recv_kernel.cu b/paddle/phi/kernels/gpu/p_recv_kernel.cu index 88bb45b61fbac9..5e96c7319d46bf 100644 --- a/paddle/phi/kernels/gpu/p_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/p_recv_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/p_recv_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/phi/kernels/gpu/p_send_kernel.cu b/paddle/phi/kernels/gpu/p_send_kernel.cu index 747af01b260bbe..c2bf4c6888965d 100644 --- a/paddle/phi/kernels/gpu/p_send_kernel.cu +++ b/paddle/phi/kernels/gpu/p_send_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/p_send_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu index d698d78a55cc23..679d7ba3f2e7d5 100644 --- a/paddle/phi/kernels/gpu/prelu_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/prelu_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index 66babdcecbb187..d472b7c31d3172 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/rnn_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/top_k_kernel.cu b/paddle/phi/kernels/gpu/top_k_kernel.cu index e2793955ef9c17..a791812432a06b 100644 --- a/paddle/phi/kernels/gpu/top_k_kernel.cu +++ b/paddle/phi/kernels/gpu/top_k_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/top_k_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h index c5fe47a2431df4..e53d5783048ffd 100644 --- 
a/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h @@ -17,6 +17,8 @@ limitations under the License. */ #include +#include "glog/logging.h" + #include "paddle/phi/backends/dynload/cudnn_frontend.h" #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h index 852b0d77e5fe92..dfea9013ab0b87 100644 --- a/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h +++ b/paddle/phi/kernels/gpudnn/conv_cudnn_v7.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h index 59dc0bef21e943..186bbd75fae62c 100644 --- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h +++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h @@ -20,6 +20,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/kernels/autotune/cache.h" diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index a41144c5ee7361..2c6e898fa25c85 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/conv_grad_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index d5ba3301cdd396..15161dd61c6970 100644 --- a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/conv_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/kernels/gpudnn/conv_miopen_helper.h b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h index 9cdbcc126544ff..be2c09bf8d18a8 100644 --- a/paddle/phi/kernels/gpudnn/conv_miopen_helper.h +++ b/paddle/phi/kernels/gpudnn/conv_miopen_helper.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" namespace phi { diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h index ffe9ac26c69354..8be4d1c39d61aa 100644 --- a/paddle/phi/kernels/impl/activation_grad_impl.h +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/activation_kernel.h" diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index bd775110f3a9e9..a72fb6062ceffe 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -16,6 +16,8 @@ limitations under the License. 
*/ #include +#include "glog/logging.h" + #include "paddle/phi/kernels/addmm_grad_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index 151e8f7420acce..c86cea80e47e8b 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -16,6 +16,8 @@ limitations under the License. */ #include +#include "glog/logging.h" + #include "paddle/phi/kernels/addmm_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h index 248305b7fc0c94..3f463e1d9e0644 100644 --- a/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_grad_kernel_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/determinant_grad_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" diff --git a/paddle/phi/kernels/impl/determinant_kernel_impl.h b/paddle/phi/kernels/impl/determinant_kernel_impl.h index 5c7a16045c54e6..36e47c78c832c1 100644 --- a/paddle/phi/kernels/impl/determinant_kernel_impl.h +++ b/paddle/phi/kernels/impl/determinant_kernel_impl.h @@ -20,6 +20,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/determinant_kernel.h" diff --git a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h index 7e49465ec616bf..6bfe6d65245c76 100644 --- a/paddle/phi/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/dot_grad_kernel_impl.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/complex.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" diff --git a/paddle/phi/kernels/impl/einsum_grad_impl.h b/paddle/phi/kernels/impl/einsum_grad_impl.h index 78d61bcfd6a559..2b11e9eda6626a 100644 --- a/paddle/phi/kernels/impl/einsum_grad_impl.h +++ b/paddle/phi/kernels/impl/einsum_grad_impl.h @@ -13,6 +13,8 @@ // limitations under the License. #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/impl/einsum_impl.h" diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index a7c8ecc2234155..98849ca60d4d74 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/impl/lamb_kernel_impl.h b/paddle/phi/kernels/impl/lamb_kernel_impl.h index 7a0fab69118753..e0850b8aef0d90 100644 --- a/paddle/phi/kernels/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/impl/lamb_kernel_impl.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/lamb_functors.h" diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index b840a803b6d124..885827a36beab1 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/complex_kernel.h" diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index 454263fcb7fa10..acc7affc00e261 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/autotune/cache_base.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index de974131c4648b..40364507e8b2cd 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h +++ b/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h index a9178262d6aea8..932cdfe57dade5 100644 --- a/paddle/phi/kernels/impl/momentum_kernel_impl.h +++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/algorithm.h" diff --git a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h index 02e5323c5b6c00..1292a3af36352e 100644 --- a/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/set_value_grad_kernel_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h b/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h index 9f629ab4bd1610..e7fa5edf9ad4ab 100644 --- a/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/slogdeterminant_grad_kernel_impl.h @@ -14,6 +14,8 @@ #pragma once +#include "glog/logging.h" + #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/elementwise_multiply_kernel.h" diff --git a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h index 6f590f62463a42..6a4f707b898bf0 100644 --- a/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h +++ b/paddle/phi/kernels/impl/slogdeterminant_kernel_impl.h @@ -18,6 +18,8 @@ #include #include +#include "glog/logging.h" + #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" 
diff --git a/paddle/phi/kernels/npu_identity_kernel.cc b/paddle/phi/kernels/npu_identity_kernel.cc index c9fb7a972851a7..9a96c149637e98 100644 --- a/paddle/phi/kernels/npu_identity_kernel.cc +++ b/paddle/phi/kernels/npu_identity_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/npu_identity_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/onednn/transpose_kernel.cc b/paddle/phi/kernels/onednn/transpose_kernel.cc index c9302ba4f80e6b..e76d33fc786f5e 100644 --- a/paddle/phi/kernels/onednn/transpose_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_kernel.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_kernel.h" + +#include "glog/logging.h" + #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc index a9cc821aaaf5f1..40c9fb37c4c764 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" #include "gflags/gflags.h" +#include "glog/logging.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc index d2cd9f06174039..ad1c27df1b399f 100644 --- a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc +++ b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/selected_rows/adamw_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu index 555562cfac5837..084721a721ee56 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu index 0e24b5f71ed2ac..ee7eab855220aa 100644 --- a/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu +++ b/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu @@ -18,6 +18,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/float16.h" diff --git a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h index 4830f39c01dc24..4323a23e0e60cb 100644 --- a/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h +++ b/paddle/phi/kernels/selected_rows/impl/lamb_kernel_impl.h @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once +#include "glog/logging.h" + #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/selected_rows.h" diff --git a/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc index 77ac944167d784..b2a81885b88894 100644 --- a/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc +++ b/paddle/phi/kernels/selected_rows/xpu/adam_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/selected_rows/adam_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/paddle/phi/kernels/share_buffer_kernel.cc b/paddle/phi/kernels/share_buffer_kernel.cc index 2a25495b8d6791..d428de3863780b 100644 --- a/paddle/phi/kernels/share_buffer_kernel.cc +++ b/paddle/phi/kernels/share_buffer_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/share_buffer_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu index fb9d32264b00f6..cfee8d56690bea 100644 --- a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu +++ b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/phi/kernels/strings/strings_copy_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/backends/gpu/gpu_helper.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index ae9c5d3092ec60..df3d0ef4617fdf 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" diff --git a/paddle/phi/kernels/xpu/adam_kernel.cc b/paddle/phi/kernels/xpu/adam_kernel.cc index 3e2a6471c1fbee..f2e9fccc58da37 100644 --- a/paddle/phi/kernels/xpu/adam_kernel.cc +++ b/paddle/phi/kernels/xpu/adam_kernel.cc @@ -14,6 +14,8 @@ #include "paddle/phi/kernels/adam_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" diff --git a/paddle/phi/kernels/xpu/adamw_kernel.cc b/paddle/phi/kernels/xpu/adamw_kernel.cc index 9258348117786c..a74fd2aa9bd4e9 100644 --- a/paddle/phi/kernels/xpu/adamw_kernel.cc +++ b/paddle/phi/kernels/xpu/adamw_kernel.cc @@ -16,6 +16,8 @@ #include +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/xpu/amp_kernel.cc b/paddle/phi/kernels/xpu/amp_kernel.cc index 52f81a4f828467..0c00baf9170d53 100644 --- a/paddle/phi/kernels/xpu/amp_kernel.cc +++ b/paddle/phi/kernels/xpu/amp_kernel.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc index affc6b0fe94f75..d7e1ed8114e008 100644 --- a/paddle/phi/kernels/xpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/set_value_grad_kernel.cc @@ -14,12 +14,13 @@ #include "paddle/phi/kernels/set_value_grad_kernel.h" +#include "glog/logging.h" + #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - #include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/test/cpp/phi/core/test_sparse_csr_tensor.cc b/test/cpp/phi/core/test_sparse_csr_tensor.cc index 001c4bd7c7ed65..56f671a7fc7e9e 100644 --- a/test/cpp/phi/core/test_sparse_csr_tensor.cc +++ b/test/cpp/phi/core/test_sparse_csr_tensor.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/test/cpp/phi/core/test_string_tensor.cc b/test/cpp/phi/core/test_string_tensor.cc index 0883e5b671f28e..fe0154e4e37654 100644 --- a/test/cpp/phi/core/test_string_tensor.cc +++ b/test/cpp/phi/core/test_string_tensor.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/context_pool.h" diff --git a/test/cpp/phi/core/test_tensor_array.cc b/test/cpp/phi/core/test_tensor_array.cc index 87cadafdca6d79..201790a7bc0e10 100644 --- a/test/cpp/phi/core/test_tensor_array.cc +++ b/test/cpp/phi/core/test_tensor_array.cc @@ -86,7 +86,7 @@ TEST(tensor_array, tensor_array_not_init) { EXPECT_TRUE(ex_msg.find("valid") != std::string::npos); } - CHECK_EQ(tensor_array.initialized(), false); + EXPECT_TRUE(!tensor_array.initialized()); } TEST(tensor_array, tensor_array_init) { @@ -115,7 +115,7 @@ TEST(tensor_array, tensor_array_init) { TensorArray tensor_array(tensors); tensor_array.AllocateFrom(alloc, DataType::INT8); - CHECK_EQ(tensor_array.initialized(), true); + EXPECT_TRUE(tensor_array.initialized()); } } // namespace tests diff --git a/test/cpp/phi/kernels/test_strings_lower_upper_dev_api.cc b/test/cpp/phi/kernels/test_strings_lower_upper_dev_api.cc index 2f056182e44125..2a7aa5cdaf71fd 100644 --- a/test/cpp/phi/kernels/test_strings_lower_upper_dev_api.cc +++ b/test/cpp/phi/kernels/test_strings_lower_upper_dev_api.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include #include +#include "glog/logging.h" + #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/data_type.h" From 9bdf7f02a24d8fff630b6fe05147d4d9bade1b07 Mon Sep 17 00:00:00 2001 From: Sanbu <96160062+sanbuphy@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:20:12 +0800 Subject: [PATCH 124/156] Support static graph code-gen for yolo_box (#52714) * Support static graph code-gen for yolo_box * Support static graph code-gen for yolo_box * Support static graph code-gen for yolo_box * Update op_compat.yaml * fix * fix --- .../fluid/operators/detection/CMakeLists.txt | 2 - .../fluid/operators/detection/yolo_box_op.cc | 269 ------------------ paddle/phi/api/yaml/legacy_ops.yaml | 9 - paddle/phi/api/yaml/op_compat.yaml | 6 + paddle/phi/api/yaml/op_version.yaml | 11 + paddle/phi/api/yaml/ops.yaml | 9 + paddle/phi/ops/compat/yolo_box_sig.cc | 35 --- 7 files changed, 26 insertions(+), 315 deletions(-) delete mode 100644 paddle/fluid/operators/detection/yolo_box_op.cc delete mode 100644 paddle/phi/ops/compat/yolo_box_sig.cc diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 7228641caab33d..6b4a6061cbb289 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -36,13 +36,11 @@ if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) detection_library(prior_box_op SRCS prior_box_op.cc) - detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) detection_library(prior_box_op SRCS prior_box_op.cc) - detection_library(yolo_box_op SRCS yolo_box_op.cc) # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) endif() diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc deleted file mode 100644 index a60f42de66a68f..00000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class YoloBoxOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "YoloBoxOp"); - OP_INOUT_CHECK(ctx->HasInput("ImgSize"), "Input", "ImgSize", "YoloBoxOp"); - OP_INOUT_CHECK(ctx->HasOutput("Boxes"), "Output", "Boxes", "YoloBoxOp"); - OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", "YoloBoxOp"); - - auto dim_x = ctx->GetInputDim("X"); - auto dim_imgsize = ctx->GetInputDim("ImgSize"); - auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors"); - int anchor_num = anchors.size() / 2; - auto class_num = ctx->Attrs().Get<int>("class_num"); - auto iou_aware = ctx->Attrs().Get<bool>("iou_aware"); - auto iou_aware_factor = ctx->Attrs().Get<float>("iou_aware_factor"); - - PADDLE_ENFORCE_EQ( - dim_x.size(), - 4, - platform::errors::InvalidArgument("Input(X) should be a 4-D tensor." - "But received X dimension(%s)", - dim_x.size())); - if (iou_aware) { - PADDLE_ENFORCE_EQ( - dim_x[1], - anchor_num * (6 + class_num), - platform::errors::InvalidArgument( - "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " - "+ class_num)) while iou_aware is true." - "But received dim[1](%s) != (anchor_mask_number * " - "(6+class_num)(%s).", - dim_x[1], - anchor_num * (6 + class_num))); - PADDLE_ENFORCE_GE( - iou_aware_factor, - 0, - platform::errors::InvalidArgument( - "Attr(iou_aware_factor) should be greater than or equal to 0." - "But received iou_aware_factor (%s)", - iou_aware_factor)); - PADDLE_ENFORCE_LE( - iou_aware_factor, - 1, - platform::errors::InvalidArgument( - "Attr(iou_aware_factor) should be less than or equal to 1." - "But received iou_aware_factor (%s)", - iou_aware_factor)); - } else { - PADDLE_ENFORCE_EQ( - dim_x[1], - anchor_num * (5 + class_num), - platform::errors::InvalidArgument( - "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " - "+ class_num))." - "But received dim[1](%s) != (anchor_mask_number * " - "(5+class_num)(%s).", - dim_x[1], - anchor_num * (5 + class_num))); - } - PADDLE_ENFORCE_EQ(dim_imgsize.size(), - 2, - platform::errors::InvalidArgument( - "Input(ImgSize) should be a 2-D tensor." - "But received ImgSize dimension(%s)", - dim_imgsize.size())); - if ((dim_imgsize[0] > 0 && dim_x[0] > 0) || ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - dim_imgsize[0], - dim_x[0], - platform::errors::InvalidArgument( - "Input(ImgSize) dim[0] and Input(X) dim[0] should be the same.")); - } - PADDLE_ENFORCE_EQ( - dim_imgsize[1], - 2, - platform::errors::InvalidArgument("Input(ImgSize) dim[1] should be 2." - "But received ImgSize dim[1](%s).", - dim_imgsize[1])); - PADDLE_ENFORCE_GT(anchors.size(), - 0, - platform::errors::InvalidArgument( - "Attr(anchors) length should be greater than 0." - "But received anchors length(%s).", - anchors.size())); - PADDLE_ENFORCE_EQ(anchors.size() % 2, - 0, - platform::errors::InvalidArgument( - "Attr(anchors) length should be an even integer." - "But received anchors length (%s)", - anchors.size())); - PADDLE_ENFORCE_GT(class_num, - 0, - platform::errors::InvalidArgument( - "Attr(class_num) should be an integer greater than 0."
- "But received class_num (%s)", - class_num)); - - int box_num; - if ((dim_x[2] > 0 && dim_x[3] > 0) || ctx->IsRuntime()) { - box_num = dim_x[2] * dim_x[3] * anchor_num; - } else { - box_num = -1; - } - std::vector dim_boxes({dim_x[0], box_num, 4}); - ctx->SetOutputDim("Boxes", phi::make_ddim(dim_boxes)); - - std::vector dim_scores({dim_x[0], box_num, class_num}); - ctx->SetOutputDim("Scores", phi::make_ddim(dim_scores)); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "The input tensor of YoloBox operator is a 4-D tensor with " - "shape of [N, C, H, W]. The second dimension(C) stores " - "box locations, confidence score and classification one-hot " - "keys of each anchor box. Generally, X should be the output " - "of YOLOv3 network."); - AddInput("ImgSize", - "The image size tensor of YoloBox operator, " - "This is a 2-D tensor with shape of [N, 2]. This tensor holds " - "height and width of each input image used for resizing output " - "box in input image scale."); - AddOutput("Boxes", - "The output tensor of detection boxes of YoloBox operator, " - "This is a 3-D tensor with shape of [N, M, 4], N is the " - "batch num, M is output box number, and the 3rd dimension " - "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); - AddOutput("Scores", - "The output tensor of detection boxes scores of YoloBox " - "operator, This is a 3-D tensor with shape of " - "[N, M, :attr:`class_num`], N is the batch num, M is " - "output box number."); - - AddAttr("class_num", "The number of classes to predict."); - AddAttr>("anchors", - "The anchor width and height, " - "it will be parsed pair by pair.") - .SetDefault(std::vector{}); - AddAttr("downsample_ratio", - "The downsample ratio from network input to YoloBox operator " - "input, so 32, 16, 8 should be set for the first, second, " - "and thrid YoloBox operators.") - .SetDefault(32); - AddAttr("conf_thresh", - "The confidence scores threshold of detection boxes. " - "Boxes with confidence scores under threshold should " - "be ignored.") - .SetDefault(0.01); - AddAttr("clip_bbox", - "Whether clip output bonding box in Input(ImgSize) " - "boundary. Default true.") - .SetDefault(true); - AddAttr("scale_x_y", - "Scale the center point of decoded bounding " - "box. Default 1.0") - .SetDefault(1.); - AddAttr("iou_aware", "Whether use iou aware. Default false.") - .SetDefault(false); - AddAttr("iou_aware_factor", "iou aware factor. Default 0.5.") - .SetDefault(0.5); - AddComment(R"DOC( - This operator generates YOLO detection boxes from output of YOLOv3 network. - - The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict - given number boxes, this given number, which following will be represented as S, - is specified by the number of anchors. In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, - otherwise C should be equal to S * (6 + class_num). class_num is the object - category number of source dataset(such as 80 in coco dataset), so the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, - also includes confidence score of the box and class one-hot key of each anchor - box. 
- - Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box - predictions should be as follows: - - $$ - b_x = \\sigma(t_x) + c_x - $$ - $$ - b_y = \\sigma(t_y) + c_y - $$ - $$ - b_w = p_w e^{t_w} - $$ - $$ - b_h = p_h e^{t_h} - $$ - - in the equation above, :math:`c_x, c_y` is the left top corner of current grid - and :math:`p_w, p_h` is specified by anchors. - - The logistic regression value of the 5th channel of each anchor prediction boxes - represents the confidence score of each prediction box, and the logistic - regression value of the last :attr:`class_num` channels of each anchor prediction - boxes represents the classifcation scores. Boxes with confidence scores less than - :attr:`conf_thresh` should be ignored, and box final scores is the product of - confidence scores and classification scores. - - $$ - score_{pred} = score_{conf} * score_{class} - $$ - - where the confidence scores follow the formula bellow - - .. math:: - - score_{conf} = \begin{case} - obj, \text{if } iou_aware == false \\ - obj^{1 - iou_aware_factor} * iou^{iou_aware_factor}, \text{otherwise} - \end{case} - - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(yolo_box, - YoloBoxInferShapeFunctor, - PD_INFER_META(phi::YoloBoxInferMeta)); -REGISTER_OPERATOR( - yolo_box, - ops::YoloBoxOp, - ops::YoloBoxOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - YoloBoxInferShapeFunctor); - -REGISTER_OP_VERSION(yolo_box).AddCheckpoint( - R"ROC( - Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. - )ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("iou_aware", "Whether use iou aware", false) - .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index cd499a2d049e72..47dc127162c61f 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1384,15 +1384,6 @@ data_type: x backward: unpool3d_grad -- op : yolo_box - args : (Tensor x, Tensor img_size, int[] anchors, int class_num, float conf_thresh, int downsample_ratio, bool clip_bbox, float scale_x_y=1.0, bool iou_aware=false, float iou_aware_factor=0.5) - output : Tensor(boxes), Tensor(scores) - infer_meta : - func : YoloBoxInferMeta - kernel : - func : yolo_box - data_type : x - - op : yolo_loss args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) output : Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 655bfe546b6d39..fecca3a4b85857 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2328,6 +2328,12 @@ extra : attrs : ['str[] skip_eager_deletion_vars = {}'] +- op : yolo_box + inputs : + {x : X, img_size : ImgSize} + outputs : + {boxes : Boxes, scores : Scores} + - op: sigmoid_cross_entropy_with_logits backward: sigmoid_cross_entropy_with_logits_grad inputs : diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index e8bdbb28259f9e..91ab3dfb1eb057 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -250,3 +250,14 @@ - add_attr : axis comment : The axis to apply unique. If None, the input will be flattened. 
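The decoding math quoted in the deleted comment above translates directly into a few lines of NumPy. The sketch below is an editorial illustration, not anything shipped with Paddle: the anchors, grid size, and class count are made-up values. It shows the channel-count rule that InferShape enforces (C = S * (5 + class_num) when iou_aware is false) together with the sigma/exp decoding; the real kernel additionally normalizes by the grid size and rescales boxes to ImgSize.

import numpy as np

def sigmoid(t):
    return 1.0 / (1.0 + np.exp(-t))

class_num = 80                                 # e.g. COCO
anchors = [(116, 90), (156, 198), (373, 326)]  # illustrative values only
S = len(anchors)                               # boxes predicted per grid point
C = S * (5 + class_num)                        # the dim_x[1] check in InferShape
H = W = 13                                     # grid for downsample_ratio = 32

x = np.random.randn(1, C, H, W).astype(np.float32)
pred = x.reshape(1, S, 5 + class_num, H, W)

cy, cx = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")
tx, ty, tw, th = (pred[0, :, i] for i in range(4))
pw = np.array([a[0] for a in anchors], np.float32).reshape(S, 1, 1)
ph = np.array([a[1] for a in anchors], np.float32).reshape(S, 1, 1)

bx = sigmoid(tx) + cx        # b_x = sigma(t_x) + c_x
by = sigmoid(ty) + cy        # b_y = sigma(t_y) + c_y
bw = pw * np.exp(tw)         # b_w = p_w * e^{t_w}
bh = ph * np.exp(th)         # b_h = p_h * e^{t_h}

conf = sigmoid(pred[0, :, 4])   # objectness per anchor box
cls = sigmoid(pred[0, :, 5:])   # per-class probabilities
score = conf[:, None] * cls     # score_pred = score_conf * score_class
print(bx.shape, score.shape)    # (3, 13, 13) (3, 80, 13, 13)

With iou_aware=true each anchor carries one extra channel (hence the 6 + class_num branch of the shape check), and the confidence term becomes obj^(1 - iou_aware_factor) * iou^iou_aware_factor before the class product.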
default : std::vector{} + +- op : yolo_box + version : + - checkpoint : Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. + action : + - add_attr : iou_aware + comment : Whether use iou aware. + default : "false" + - add_attr : iou_aware_factor + comment : iou aware factor. + default : 0.5f diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 2b2f30f1c63516..c839a8cfeefcad 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2028,3 +2028,12 @@ kernel : func : where backward : where_grad + +- op : yolo_box + args : (Tensor x, Tensor img_size, int[] anchors={}, int class_num = 1, float conf_thresh = 0.01, int downsample_ratio = 32, bool clip_bbox = true, float scale_x_y=1.0, bool iou_aware=false, float iou_aware_factor=0.5) + output : Tensor(boxes), Tensor(scores) + infer_meta : + func : YoloBoxInferMeta + kernel : + func : yolo_box + data_type : x diff --git a/paddle/phi/ops/compat/yolo_box_sig.cc b/paddle/phi/ops/compat/yolo_box_sig.cc deleted file mode 100644 index bb39e72a64f507..00000000000000 --- a/paddle/phi/ops/compat/yolo_box_sig.cc +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature YoloBoxOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("yolo_box", - {"X", "ImgSize"}, - {"anchors", - "class_num", - "conf_thresh", - "downsample_ratio", - "clip_bbox", - "scale_x_y", - "iou_aware", - "iou_aware_factor"}, - {"Boxes", "Scores"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(yolo_box, phi::YoloBoxOpArgumentMapping); From a7070f345a38d41d054486f470abb1cde7422d46 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 13 Apr 2023 14:22:51 +0800 Subject: [PATCH 125/156] add autogen code support for expand_as op (#52797) * add autogen code support for expand_as op * bug fixed --- paddle/fluid/operators/expand_as_v2_op.cc | 124 ---------------------- paddle/phi/api/yaml/backward.yaml | 11 ++ paddle/phi/api/yaml/legacy_backward.yaml | 11 -- paddle/phi/api/yaml/legacy_ops.yaml | 10 -- paddle/phi/api/yaml/op_compat.yaml | 4 +- paddle/phi/api/yaml/op_version.yaml | 7 ++ paddle/phi/api/yaml/ops.yaml | 11 ++ paddle/phi/ops/compat/expand_as_sig.cc | 36 ------- 8 files changed, 31 insertions(+), 183 deletions(-) delete mode 100644 paddle/fluid/operators/expand_as_v2_op.cc delete mode 100644 paddle/phi/ops/compat/expand_as_sig.cc diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc deleted file mode 100644 index 5e0f98c3eedb37..00000000000000 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/expand_as_v2_op.h"
-
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/phi/infermeta/binary.h"
-
-namespace paddle {
-namespace operators {
-
-class ExpandAsV2Op : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.device_context().GetPlace());
-  }
-};
-
-class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
-             "X is the input to be expanded.");
-    AddInput("Y",
-             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
-             "Expand X according to the shape of Y.")
-        .AsDispensable();
-    AddOutput("Out",
-              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
-              "The rank of Output(Out) have the same with Input(X). "
-              "After expanding, size of each dimension of Output(Out) is equal "
-              "to size of the corresponding dimension of Input(X) multiplying "
-              "the corresponding value given by Attr(expand_times).");
-    AddAttr<std::vector<int>>("target_shape",
-                              "Expand shape for each dimension.")
-        .SetDefault({});
-    AddComment(R"DOC(
-Expand the input to the given shape.
-)DOC");
-  }
-};
-
-class ExpandAsV2GradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ExpandAsV2Grad");
-    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input",
-                   framework::GradVarName("Out"),
-                   "ExpandAsV2Grad");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto x_grad_name = framework::GradVarName("X");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-  }
-
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.device_context().GetPlace());
-  }
-};
-
-template <typename T>
-class ExpandAsV2GradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("expand_as_v2_grad");
-    op->SetInput("X", this->Input("X"));
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandAsV2GradNoNeedBufVarsInferer, "X");
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(expand_as_v2,
-                            ExpandAsInferShapeFunctor,
-                            PD_INFER_META(phi::ExpandAsInferMeta));
-REGISTER_OPERATOR(expand_as_v2,
-                  ops::ExpandAsV2Op,
-                  ops::ExpandAsV2OpMaker,
-                  ops::ExpandAsV2GradOpMaker<paddle::framework::OpDesc>,
-                  ops::ExpandAsV2GradOpMaker<paddle::imperative::OpBase>,
-                  ExpandAsInferShapeFunctor);
-REGISTER_OPERATOR(expand_as_v2_grad,
-                  ops::ExpandAsV2GradOp,
-                  ops::ExpandAsV2GradNoNeedBufVarsInferer);
-
-REGISTER_OP_VERSION(expand_as_v2)
-    .AddCheckpoint(R"ROC(fix expand_as_v2 and add new input [Y])ROC",
-                   paddle::framework::compatible::OpVersionDesc().NewInput(
-                       "Y", "Expand X according to the shape of Y"));
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index cf8fd4de67774f..6415dc4fc8cdcf 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -543,6 +543,17 @@
   inplace : (out_grad -> x_grad)
   composite : exp_grad(out, out_grad, x_grad)
 
+- backward_op : expand_as_grad
+  forward : expand_as (Tensor x, Tensor y, int[] target_shape = {}) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad, int[] target_shape)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : expand_as_grad
+  no_need_buffer : x
+
 - backward_op : expm1_grad
   forward : expm1 (Tensor x) -> Tensor(out)
   args : (Tensor out, Tensor out_grad)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 3a67b3e4a3e463..178e8fcda94f2c 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -348,17 +348,6 @@
   invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad)
   no_need_buffer : weight
 
-- backward_op : expand_as_grad
-  forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, int[] target_shape)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param : [x]
-  kernel :
-    func : expand_as_grad
-  no_need_buffer : x
-
 - backward_op : expand_double_grad
   forward : expand_grad
(Tensor x, Tensor grad_out, IntArray shape) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray shape) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 47dc127162c61f..9ffbaa07bd076f 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -435,16 +435,6 @@ func : expand backward : expand_grad -- op : expand_as - args : (Tensor x, Tensor y, int[] target_shape) - output : Tensor(out) - infer_meta : - func : ExpandAsInferMeta - kernel : - func : expand_as - optional : y - backward : expand_as_grad - - op : exponential_ args : (Tensor x, float lam) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index fecca3a4b85857..471e257a6afdcb 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -740,9 +740,9 @@ attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : expand_as (expand_as_v2) + backward : expand_as_grad (expand_as_v2_grad) inputs : - x : X - y : Y + {x : X, y : Y} outputs : out : Out diff --git a/paddle/phi/api/yaml/op_version.yaml b/paddle/phi/api/yaml/op_version.yaml index 91ab3dfb1eb057..40790d25c02410 100644 --- a/paddle/phi/api/yaml/op_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -66,6 +66,13 @@ comment : In order to force fill output variable to gpu memory. default : "false" +- op : expand_as_v2 + version : + - checkpoint : fix expand_as_v2 and add new input [Y]. + action : + - add_input : Y + comment : Expand X according to the shape of Y. + - op : flip version : - checkpoint : Upgrade flip, add new attr [axis] and delete attr [dims] diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index c839a8cfeefcad..328b0211450bf6 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -608,6 +608,17 @@ inplace : (x -> out) backward : exp_grad +- op : expand_as + args : (Tensor x, Tensor y, int[] target_shape = {}) + output : Tensor(out) + infer_meta : + func : ExpandAsInferMeta + kernel : + func : expand_as + data_type : x + optional : y + backward : expand_as_grad + - op : expm1 args : (Tensor x) output : Tensor diff --git a/paddle/phi/ops/compat/expand_as_sig.cc b/paddle/phi/ops/compat/expand_as_sig.cc deleted file mode 100644 index 03b308f4a8b1db..00000000000000 --- a/paddle/phi/ops/compat/expand_as_sig.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
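The yaml entries above describe what is, numerically, a plain broadcast: forward expands x to y's shape, and backward reduces the upstream gradient back onto x's shape — which is why ExpandAsV2GradOp sets x_grad to x's dims and why the new backward.yaml entry can mark x as no_need_buffer (only its shape is read, never its values). A rough NumPy model of the pair, assuming x's shape is broadcast-compatible with y's:

import numpy as np

x = np.random.rand(3, 1).astype(np.float32)
y = np.random.rand(2, 3, 4).astype(np.float32)

out = np.broadcast_to(x, y.shape)        # forward: expand x to y's shape

grad_out = np.ones_like(out)
# backward: sum over every axis that was added or stretched, so the
# gradient comes back with exactly x's dims.
lead = out.ndim - x.ndim
axes = tuple(range(lead)) + tuple(
    lead + i for i, d in enumerate(x.shape) if d == 1 and out.shape[lead + i] > 1
)
x_grad = grad_out.sum(axis=axes).reshape(x.shape)
assert x_grad.shape == x.shape           # mirrors SetOutputDim(x_grad, x_dims)
print(out.shape, x_grad.shape)           # (2, 3, 4) (3, 1)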
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature ExpandAsOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("expand_as", {"X", "Y"}, {"target_shape"}, {"Out"}); -} - -KernelSignature ExpandAsGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "expand_as_grad", {"X", "Out@GRAD"}, {"target_shape"}, {"X@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2, expand_as); -PD_REGISTER_BASE_KERNEL_NAME(expand_as_v2_grad, expand_as_grad); - -PD_REGISTER_ARG_MAPPING_FN(expand_as_v2, phi::ExpandAsOpArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(expand_as_v2_grad, - phi::ExpandAsGradOpArgumentMapping); From 2cff9839915426d12375f6db851b31d01574137d Mon Sep 17 00:00:00 2001 From: Vvsmile <17864154871@163.com> Date: Thu, 13 Apr 2023 14:23:54 +0800 Subject: [PATCH 126/156] [AMP OP&Test] adjust test_elementwise_sub's tolerance, max_relative_error of grad and (#50953) * adjust test_elementwise_sub's tolerance, max_relative_error of grad and atol/rtol of output to 1e-3 * fix the dtype in setUp * fix the elementwise_sub optest * modify elementwise_sub optest * fix and add bf16/fp16 to elementwise_sub * fix elementwise_sub bugs * fix bugs * fix elementwise_sub op * fix the data type * fix elementwise_sub * fix elementwise * fix elementwise_sub * fix bugs * fix elementwise sub * fix elementwise_sub * remove scalar and vector --- .../unittests/test_elementwise_sub_op.py | 531 +++++++++++++++--- 1 file changed, 460 insertions(+), 71 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 1034e5fb59c65a..b979fa339de947 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -21,6 +21,7 @@ import paddle from paddle import fluid +from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper @@ -30,14 +31,18 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), - 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() self.if_enable_cinn() + def init_dtype(self): + self.dtype = np.float64 + def test_check_output(self): self.check_output() @@ -66,7 +71,55 @@ def if_check_prim(self): self.check_prim = True def if_enable_cinn(self): - pass + self.enable_cinn = False + + +class TestElementwiseFP16OP(TestElementwiseOp): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP(TestElementwiseOp): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': 
convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + self.if_enable_cinn() + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.1 + ) + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', no_grad_set=set("X"), max_relative_error=0.1 + ) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', no_grad_set=set('Y'), max_relative_error=0.1 + ) class TestElementwiseSubOp_ZeroDim1(TestElementwiseOp): @@ -75,19 +128,45 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.uniform(0.1, 1, []).astype("float64"), - 'Y': np.random.uniform(0.1, 1, []).astype("float64"), + 'X': np.random.uniform(0.1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, []).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() self.if_enable_cinn() - def if_check_prim(self): - self.check_prim = True - def if_enable_cinn(self): - self.enable_cinn = False +class TestElementwiseSubFP16OP_ZeroDim1(TestElementwiseSubOp_ZeroDim1): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseSubBF16OP_ZeroDim1(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.uniform(0.1, 1, []).astype(np.float32), + 'Y': np.random.uniform(0.1, 1, []).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + self.if_enable_cinn() class TestElementwiseSubOp_ZeroDim2(TestElementwiseOp): @@ -96,19 +175,45 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), - 'Y': np.random.uniform(0.1, 1, []).astype("float64"), + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, []).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() self.if_enable_cinn() - def if_check_prim(self): - self.check_prim = True - def if_enable_cinn(self): - self.enable_cinn = False +class TestElementwiseSubFP16OP_ZeroDim2(TestElementwiseSubOp_ZeroDim2): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseSubBF16OP_ZeroDim2(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + 
self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32), + 'Y': np.random.uniform(0.1, 1, []).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + self.if_enable_cinn() class TestElementwiseSubOp_ZeroDim3(TestElementwiseOp): @@ -117,21 +222,52 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.uniform(0.1, 1, []).astype("float64"), - 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float64"), + 'X': np.random.uniform(0.1, 1, []).astype(self.dtype), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() self.if_enable_cinn() - def if_check_prim(self): - self.check_prim = True - def if_enable_cinn(self): - self.enable_cinn = False +class TestElementwiseSubFP16OP_ZeroDim3(TestElementwiseSubOp_ZeroDim3): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_ZeroDim3(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.uniform(0.1, 1, []).astype(np.float32), + 'Y': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + self.if_enable_cinn() +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) class TestBF16ElementwiseOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" @@ -151,6 +287,12 @@ def setUp(self): self.if_check_prim() self.if_enable_cinn() + def if_check_prim(self): + self.check_prim = True + + def if_enable_cinn(self): + self.enable_cinn = False + def test_check_output(self): self.check_output() @@ -162,17 +304,6 @@ def test_check_grad_ingore_x(self): ['Y'], 'Out', no_grad_set=set("X"), check_prim=self.check_prim ) - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', no_grad_set=set('Y'), check_prim=self.check_prim - ) - - def if_check_prim(self): - self.check_prim = True - - def if_enable_cinn(self): - self.enable_cinn = False - @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast." 
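A note on the uint16 dtype these bf16 cases keep setting: bfloat16 values are stored as the top 16 bits of a float32's bit pattern, which is what convert_float_to_uint16 produces. A truncating sketch of that conversion — the real helper in eager_op_test may round to nearest even rather than truncate:

import numpy as np

def float32_to_bf16_bits(a):
    a = np.ascontiguousarray(a, dtype=np.float32)
    return (a.view(np.uint32) >> 16).astype(np.uint16)   # keep the top half

def bf16_bits_to_float32(b):
    return (b.astype(np.uint32) << 16).view(np.float32)  # zero-pad the mantissa

x = np.random.uniform(0.1, 1, [2, 3]).astype(np.float32)
rel_err = np.abs(bf16_bits_to_float32(float32_to_bf16_bits(x)) - x) / x
print(rel_err.max())   # at most ~2**-8, comfortably inside max_relative_error=0.1

With only 8 mantissa bits, a bf16 round trip keeps roughly two to three significant decimal digits, which is why these tests loosen max_relative_error to 0.1 rather than keeping the float64 defaults.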
@@ -183,9 +314,10 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(10, 3, 4).astype(np.float64), - 'Y': np.random.rand(1).astype(np.float64), + 'X': np.random.rand(10, 3, 4).astype(self.dtype), + 'Y': np.random.rand(1).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() @@ -197,21 +329,23 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.random((100,)).astype("float64"), - 'Y': np.random.random((100,)).astype("float64"), + 'X': np.random.random((100,)).astype(self.dtype), + 'Y': np.random.random((100,)).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() -class TestElementwiseSubOp_broadcast_O(TestElementwiseOp): +class TestElementwiseSubOp_broadcast_0(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" self.python_api = paddle.subtract + self.init_dtype() self.inputs = { - 'X': np.random.rand(100, 3, 2).astype(np.float64), - 'Y': np.random.rand(100).astype(np.float64), + 'X': np.random.rand(100, 3, 2).astype(self.dtype), + 'Y': np.random.rand(100).astype(self.dtype), } self.attrs = {'axis': 0} @@ -244,13 +378,66 @@ def test_check_grad_ingore_y(self): ) -class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp_broadcast_O): +class TestElementwiseSubFP16OP_broadcast_0(TestElementwiseSubOp_broadcast_0): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_broadcast_0(TestElementwiseBF16OP): def setUp(self): self.op_type = "elementwise_sub" + self.dtype = np.uint16 self.python_api = paddle.subtract self.inputs = { - 'X': np.random.rand(2, 100, 3).astype(np.float64), - 'Y': np.random.rand(100).astype(np.float64), + 'X': np.random.rand(100, 3, 2).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32), + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(100, 1, 1) + } + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.attrs = {'axis': 0} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_dygraph=False + ) + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', no_grad_set=set("X"), check_dygraph=False + ) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', no_grad_set=set('Y'), check_dygraph=False + ) + + +class TestElementwiseSubOp_broadcast_1(TestElementwiseSubOp_broadcast_0): + def setUp(self): + self.op_type = "elementwise_sub" + self.python_api = paddle.subtract + self.init_dtype() + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(self.dtype), + 'Y': np.random.rand(100).astype(self.dtype), } self.attrs = {'axis': 1} @@ -259,15 +446,46 @@ def setUp(self): } +class 
TestElementwiseSubFP16OP_broadcast_1(TestElementwiseSubOp_broadcast_1): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_broadcast_1(TestElementwiseBF16OP_broadcast_0): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32), + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 100, 1) + } + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.attrs = {'axis': 1} + + class TestElementwiseSubOp_broadcast_2(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(2, 3, 100).astype(np.float64), - 'Y': np.random.rand(100).astype(np.float64), + 'X': np.random.rand(2, 3, 100).astype(self.dtype), + 'Y': np.random.rand(100).astype(self.dtype), } self.outputs = { @@ -275,17 +493,70 @@ def setUp(self): } self.if_check_prim() - def if_check_prim(self): - self.check_prim = True + +class TestElementwiseSubFP16OP_broadcast_2(TestElementwiseSubOp_broadcast_2): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_broadcast_2(TestElementwiseBF16OP_broadcast_0): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32), + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 100) + } + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_broadcast_3(TestElementwiseBF16OP_broadcast_0): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32), + } + self.outputs = { + 'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 10, 12, 1) + } + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.attrs = {'axis': 1} -class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp_broadcast_O): +class TestElementwiseSubOp_broadcast_3(TestElementwiseSubOp_broadcast_0): def setUp(self): self.op_type = "elementwise_sub" self.python_api = paddle.subtract + self.init_dtype() self.inputs = { - 'X': np.random.rand(2, 10, 12, 
3).astype(np.float64), - 'Y': np.random.rand(10, 12).astype(np.float64), + 'X': np.random.rand(2, 10, 12, 3).astype(self.dtype), + 'Y': np.random.rand(10, 12).astype(self.dtype), } self.attrs = {'axis': 1} @@ -294,21 +565,52 @@ def setUp(self): } +class TestElementwiseSubFP16OP_broadcast_3(TestElementwiseSubOp_broadcast_3): + def init_dtype(self): + self.dtype = np.float16 + + class TestElementwiseSubOp_broadcast_4(TestElementwiseOp): def setUp(self): self.op_type = "elementwise_sub" self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(2, 5, 3, 12).astype(np.float64), - 'Y': np.random.rand(2, 5, 1, 12).astype(np.float64), + 'X': np.random.rand(2, 5, 3, 12).astype(self.dtype), + 'Y': np.random.rand(2, 5, 1, 12).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() - def if_check_prim(self): - self.check_prim = True + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_broadcast_4(TestElementwiseBF16OP_broadcast_0): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.inputs = { + 'X': np.random.rand(2, 5, 3, 12).astype(np.float32), + 'Y': np.random.rand(2, 5, 1, 12).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() + + +class TestElementwiseSubFP16OP_broadcast_4(TestElementwiseSubOp_broadcast_4): + def init_dtype(self): + self.dtype = np.float16 class TestElementwiseSubOp_commonuse_1(TestElementwiseOp): @@ -317,15 +619,43 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(2, 3, 100).astype(np.float64), - 'Y': np.random.rand(1, 1, 100).astype(np.float64), + 'X': np.random.rand(2, 3, 100).astype(self.dtype), + 'Y': np.random.rand(1, 1, 100).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() - def if_check_prim(self): - self.check_prim = True + +class TestElementwiseSubFP16OP_commonuse_1(TestElementwiseSubOp_commonuse_1): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_commonuse_1(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() class TestElementwiseSubOp_commonuse_2(TestElementwiseOp): @@ -334,15 +664,43 @@ def setUp(self): 
self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(10, 3, 1, 4).astype(np.float64), - 'Y': np.random.rand(10, 1, 12, 1).astype(np.float64), + 'X': np.random.rand(10, 3, 1, 4).astype(self.dtype), + 'Y': np.random.rand(10, 1, 12, 1).astype(self.dtype), } self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} self.if_check_prim() - def if_check_prim(self): - self.check_prim = True + +class TestElementwiseSubFP16OP_commonuse_2(TestElementwiseSubOp_commonuse_2): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_commonuse_2(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.rand(10, 3, 1, 4).astype(np.float32), + 'Y': np.random.rand(10, 1, 12, 1).astype(np.float32), + } + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() class TestElementwiseSubOp_xsize_lessthan_ysize(TestElementwiseOp): @@ -351,9 +709,10 @@ def setUp(self): self.python_api = paddle.subtract self.public_python_api = paddle.subtract self.prim_op_type = "prim" + self.init_dtype() self.inputs = { - 'X': np.random.rand(10, 12).astype(np.float64), - 'Y': np.random.rand(2, 3, 10, 12).astype(np.float64), + 'X': np.random.rand(10, 12).astype(self.dtype), + 'Y': np.random.rand(2, 3, 10, 12).astype(self.dtype), } self.attrs = {'axis': 2} @@ -362,8 +721,38 @@ def setUp(self): } self.if_check_prim() - def if_check_prim(self): - self.check_prim = True + +class TestElementwiseSubFP16OP_xsize_lessthan_ysize( + TestElementwiseSubOp_xsize_lessthan_ysize +): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class TestElementwiseBF16OP_xsize_lessthan_ysize(TestElementwiseBF16OP): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + self.python_api = paddle.subtract + self.public_python_api = paddle.subtract + self.prim_op_type = "prim" + self.inputs = { + 'X': np.random.rand(10, 12).astype(np.float32), + 'Y': np.random.rand(2, 3, 10, 12).astype(np.float32), + } + self.attrs = {'axis': 2} + self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']} + self.inputs = { + 'X': convert_float_to_uint16(self.inputs['X']), + 'Y': convert_float_to_uint16(self.inputs['Y']), + } + self.outputs = {'Out': convert_float_to_uint16(self.outputs['Out'])} + self.if_check_prim() class TestComplexElementwiseSubOp(OpTest): @@ -473,7 +862,7 @@ def _executed_api(self, x, y, name=None): def test_name(self): with fluid.program_guard(fluid.Program()): x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") - y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + y = paddle.static.data(name='y', shape=[2, 3], dtype=np.float32) y_1 = self._executed_api(x, y, name='subtract_res') self.assertEqual(('subtract_res' in 
y_1.name), True) @@ -483,12 +872,12 @@ def test_declarative(self): def gen_data(): return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32'), + "x": np.array([2, 3, 4]).astype(np.float32), + "y": np.array([1, 5, 2]).astype(np.float32), } - x = paddle.static.data(name="x", shape=[3], dtype='float32') - y = paddle.static.data(name="y", shape=[3], dtype='float32') + x = paddle.static.data(name="x", shape=[3], dtype=np.float32) + y = paddle.static.data(name="y", shape=[3], dtype=np.float32) z = self._executed_api(x, y) place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -640,7 +1029,7 @@ def test_warnings(self): paddle.enable_static() helper = LayerHelper("elementwise_sub") data = paddle.static.data( - name='data', shape=[None, 3, 32, 32], dtype='float32' + name='data', shape=[None, 3, 32, 32], dtype=np.float32 ) out = helper.create_variable_for_type_inference(dtype=data.dtype) os.environ['FLAGS_print_extra_attrs'] = "1" From 3a66627e1b2c17a948a178313ba0f168d8446c07 Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:27:09 +0800 Subject: [PATCH 127/156] rename PD_REGISTER_GENERAL_KERNEL (#52759) * rename PD_REGISTER_GENERAL_KERNEL * Update feed_op.cc * fix * Update strings_empty_kernel.cc --- cmake/operators.cmake | 9 +- cmake/phi.cmake | 4 +- paddle/fluid/operators/controlflow/feed_op.cc | 74 +++----------- paddle/phi/core/kernel_registry.h | 26 ++--- paddle/phi/kernels/assign_kernel.cc | 72 +++++++------- paddle/phi/kernels/memcpy_kernel.cc | 99 +++++++++---------- paddle/phi/kernels/reshape_grad_kernel.cc | 55 ++--------- .../kernels/selected_rows/assign_kernel.cc | 27 +++-- paddle/phi/kernels/share_buffer_kernel.cc | 24 +---- .../strings/cpu/strings_copy_kernel.cc | 9 +- .../strings/cpu/strings_lower_upper_kernel.cc | 22 ++--- .../strings/gpu/strings_copy_kernel.cu | 9 +- .../strings/gpu/strings_lower_upper_kernel.cu | 22 ++--- .../kernels/strings/strings_empty_kernel.cc | 40 ++++---- paddle/phi/kernels/transfer_layout_kernel.cc | 18 ++-- 15 files changed, 195 insertions(+), 315 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 4935f90a73c296..92906235032068 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -368,7 +368,8 @@ function(op_library TARGET) # Add PHI Kernel Registry Message find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_KERNEL") find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") - find_phi_register(${cc_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL") + find_phi_register(${cc_src} ${pybind_file} + "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") find_register(${cc_src} "REGISTER_OPERATOR" op_name) if(NOT ${op_name} EQUAL "") file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n") @@ -420,7 +421,8 @@ function(op_library TARGET) # Add PHI Kernel Registry Message find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_KERNEL") find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") - find_phi_register(${cu_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL") + find_phi_register(${cu_src} ${pybind_file} + "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name) if(NOT ${op_name} EQUAL "") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") @@ -436,7 +438,8 @@ function(op_library TARGET) find_register(${hip_src} "REGISTER_OP_CUDA_KERNEL" op_name) find_phi_register(${hip_src} ${pybind_file} "PD_REGISTER_KERNEL") 
find_phi_register(${hip_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") - find_phi_register(${hip_src} ${pybind_file} "PD_REGISTER_GENERAL_KERNEL") + find_phi_register(${hip_src} ${pybind_file} + "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") if(NOT ${op_name} EQUAL "") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") set(pybind_flag 1) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index bdb6a4f425281f..96760969adf902 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -78,7 +78,7 @@ function(kernel_declare TARGET_LIST) string( REGEX MATCH - "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL|PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE)\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z_]*,[ \\\t\r\n]*[A-Z_]*" + "(PD_REGISTER_KERNEL|PD_REGISTER_KERNEL_FOR_ALL_DTYPE|PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE)\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z_]*,[ \\\t\r\n]*[A-Z_]*" first_registry "${kernel_impl}") if(NOT first_registry STREQUAL "") @@ -108,7 +108,7 @@ function(kernel_declare TARGET_LIST) string(REPLACE "PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(" "" kernel_msg "${first_registry}") string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_msg "${kernel_msg}") - string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_msg + string(REPLACE "PD_REGISTER_KERNEL_FOR_ALL_DTYPE(" "" kernel_msg "${kernel_msg}") string(REPLACE "," ";" kernel_msg "${kernel_msg}") string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}") diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 4352d58047e3c4..698ca4e02e6290 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -212,77 +212,29 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::FeedOpInfoMaker); -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - CPU, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL( +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, CPU, ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - CPU, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} + paddle::operators::FeedSparseCooTensorKernel) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - GPU, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - GPU, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} + paddle::operators::FeedSparseCooTensorKernel) {} #elif defined(PADDLE_WITH_XPU) -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - XPU, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, XPU, ALL_LAYOUT, - paddle::operators::FeedSparseCooTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - XPU, - ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} -#endif -#ifdef PADDLE_WITH_CUSTOM_DEVICE -PD_REGISTER_GENERAL_KERNEL( - feed_dense_tensor, - Custom, - ALL_LAYOUT, - paddle::operators::FeedDenseTensorKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL( - feed_strings, - Custom, - 
ALL_LAYOUT, - paddle::operators::FeedStringsKernel, - ALL_DTYPE) {} + paddle::operators::FeedSparseCooTensorKernel) {} #endif + +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE( + feed_dense_tensor, ALL_LAYOUT, paddle::operators::FeedDenseTensorKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(feed_strings, + ALL_LAYOUT, + paddle::operators::FeedStringsKernel) { +} diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index af187f57aec1ac..9f45c23dac90b7 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -1326,28 +1326,28 @@ struct KernelRegistrar { kernel_unfold_macro, \ variadic_kernel_unfold_marco, \ __VA_ARGS__)) -/** PD_REGISTER_GENERAL_KERNEL +/** PD_REGISTER_KERNEL_FOR_ALL_DTYPE * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. */ -#define PD_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ - _PD_REGISTER_GENERAL_KERNEL( \ - ::phi::RegType::INNER, kernel_name, backend, layout, kernel_fn, dtype) +#define PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + kernel_name, backend, layout, kernel_fn) \ + _PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + ::phi::RegType::INNER, kernel_name, backend, layout, kernel_fn) -#define _PD_REGISTER_GENERAL_KERNEL( \ - reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ +#define _PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + reg_type, kernel_name, backend, layout, kernel_fn) \ PD_STATIC_ASSERT_GLOBAL_NAMESPACE( \ PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - __PD_REGISTER_GENERAL_KERNEL( \ - reg_type, kernel_name, backend, layout, kernel_fn, dtype) + __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + reg_type, kernel_name, backend, layout, kernel_fn) #ifndef _WIN32 -#define __PD_REGISTER_GENERAL_KERNEL( \ - reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + reg_type, kernel_name, backend, layout, kernel_fn) \ template decltype(kernel_fn) kernel_fn; \ static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ @@ -1367,8 +1367,8 @@ struct KernelRegistrar { void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define __PD_REGISTER_GENERAL_KERNEL( \ - reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_KERNEL_FOR_ALL_DTYPE( \ + reg_type, kernel_name, backend, layout, kernel_fn) \ static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ diff --git a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index 9aba3bcb78faf1..7a6e8d392da1df 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -108,21 +108,21 @@ void AssignValueKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL( - assign, CPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, + CPU, + ALL_LAYOUT, + phi::AssignKernel) {} -PD_REGISTER_GENERAL_KERNEL(assign_raw, - CPU, - ALL_LAYOUT, - phi::AssignRawKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_raw, + CPU, + ALL_LAYOUT, + phi::AssignRawKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } 
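The kernel_declare regex updated in phi.cmake above is what keeps registrations like the one just shown discoverable at build time: the build scans each kernel source and emits a matching PD_DECLARE-style entry into the generated declarations header. A simplified Python rendering of that scan — simplified in that the real CMake pattern also tolerates comments, bracketed variant lists, and the backend-less FOR_ALL_BACKEND_DTYPE form by allowing empty fields:

import re

REGISTRATION = re.compile(
    r"(PD_REGISTER_KERNEL|PD_REGISTER_KERNEL_FOR_ALL_DTYPE|"
    r"PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE)"
    r"\(\s*([a-z0-9_]+),\s*([A-Za-z_]+),\s*([A-Z_]+)"
)

src = "PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_raw, CPU, ALL_LAYOUT, phi::AssignRawKernel) {"
m = REGISTRATION.search(src)
print(m.group(2), m.group(3), m.group(4))   # assign_raw CPU ALL_LAYOUT

Had the macro been renamed without the matching regex change, kernels registered through the new name would presumably no longer be picked up by the generated declarations, which is why the cmake edits travel in the same commit as the rename.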
-PD_REGISTER_GENERAL_KERNEL(assign_array, - CPU, - ALL_LAYOUT, - phi::AssignArrayKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_array, + CPU, + ALL_LAYOUT, + phi::AssignArrayKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } PD_REGISTER_KERNEL(assign_value, @@ -135,20 +135,20 @@ PD_REGISTER_KERNEL(assign_value, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL( - assign, GPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(assign_raw, - GPU, - ALL_LAYOUT, - phi::AssignRawKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, + GPU, + ALL_LAYOUT, + phi::AssignKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_raw, + GPU, + ALL_LAYOUT, + phi::AssignRawKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -PD_REGISTER_GENERAL_KERNEL(assign_array, - GPU, - ALL_LAYOUT, - phi::AssignArrayKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_array, + GPU, + ALL_LAYOUT, + phi::AssignArrayKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } PD_REGISTER_KERNEL(assign_value, @@ -162,20 +162,20 @@ PD_REGISTER_KERNEL(assign_value, #endif #ifdef PADDLE_WITH_XPU -PD_REGISTER_GENERAL_KERNEL( - assign, XPU, ALL_LAYOUT, phi::AssignKernel, ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(assign_raw, - XPU, - ALL_LAYOUT, - phi::AssignRawKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, + XPU, + ALL_LAYOUT, + phi::AssignKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_raw, + XPU, + ALL_LAYOUT, + phi::AssignRawKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -PD_REGISTER_GENERAL_KERNEL(assign_array, - XPU, - ALL_LAYOUT, - phi::AssignArrayKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_array, + XPU, + ALL_LAYOUT, + phi::AssignArrayKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } PD_REGISTER_KERNEL(assign_value, diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index cf2f6ac00a6d6b..49d69a23fedd12 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -136,83 +136,78 @@ void MemcpyKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(memcpy_h2d, - CPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, - CPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, + CPU, + ALL_LAYOUT, + phi::MemcpyH2DKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, + CPU, + ALL_LAYOUT, + phi::MemcpyD2HKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, - CPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, + CPU, + ALL_LAYOUT, + phi::MemcpyD2HMultiIOKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } -PD_REGISTER_GENERAL_KERNEL( - memcpy, CPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, + CPU, + ALL_LAYOUT, + phi::MemcpyKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(memcpy_h2d, - GPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, - GPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, + GPU, + ALL_LAYOUT, + 
phi::MemcpyH2DKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, + GPU, + ALL_LAYOUT, + phi::MemcpyD2HKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, - GPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, + GPU, + ALL_LAYOUT, + phi::MemcpyD2HMultiIOKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } -PD_REGISTER_GENERAL_KERNEL( - memcpy, GPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy, + GPU, + ALL_LAYOUT, + phi::MemcpyKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #endif #ifdef PADDLE_WITH_XPU -PD_REGISTER_GENERAL_KERNEL(memcpy_h2d, - XPU, - ALL_LAYOUT, - phi::MemcpyH2DKernel, - ALL_DTYPE) {} - -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, - XPU, - ALL_LAYOUT, - phi::MemcpyD2HKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_h2d, + XPU, + ALL_LAYOUT, + phi::MemcpyH2DKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h, + XPU, + ALL_LAYOUT, + phi::MemcpyD2HKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } -PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, - XPU, - ALL_LAYOUT, - phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(memcpy_d2h_multi_io, + XPU, + ALL_LAYOUT, + phi::MemcpyD2HMultiIOKernel) { kernel->OutputAt(0).SetBackend(phi::Backend::CPU); } diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 70dab5d9bb92ba..ef4c7d4a6f7e1c 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -61,52 +61,9 @@ void ReshapeDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(reshape_grad, - CPU, - ALL_LAYOUT, - phi::ReshapeGradKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, - CPU, - ALL_LAYOUT, - phi::ReshapeDoubleGradKernel, - ALL_DTYPE) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(reshape_grad, - GPU, - ALL_LAYOUT, - phi::ReshapeGradKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, - GPU, - ALL_LAYOUT, - phi::ReshapeDoubleGradKernel, - ALL_DTYPE) {} -#endif - -#ifdef PADDLE_WITH_XPU -PD_REGISTER_GENERAL_KERNEL(reshape_grad, - XPU, - ALL_LAYOUT, - phi::ReshapeGradKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, - XPU, - ALL_LAYOUT, - phi::ReshapeDoubleGradKernel, - ALL_DTYPE) {} -#endif - -#ifdef PADDLE_WITH_CUSTOM_DEVICE -PD_REGISTER_GENERAL_KERNEL(reshape_grad, - Custom, - ALL_LAYOUT, - phi::ReshapeGradKernel, - ALL_DTYPE) {} -PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, - Custom, - ALL_LAYOUT, - phi::ReshapeDoubleGradKernel, - ALL_DTYPE) {} -#endif +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(reshape_grad, + ALL_LAYOUT, + phi::ReshapeGradKernel) {} +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(reshape_double_grad, + ALL_LAYOUT, + phi::ReshapeDoubleGradKernel) {} diff --git a/paddle/phi/kernels/selected_rows/assign_kernel.cc b/paddle/phi/kernels/selected_rows/assign_kernel.cc index 32045d3106f81c..081d85e68c959f 100644 --- a/paddle/phi/kernels/selected_rows/assign_kernel.cc +++ b/paddle/phi/kernels/selected_rows/assign_kernel.cc @@ -34,30 +34,27 @@ void AssignKernel(const Context& dev_ctx, } // namespace sr } // namespace phi -PD_REGISTER_GENERAL_KERNEL(assign_sr, - CPU, - ALL_LAYOUT, - phi::sr::AssignKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, + CPU, + 
ALL_LAYOUT, + phi::sr::AssignKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(assign_sr, - GPU, - ALL_LAYOUT, - phi::sr::AssignKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, + GPU, + ALL_LAYOUT, + phi::sr::AssignKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #endif #ifdef PADDLE_WITH_XPU -PD_REGISTER_GENERAL_KERNEL(assign_sr, - XPU, - ALL_LAYOUT, - phi::sr::AssignKernel, - ALL_DTYPE) { +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign_sr, + XPU, + ALL_LAYOUT, + phi::sr::AssignKernel) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } #endif diff --git a/paddle/phi/kernels/share_buffer_kernel.cc b/paddle/phi/kernels/share_buffer_kernel.cc index d428de3863780b..dd4272a591bf46 100644 --- a/paddle/phi/kernels/share_buffer_kernel.cc +++ b/paddle/phi/kernels/share_buffer_kernel.cc @@ -46,24 +46,6 @@ void ShareBufferKernel(const Context &dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(share_buffer, - CPU, - ALL_LAYOUT, - phi::ShareBufferKernel, - ALL_DTYPE) {} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(share_buffer, - GPU, - ALL_LAYOUT, - phi::ShareBufferKernel, - ALL_DTYPE) {} -#endif - -#ifdef PADDLE_WITH_XPU -PD_REGISTER_GENERAL_KERNEL(share_buffer, - XPU, - ALL_LAYOUT, - phi::ShareBufferKernel, - ALL_DTYPE) {} -#endif +PD_REGISTER_KERNEL_FOR_ALL_BACKEND_DTYPE(share_buffer, + ALL_LAYOUT, + phi::ShareBufferKernel) {} diff --git a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc index cd6e5fadd49d7a..da8fba85accf9a 100644 --- a/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc +++ b/paddle/phi/kernels/strings/cpu/strings_copy_kernel.cc @@ -53,8 +53,7 @@ void Copy(const Context& dev_ctx, } // namespace strings } // namespace phi -PD_REGISTER_GENERAL_KERNEL(strings_copy, - CPU, - ALL_LAYOUT, - phi::strings::Copy, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_copy, + CPU, + ALL_LAYOUT, + phi::strings::Copy) {} diff --git a/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc b/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc index 9901496b2a6cd8..b470f3b211f6a3 100644 --- a/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc +++ b/paddle/phi/kernels/strings/cpu/strings_lower_upper_kernel.cc @@ -43,14 +43,14 @@ void StringUpperKernel(const ContextT& dev_ctx, } // namespace strings } // namespace phi -PD_REGISTER_GENERAL_KERNEL(strings_lower, - CPU, - ALL_LAYOUT, - phi::strings::StringLowerKernel, - pstring) {} - -PD_REGISTER_GENERAL_KERNEL(strings_upper, - CPU, - ALL_LAYOUT, - phi::strings::StringUpperKernel, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_lower, + CPU, + ALL_LAYOUT, + phi::strings::StringLowerKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_upper, + CPU, + ALL_LAYOUT, + phi::strings::StringUpperKernel) {} diff --git a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu index cfee8d56690bea..8f283f8b8ebc3b 100644 --- a/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu +++ b/paddle/phi/kernels/strings/gpu/strings_copy_kernel.cu @@ -120,8 +120,7 @@ void Copy(const Context& dev_ctx, } // namespace strings } // namespace phi -PD_REGISTER_GENERAL_KERNEL(strings_copy, - GPU, - ALL_LAYOUT, - phi::strings::Copy, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_copy, + GPU, + ALL_LAYOUT, + 
phi::strings::Copy) {} diff --git a/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu b/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu index 2237edae7eb776..832d9bbf73c0b5 100644 --- a/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu +++ b/paddle/phi/kernels/strings/gpu/strings_lower_upper_kernel.cu @@ -167,14 +167,14 @@ void StringUpperKernel(const ContextT& dev_ctx, } // namespace strings } // namespace phi -PD_REGISTER_GENERAL_KERNEL(strings_lower, - GPU, - ALL_LAYOUT, - phi::strings::StringLowerKernel, - pstring) {} - -PD_REGISTER_GENERAL_KERNEL(strings_upper, - GPU, - ALL_LAYOUT, - phi::strings::StringUpperKernel, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_lower, + GPU, + ALL_LAYOUT, + phi::strings::StringLowerKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_upper, + GPU, + ALL_LAYOUT, + phi::strings::StringUpperKernel) {} diff --git a/paddle/phi/kernels/strings/strings_empty_kernel.cc b/paddle/phi/kernels/strings/strings_empty_kernel.cc index 433d3ad0a95f66..22a43ceaff1c17 100644 --- a/paddle/phi/kernels/strings/strings_empty_kernel.cc +++ b/paddle/phi/kernels/strings/strings_empty_kernel.cc @@ -38,28 +38,26 @@ void EmptyLikeKernel(const Context& dev_ctx, StringTensor* out) { using pstring = ::phi::dtype::pstring; -PD_REGISTER_GENERAL_KERNEL(strings_empty, - CPU, - ALL_LAYOUT, - phi::strings::EmptyKernel, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_empty, + CPU, + ALL_LAYOUT, + phi::strings::EmptyKernel) {} -PD_REGISTER_GENERAL_KERNEL(strings_empty_like, - CPU, - ALL_LAYOUT, - phi::strings::EmptyLikeKernel, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_empty_like, + CPU, + ALL_LAYOUT, + phi::strings::EmptyLikeKernel) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(strings_empty, - GPU, - ALL_LAYOUT, - phi::strings::EmptyKernel, - pstring) {} - -PD_REGISTER_GENERAL_KERNEL(strings_empty_like, - GPU, - ALL_LAYOUT, - phi::strings::EmptyLikeKernel, - pstring) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(strings_empty, + GPU, + ALL_LAYOUT, + phi::strings::EmptyKernel) {} + +PD_REGISTER_KERNEL_FOR_ALL_DTYPE( + strings_empty_like, + GPU, + ALL_LAYOUT, + phi::strings::EmptyLikeKernel) {} #endif diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index df3d0ef4617fdf..3d0b5950bc9af4 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -202,15 +202,13 @@ void TransferLayoutKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(transfer_layout, - CPU, - ALL_LAYOUT, - phi::TransferLayoutKernel, - ALL_DTYPE) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, + CPU, + ALL_LAYOUT, + phi::TransferLayoutKernel) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_GENERAL_KERNEL(transfer_layout, - GPU, - ALL_LAYOUT, - phi::TransferLayoutKernel, - ALL_DTYPE) {} +PD_REGISTER_KERNEL_FOR_ALL_DTYPE(transfer_layout, + GPU, + ALL_LAYOUT, + phi::TransferLayoutKernel) {} #endif From 205a4d9a9ecfb8044e03f1e6ea7378d485589c1b Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Thu, 13 Apr 2023 14:55:39 +0800 Subject: [PATCH 128/156] [phi] Add get_kernel_signatures.py tool (#52781) * add get_kernels.py * update * update * update * update * update * update test=document_fix --------- Co-authored-by: YuanRisheng --- paddle/phi/tools/get_kernel_signatures.py | 230 ++++++++++++++++++++++ 1 file changed, 
230 insertions(+)
 create mode 100644 paddle/phi/tools/get_kernel_signatures.py

diff --git a/paddle/phi/tools/get_kernel_signatures.py b/paddle/phi/tools/get_kernel_signatures.py
new file mode 100644
index 00000000000000..9b165ef4541cae
--- /dev/null
+++ b/paddle/phi/tools/get_kernel_signatures.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import os.path as osp
+import re
+import subprocess
+import warnings
+
+import pandas as pd
+
+
+def preprocess_macro(file_content, processed_file_path):
+    if file_content is None:
+        return file_content
+    # comment out external macro
+    file_content = re.sub(r'#(include|pragma)', r'// \g<0>', file_content)
+    with open(processed_file_path, "w") as f:
+        f.write(file_content)
+    # expand macro and correct format
+    subprocess.run(
+        ['g++', '-E', processed_file_path, '-o', processed_file_path]
+    )
+    subprocess.run(['clang-format', '-i', processed_file_path])
+    file_content = open(processed_file_path, "r").read()
+    return file_content
+
+
+def search_pattern(pattern, file_content):
+    if file_content is not None:
+        match_result = re.search(pattern, file_content)
+        if match_result is not None:
+            return match_result.group(1)
+    return None
+
+
+class KernelSignatureSearcher:
+    kernel_sig_pattern = (
+        r'(template <typename T, typename Context>[\s\n]*void (\w+Kernel)\([^\)]+\))'
+    )
+    kernel_reg_pattern = r'PD_REGISTER_KERNEL(_FOR_ALL_DTYPE)?\([\s\n]*(\w+),[\s\n]*(\w+),[\s\n]*(\w+),[\s\n]*([\w:<>]+)[^\)]*\)'
+    macro_kernel_reg_pattern = (
+        r'#define \w+\([^\)]*\)[\s\n\\]*PD_REGISTER_KERNEL(_FOR_ALL_DTYPE)?'
+ ) + + srcs_dir = ['cpu', 'gpu', 'xpu', 'onednn', 'gpudnn', 'kps'] + build_path = None + + filter = {"kernel_name": []} + + def __init__(self, search_path): + self.search_path = search_path + self.kernel_func_map = {} + self.func_signature_map = {} + + self.search_kernel_signature() + self.search_kernel_registration(search_path) + self.filter_result() + + @classmethod + def search(cls, search_path): + if cls.build_path is None: + raise ValueError("Please set build_path first.") + searcher = cls(search_path) + kernel_func_df = pd.DataFrame( + list(searcher.kernel_func_map.items()), + columns=['kernel_name', 'kernel_func'], + ) + func_signature_df = pd.DataFrame( + list(searcher.func_signature_map.items()), + columns=['kernel_func', 'kernel_signature'], + ) + return pd.merge( + kernel_func_df, func_signature_df, on='kernel_func', how='left' + )[['kernel_name', 'kernel_signature']] + + def filter_result(self): + for kernel_name in self.filter["kernel_name"]: + if kernel_name in self.kernel_func_map: + del self.kernel_func_map[kernel_name] + + def search_kernel_signature(self): + for file in os.listdir(self.search_path): + if file.endswith("_kernel.h"): + f = open(osp.join(self.search_path, file), 'r') + file_content = f.read() + results = re.findall(self.kernel_sig_pattern, file_content) + for match_result in results: + self.func_signature_map[match_result[1]] = match_result[0] + + def search_kernel_registration(self, path): + self.processed_file_path = osp.join( + self.build_path, '.processed_file.cc' + ) + for file in os.listdir(path): + file_path = osp.join(path, file) + # only search src files under specific srcs_dir + if file in self.srcs_dir: + self.search_kernel_registration(file_path) + if osp.isdir(file_path): + continue + if re.match(r'\w+_kernel\.(cc|cu)', file): + self._search_kernel_registration(file_path, file) + if osp.exists(self.processed_file_path): + os.remove(self.processed_file_path) + + def _search_kernel_registration(self, file_path, file): + file_content = open(file_path, 'r').read() + self.header_content = None + # if some kernel registration is in macro, preprocess macro first + self.file_preprocessed = False + if re.search(self.macro_kernel_reg_pattern, file_content): + file_content = preprocess_macro( + file_content, self.processed_file_path + ) + self.file_preprocessed = True + # search kernel registration + match_results = re.findall(self.kernel_reg_pattern, file_content) + for match_result in match_results: + kernel_name = match_result[1] + if kernel_name in self.kernel_func_map: + continue + kernel_func = match_result[-1].split("<")[0].split("::")[-1] + self.kernel_func_map[kernel_name] = kernel_func + if kernel_func in self.func_signature_map: + continue + # if target kernel signature is not found in header file, search + # it in current src file, or preprocess macro and search again + kernel_signature = self.search_target_kernel_signature( + kernel_func, file, file_content + ) + self.func_signature_map[kernel_func] = kernel_signature + if kernel_signature is None: + warnings.warn( + "Can't find kernel signature for kernel: " + + kernel_func + + ", which is registered in file: " + + file_path + ) + + def search_target_kernel_signature(self, kernel_func, file, file_content): + target_kernel_signature_pattern = self.kernel_sig_pattern.replace( + r'(\w+Kernel)', kernel_func + ) + # search kernel signature in current kernel registration file + kernel_signature = search_pattern( + target_kernel_signature_pattern, file_content + ) + if kernel_signature is not None: 
+            return kernel_signature
+        # expand macro and search again
+        if not self.file_preprocessed:
+            file_content = preprocess_macro(
+                file_content, self.processed_file_path
+            )
+            kernel_signature = search_pattern(
+                target_kernel_signature_pattern, file_content
+            )
+            if kernel_signature is not None:
+                return kernel_signature
+        # expand macro in the corresponding kernel header file and search again
+        if self.header_content is None:
+            header_path = osp.join(self.search_path, file.split('.')[0] + '.h')
+            if osp.exists(header_path):
+                self.header_content = open(header_path, 'r').read()
+        if self.header_content is not None:
+            self.header_content = preprocess_macro(
+                self.header_content, self.processed_file_path
+            )
+            kernel_signature = search_pattern(
+                target_kernel_signature_pattern, self.header_content
+            )
+            if kernel_signature is not None:
+                return kernel_signature
+        return None
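As a quick illustration of what the searcher stores, the signature pattern above can be exercised on a hand-written declaration in the usual phi style. The snippet below is an editorial sketch rather than part of the patch, and FooKernel is a made-up name:

    import re

    # Same shape as KernelSignatureSearcher.kernel_sig_pattern above.
    KERNEL_SIG = r'(template <typename T, typename Context>[\s\n]*void (\w+Kernel)\([^\)]+\))'

    header = '''
    template <typename T, typename Context>
    void FooKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
    '''

    m = re.search(KERNEL_SIG, header)
    print(m.group(2))  # FooKernel -> key into func_signature_map
    print(m.group(1))  # full declaration -> the stored kernel signature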
+
+
+def get_kernel_signatures():
+    """
+    Get kernel signatures of all kernels registered in phi/kernels, and
+    generate a csv file named 'kernel_signatures.csv' in Paddle/build.
+
+    If you want to filter some kernels out of the result, you can add them to
+    KernelSignatureSearcher.filter["kernel_name"].
+    """
+    Paddle_path = osp.abspath(osp.join(osp.dirname(__file__), '../../..'))
+    if Paddle_path.split('/')[-1] != 'Paddle':
+        raise ValueError('Paddle path error.')
+    build_path = osp.join(Paddle_path, 'build')
+    os.makedirs(build_path, exist_ok=True)
+    KernelSignatureSearcher.build_path = build_path
+
+    base_path = osp.join(Paddle_path, 'paddle/phi/kernels')
+    kernel_signature_df = KernelSignatureSearcher.search(base_path)
+
+    # phi/kernels has some independent subdirs whose kernel names
+    # (in different namespaces) may conflict with the main directory or
+    # other subdirs, so we need to search them separately.
+    independent_subdir = [
+        'fusion',
+        'legacy',
+        'selected_rows',
+        'sparse',
+        'strings',
+    ]
+    for subdir in independent_subdir:
+        sub_path = osp.join(base_path, subdir)
+        sub_df = KernelSignatureSearcher.search(sub_path)
+        kernel_signature_df = pd.concat(
+            [kernel_signature_df, sub_df], ignore_index=True
+        )
+
+    output_csv_path = osp.join(build_path, 'kernel_signatures.csv')
+    kernel_signature_df.to_csv(output_csv_path, index=False)
+    print(kernel_signature_df)
+
+
+if __name__ == "__main__":
+    get_kernel_signatures()

From a67d3bb730500c8f22bb4c30ba13b76ec5d31e09 Mon Sep 17 00:00:00 2001
From: wentao yu
Date: Thu, 13 Apr 2023 15:06:23 +0800
Subject: [PATCH 129/156] [Auto Parallel] Add auto parallel tuner options in
 launch (#52053)

* add auto parallel tuner options in launch

* add ut for launch in auto_parallel tuner
  fix code format

* fix ci-coverage

---
 .../distributed/fleet/base/role_maker.py     |  4 ++
 .../distributed/launch/context/args_envs.py  |  8 +++
 .../launch/controllers/collective.py         | 64 ++++++++++++++++++-
 .../launch/controllers/controller.py         |  9 ++-
 python/paddle/distributed/launch/job/pod.py  | 10 +--
 .../paddle/fluid/tests/unittests/test_run.py | 27 +++++++-
 6 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py
index cdfa58330f66f6..0bf7db4a2c3321 100755
--- a/python/paddle/distributed/fleet/base/role_maker.py
+++ b/python/paddle/distributed/fleet/base/role_maker.py
@@ -1076,6 +1076,10 @@ def _collective_env(self):
         self._non_distributed = True
         self._worker_endpoints = self._worker_endpoints.split(",")
         self._trainers_num = len(self._worker_endpoints)
+        auto_tuner = os.getenv("PADDLE_AUTO_PARALLEL_CONFIG", None)
+        if auto_tuner is not None:
+            trainers_num = os.getenv("PADDLE_TRAINERS_NUM", None)
+            self._trainers_num = int(trainers_num)
         self._nodes_num = len({x.split(':')[0] for x in self._worker_endpoints})
         self._local_rank = os.getenv("PADDLE_RANK_IN_NODE")
         self._local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS")
diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py
index 4e942e35662f81..de91d90119633b 100644
--- a/python/paddle/distributed/launch/context/args_envs.py
+++ b/python/paddle/distributed/launch/context/args_envs.py
@@ -37,6 +37,7 @@
     'PADDLE_WITH_GLOO': 'with_gloo',
     'PADDLE_START_PORT': 'start_port',
     'PADDLE_IPS': 'ips',
+    "PADDLE_AUTO_PARALLEL_CONFIG": 'auto_parallel_config',
 }
 
 
@@ -128,6 +129,13 @@
         "--start_port", type=int, default=6070, help="fix port start with"
     )
 
+    base_group.add_argument(
+        "--auto_parallel_config",
+        type=str,
+        default=None,
+        help="absolute path of the auto parallel config file; the file should be in JSON format",
+    )
+
     base_group.add_argument(
         "training_script",
        type=str,
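The JSON file consumed by the new --auto_parallel_config flag is only pinned down by the unit test near the end of this patch; a minimal config in the same spirit could be written as below. The key set mirrors that test, and only tuner_run_mode ('tuner_only', 'run_only' or 'tuner_and_run') is read by the collective controller that follows — treat the other keys as illustrative:

    import json

    config = {
        "tuner_save_path": "parallel_strategy.pkl",
        "tuner_load_path": "parallel_strategy.pkl",
        "tuner_run_mode": "tuner_and_run",
    }
    with open("auto_parallel_config.json", "w") as f:
        json.dump(config, f)

    # Hypothetical invocation:
    #   python -m paddle.distributed.launch --devices 0,1 \
    #       --auto_parallel_config auto_parallel_config.json train.py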
diff --git a/python/paddle/distributed/launch/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py
index 0c12240b2612e9..07e9be7f3a2238 100644
--- a/python/paddle/distributed/launch/controllers/collective.py
+++ b/python/paddle/distributed/launch/controllers/collective.py
@@ -13,12 +13,17 @@
 # limitations under the License.
 
 import json
+import os
 
 from ..context.device import DeviceType
 from .controller import ControleMode, Controller
 
 
 class CollectiveController(Controller):
+    def __init__(self, ctx):
+        self._tuner_run_mode = None  # 'tuner_only', 'run_only', 'tuner_and_run'
+        super().__init__(ctx)
+
     @classmethod
     def enable(cls, ctx):
         # collective is the default mode
@@ -30,6 +35,9 @@ def enable(cls, ctx):
             return False
 
     def build_pod(self):
+        skip_run = self._build_pod_with_tuner()
+        if skip_run:
+            return
         if (
             self.ctx.args.master is None
             and self.ctx.args.start_port
@@ -39,6 +47,46 @@ def build_pod(self):
         else:
             return self._build_pod_with_master()
 
+    def _build_pod_with_tuner(self):
+        auto_parallel_config = self.ctx.args.auto_parallel_config
+        if auto_parallel_config is not None:
+            if not os.path.exists(auto_parallel_config):
+                self.ctx.logger.warning("auto_parallel_config does not exist!")
+            if not auto_parallel_config.endswith(".json"):
+                self.ctx.logger.warning(
+                    "auto_parallel_config should be a JSON file!"
+                )
+
+            with open(auto_parallel_config, 'r') as robj:
+                auto_parallel_data = json.loads(robj.read())
+            self._tuner_run_mode = auto_parallel_data.get(
+                "tuner_run_mode", 'tuner_and_run'
+            )
+
+            self.ctx.logger.info(f"tuner_run_mode is: {self._tuner_run_mode}")
+            endpoint = f"127.0.0.1:{self.ctx.node.get_free_port()}"
+            pod_replicas = self.pod_replicas()
+            if self._tuner_run_mode in ['tuner_only', 'tuner_and_run']:
+                e = {
+                    "PADDLE_AUTO_PARALLEL_CONFIG": self.ctx.args.auto_parallel_config,
+                    "PADDLE_TRAINERS_NUM": "1",
+                    "PADDLE_TRAINER_ENDPOINTS": endpoint,
+                    "PADDLE_TRAINER_ID": "0",
+                    "PADDLE_CURRENT_ENDPOINT": endpoint,
+                    "FLAGS_selected_gpus": "0",
+                    "PADDLE_AUTO_PARALLEL_STAGE": "tuner",
+                    "PADDLE_GLOBAL_SIZE": "{}".format(
+                        pod_replicas * int(self.ctx.args.nnodes)
+                    ),
+                    "PADDLE_LOCAL_SIZE": f"{pod_replicas}",
+                }
+                log_file = "tuner.log"
+                self.add_container(envs=e, log_file=log_file, is_init=True)
+
+            if self._tuner_run_mode == 'tuner_only':
+                return True
+        return False
+
     def _build_pod_with_args(self):
         self.pod.replicas = self.pod_replicas()
 
@@ -78,6 +126,13 @@ def _build_pod_with_args(self):
                 "PADDLE_TRAINERS_NUM": f"{len(job_endpoints)}",
                 "PADDLE_RANK_IN_NODE": str(i),
             }
+            if self._tuner_run_mode is not None:
+                e.update(
+                    {
+                        "PADDLE_AUTO_PARALLEL_CONFIG": self.ctx.args.auto_parallel_config,
+                        "PADDLE_AUTO_PARALLEL_STAGE": "run",
+                    }
+                )
             if len(selected_dev_list) > 0:
                 if self.ctx.node.device.dtype == DeviceType.CUSTOM_DEVICE:
                     e.update(self.ctx.node.device.get_custom_device_envs())
@@ -144,7 +199,7 @@ def _build_pod_with_master(self):
 
         job_endpoints = [i['endpoints'] for i in peer_list]
 
-        self.pod.reset()
+        # self.pod.reset()
         selected_dev_key = self.ctx.node.device.get_selected_device_key()
         selected_dev_list = self.ctx.node.device.get_selected_devices(
             self.ctx.args.devices
        )
@@ -164,6 +219,13 @@ def _build_pod_with_master(self):
             "PADDLE_TRAINERS_NUM": f"{global_size}",
             "PADDLE_RANK_IN_NODE": str(i),
         }
+        if self._tuner_run_mode is not None:
+            e.update(
+                {
+                    "PADDLE_AUTO_PARALLEL_CONFIG": self.ctx.args.auto_parallel_config,
+                    "PADDLE_AUTO_PARALLEL_STAGE": "run",
+                }
+            )
         if len(selected_dev_list) > 0:
             if self.ctx.node.device.dtype == DeviceType.CUSTOM_DEVICE:
                 e.update(self.ctx.node.device.get_custom_device_envs())
diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
index 22c3b77c95dee3..6b1d54e0fe9d2e 100644
--- a/python/paddle/distributed/launch/controllers/controller.py
+++ 
b/python/paddle/distributed/launch/controllers/controller.py @@ -55,10 +55,15 @@ def __init__(self, ctx): def deploy_pod(self): - assert len(self.pod.containers) > 0, "No container in the pod" + assert ( + len(self.pod.containers) + len(self.pod.init_containers) > 0 + ), "No container in the pod" self.ctx.logger.info(f"Run {self.pod}") - self.ctx.logger.debug(self.pod.containers[0]) + if len(self.pod.init_containers) > 0: + self.ctx.logger.debug(self.pod.init_containers[0]) + if len(self.pod.containers) > 0: + self.ctx.logger.debug(self.pod.containers[0]) self.ctx.status.run() self.pod.deploy() diff --git a/python/paddle/distributed/launch/job/pod.py b/python/paddle/distributed/launch/job/pod.py index 5c8dad2721cfb4..85cf1fed34be4d 100644 --- a/python/paddle/distributed/launch/job/pod.py +++ b/python/paddle/distributed/launch/job/pod.py @@ -109,7 +109,6 @@ def deploy(self): for i in self._init_containers: i.start() i.wait(self._init_timeout) - for c in self._containers: c.start() @@ -173,7 +172,10 @@ def is_running(self): def logs(self, idx=None): if idx is None: - self._containers[0].logs() + if len(self._containers) > 0: + self._containers[0].logs() + if len(self._init_containers) > 0: + self._init_containers[0].logs() else: self._containers[idx].logs() @@ -196,11 +198,11 @@ def watch( ''' end = time.time() + timeout while timeout < 0 or time.time() < end: - for c in self._containers: + for c in self._init_containers + self._containers: if c.status in any_list: return c.status - s = [c.status for c in self._containers] + s = [c.status for c in self._init_containers + self._containers] if len(set(s)) == 1 and s[0] in all_list: return s[0] diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index f692e2598fdfd2..bd71ae2304b060 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -25,11 +25,12 @@ colpyfile = '''# train.py for unitest import os env = os.environ.copy() -assert "PADDLE_MASTER" in env +if "PADDLE_AUTO_PARALLEL_CONFIG" not in env: + assert "PADDLE_MASTER" in env + assert "PADDLE_GLOBAL_RANK" in env + assert "PADDLE_LOCAL_RANK" in env assert "PADDLE_GLOBAL_SIZE" in env assert "PADDLE_LOCAL_SIZE" in env -assert "PADDLE_GLOBAL_RANK" in env -assert "PADDLE_LOCAL_RANK" in env ''' pspyfile = '''# train.py for unitest @@ -114,6 +115,26 @@ def test_collective_3(self): self.assertTrue(len(c2) == 3) log_dir.cleanup() + def test_collective_4(self): + log_dir = tempfile.TemporaryDirectory() + config_dir = tempfile.TemporaryDirectory() + config_path = os.path.join(config_dir.name, 'auto_parallel_config.json') + with open(config_path, 'w') as wobj: + wobj.write( + '{\"tuner_save_path\":\"parallel_strategy.pkl\",\"tuner_load_path\":\"parallel_strategy.pkl\",\"tuner_run_mode\":\"tuner_and_run\"}' + ) + port = random.randrange(6000, 8000) + args = "--job_id test4 --devices 0,1 --log_dir {} --auto_parallel_config {}" + p1 = self.pdrun(args.format(log_dir.name + "/1", config_path)) + p1.wait() + self.assertTrue(p1.poll() == 0) + + c1 = get_files(log_dir.name + "/1", 'test4') + print(c1) + self.assertTrue(len(c1) == 4) + log_dir.cleanup() + config_dir.cleanup() + class PS_Test(unittest.TestCase): def setUp(self): From cb6de765d969d1350e8b29f0235b4aa5040ee590 Mon Sep 17 00:00:00 2001 From: chenxujun Date: Thu, 13 Apr 2023 15:08:35 +0800 Subject: [PATCH 130/156] Add overlap_add, sign tests (#52667) --- paddle/phi/kernels/funcs/eigen/sign.cu | 1 + 
 .../kernels/gpu/overlap_add_grad_kernel.cu    |  1 +
 paddle/phi/kernels/gpu/overlap_add_kernel.cu  |  1 +
 paddle/phi/kernels/gpu/sign_kernel.cu.cc      | 16 ++++--
 .../tests/unittests/test_overlap_add_op.py    | 57 ++++++++++++++++++-
 .../fluid/tests/unittests/test_sign_op.py     | 42 +++++++++++++-
 python/paddle/signal.py                       |  5 +-
 python/paddle/tensor/math.py                  |  2 +-
 8 files changed, 112 insertions(+), 13 deletions(-)

diff --git a/paddle/phi/kernels/funcs/eigen/sign.cu b/paddle/phi/kernels/funcs/eigen/sign.cu
index 4caed688013dc7..b630ba7bb6c408 100644
--- a/paddle/phi/kernels/funcs/eigen/sign.cu
+++ b/paddle/phi/kernels/funcs/eigen/sign.cu
@@ -32,6 +32,7 @@ struct EigenSign {
 template struct EigenSign<Eigen::GpuDevice, float>;
 template struct EigenSign<Eigen::GpuDevice, double>;
 template struct EigenSign<Eigen::GpuDevice, dtype::float16>;
+template struct EigenSign<Eigen::GpuDevice, dtype::bfloat16>;
 
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu b/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu
index 057f7e465c0559..a2ec60109d6404 100644
--- a/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/overlap_add_grad_kernel.cu
@@ -161,5 +161,6 @@ PD_REGISTER_KERNEL(overlap_add_grad,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/overlap_add_kernel.cu b/paddle/phi/kernels/gpu/overlap_add_kernel.cu
index cf56095db5ea7f..b8726b8d8e15ad 100644
--- a/paddle/phi/kernels/gpu/overlap_add_kernel.cu
+++ b/paddle/phi/kernels/gpu/overlap_add_kernel.cu
@@ -147,5 +147,6 @@ PD_REGISTER_KERNEL(overlap_add,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/gpu/sign_kernel.cu.cc b/paddle/phi/kernels/gpu/sign_kernel.cu.cc
index 37f10243dc596d..71cd1d39b687d6 100644
--- a/paddle/phi/kernels/gpu/sign_kernel.cu.cc
+++ b/paddle/phi/kernels/gpu/sign_kernel.cu.cc
@@ -19,9 +19,13 @@ limitations under the License. */
 #include "paddle/phi/kernels/impl/sign_kernel_impl.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/phi/common/float16.h"
-
-using float16 = phi::dtype::float16;
-
-PD_REGISTER_KERNEL(
-    sign, GPU, ALL_LAYOUT, phi::SignKernel, float, double, float16) {}
+#include "paddle/phi/common/amp_type_traits.h"
+
+PD_REGISTER_KERNEL(sign,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SignKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/python/paddle/fluid/tests/unittests/test_overlap_add_op.py b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
index d0e5cd79c3b899..98d4ce10aaabb4 100644
--- a/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_overlap_add_op.py
@@ -15,14 +15,15 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
+from paddle.fluid import core
 
 
 def overlap_add(x, hop_length, axis=-1):
     assert axis in [0, -1], 'axis should be 0/-1.'
-    assert len(x.shape) >= 2, 'Input dims shoulb be >= 2.'
+    assert len(x.shape) >= 2, 'Input dims should be >= 2.'
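For intuition about the reference implementation above, here is a tiny worked case (an editorial sketch with the axis handling simplified): two length-4 frames at hop 2 overlap in their two middle samples.

    import numpy as np

    frames = np.array([[1, 5],
                       [2, 6],
                       [3, 7],
                       [4, 8]], dtype=np.float64)  # [frame_length, n_frames]
    frame_length, n_frames = frames.shape
    hop_length = 2
    out = np.zeros(frame_length + hop_length * (n_frames - 1))
    for i in range(n_frames):
        out[i * hop_length : i * hop_length + frame_length] += frames[:, i]
    print(out)  # [ 1.  2.  8. 10.  7.  8.]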
squeeze_output = False if len(x.shape) == 2: @@ -101,6 +102,58 @@ def test_check_grad_normal(self): paddle.disable_static() +class TestOverlapAddFP16Op(TestOverlapAddOp): + def initTestCase(self): + input_shape = (50, 3) + input_type = 'float16' + attrs = { + 'hop_length': 4, + 'axis': -1, + } + return input_shape, input_type, attrs + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestOverlapAddBF16Op(OpTest): + def setUp(self): + self.op_type = "overlap_add" + self.python_api = paddle.signal.overlap_add + self.shape, self.type, self.attrs = self.initTestCase() + self.np_dtype = np.float32 + self.dtype = np.uint16 + self.inputs = { + 'X': np.random.random(size=self.shape).astype(self.np_dtype), + } + self.outputs = {'Out': overlap_add(x=self.inputs['X'], **self.attrs)} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def initTestCase(self): + input_shape = (50, 3) + input_type = np.uint16 + attrs = { + 'hop_length': 4, + 'axis': -1, + } + return input_shape, input_type, attrs + + def test_check_output(self): + paddle.enable_static() + self.check_output_with_place(self.place) + paddle.disable_static() + + def test_check_grad_normal(self): + paddle.enable_static() + self.check_grad_with_place(self.place, ['X'], 'Out') + paddle.disable_static() + + class TestCase1(TestOverlapAddOp): def initTestCase(self): input_shape = (3, 50) diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index 79ee4ceff5f219..2617c2451f330d 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -17,7 +17,7 @@ import gradient_checker import numpy as np from decorator_helper import prog_scope -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid @@ -40,6 +40,42 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') +class TestSignFP16Op(TestSignOp): + def setUp(self): + self.op_type = "sign" + self.python_api = paddle.sign + self.inputs = { + 'X': np.random.uniform(-10, 10, (10, 10)).astype("float16") + } + self.outputs = {'Out': np.sign(self.inputs['X'])} + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestSignBF16Op(OpTest): + def setUp(self): + self.op_type = "sign" + self.python_api = paddle.sign + self.dtype = np.uint16 + self.inputs = { + 'X': np.random.uniform(-10, 10, (10, 10)).astype("float32") + } + self.outputs = {'Out': np.sign(self.inputs['X'])} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestSignOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -97,7 +133,7 @@ def sign_wrapper(self, x): @prog_scope() def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. 
+ # the shape of input variable should be clearly specified, not include -1. eps = 0.005 dtype = np.float32 @@ -128,7 +164,7 @@ def sign_wrapper(self, x): @prog_scope() def func(self, place): - # the shape of input variable should be clearly specified, not inlcude -1. + # the shape of input variable should be clearly specified, not include -1. eps = 0.005 dtype = np.float32 diff --git a/python/paddle/signal.py b/python/paddle/signal.py index e404ec08ffb903..e1580b00075492 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -219,7 +219,10 @@ def overlap_add(x, hop_length, axis=-1, name=None): out = op(x, *attrs) else: check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type + x, + 'x', + ['int32', 'int64', 'float16', 'float32', 'float64', 'uint16'], + op_type, ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 1e969be880401e..fe41200378793d 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3677,7 +3677,7 @@ def sign(x, name=None): return _C_ops.sign(x) else: check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64'], 'sign' + x, 'x', ['float16', 'float32', 'float64', 'uint16'], 'sign' ) helper = LayerHelper("sign", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) From 9246b93c6e6281433aca81c1bb893dd2d31683df Mon Sep 17 00:00:00 2001 From: Sanbu <96160062+sanbuphy@users.noreply.github.com> Date: Thu, 13 Apr 2023 15:41:14 +0800 Subject: [PATCH 131/156] Support static graph code-gen for temporal_shift (#52686) --- paddle/fluid/operators/temporal_shift_op.cc | 164 -------------------- paddle/phi/api/yaml/backward.yaml | 11 ++ paddle/phi/api/yaml/legacy_backward.yaml | 10 -- paddle/phi/api/yaml/legacy_ops.yaml | 9 -- paddle/phi/api/yaml/op_compat.yaml | 7 + paddle/phi/api/yaml/ops.yaml | 10 ++ paddle/phi/ops/compat/temporal_shift_sig.cc | 39 ----- python/paddle/nn/functional/extension.py | 36 ++++- 8 files changed, 61 insertions(+), 225 deletions(-) delete mode 100644 paddle/fluid/operators/temporal_shift_op.cc delete mode 100644 paddle/phi/ops/compat/temporal_shift_sig.cc diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc deleted file mode 100644 index 932e6c1f5eaca0..00000000000000 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ /dev/null @@ -1,164 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/
-
-#include "paddle/fluid/operators/temporal_shift_op.h"
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-
-class TemporalShiftOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"),
-                          ctx.GetPlace());
-  }
-};
-
-class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X",
-             "The input tensor of temporal shift operator. "
-             "This is a 4-D tensor with shape of [N*T, C, H, W] "
-             "or [N*T, H, W, C]. "
-             "While N is the batch size, T is the temporal segment "
-             "number, C is the channel number, H is the height of "
-             "features and W is the width of features. "
-             "The data type is float16, float32 and float64");
-    AddOutput("Out",
-              "The output tensor of temporal shift operator. "
-              "This is a 4-D tensor in the same shape with Input(X).");
-
-    AddAttr<int>("seg_num",
-                 "The temporal segment number, this should be a positive "
-                 "integer.");
-    AddAttr<float>(
-        "shift_ratio",
-        "The shift ratio of the channels, the first :attr:`shift_ratio` part "
-        "of channels will be shifted by -1 along the temporal dimension, "
-        "and the second :attr:`shift_ratio` part of channels will be shifted "
-        "by 1 along the temporal dimension. :attr:`shift_ratio` should be in "
-        "range [0, 0.5]. Default 0.25.")
-        .SetDefault(0.25);
-    AddAttr<std::string>(
-        "data_format",
-        "(string, default NCHW) Only used in "
-        "an optional string from: \"NHWC\", \"NCHW\". "
-        "Specify that the data format of the input and output data is "
-        "channel_first or channel_last.")
-        .SetDefault("NCHW");
-
-    AddComment(R"DOC(
-          This operator calculates the temporal shifting features for Input(X).
-
-          Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while
-          N is the batch size, T is the temporal segment number specified by
-          :attr:`seg_num`, C is the channel number, H and W is the height and
-          width of features.
-
-          Temporal Shifting is calculated as follows when data format is NCHW:
-
-          Step 1: Reshape Input(X) to [N, T, C, H, W].
-
-          Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with
-          padding width as 1 on each side, padding result will be in shape
-          of [N, T+2, C, H, W].
-
-          Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding
-          result as follows:
-
-          $$
-          slice1 = x[:, :T, :C/4, :, :]
-          $$
-          $$
-          slice2 = x[:, 2:T+2, C/4:C/2, :, :]
-          $$
-          $$
-          slice3 = x[:, 1:T+1, C/2:, :, :]
-          $$
-
-          Step 4: Concatenate three slices along the 3rd(C) dimension and
-          reshape result to [N*T, C, H, W].
-
-          For details of temporal shifting, please refer to paper:
-          `Temporal Shift Module <https://arxiv.org/abs/1811.08383>`_ .
-
-  )DOC");
-  }
-};
-
-class TemporalShiftOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"),
-                        ctx->GetInputDim(framework::GradVarName("Out")));
-    }
-  }
-
-  phi::KernelKey GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out")),
-                          ctx.GetPlace());
-  }
-};
-
-template <typename T>
-class TemporalShiftGradOpMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("temporal_shift_grad");
-    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
-    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-DECLARE_INFER_SHAPE_FUNCTOR(temporal_shift,
-                            TemporalShiftInferShapeFunctor,
-                            PD_INFER_META(phi::TemporalShiftInferMeta));
-REGISTER_OPERATOR(temporal_shift,
-                  ops::TemporalShiftOp,
-                  ops::TemporalShiftOpMaker,
-                  ops::TemporalShiftGradOpMaker<paddle::framework::OpDesc>,
-                  ops::TemporalShiftGradOpMaker<paddle::imperative::OpBase>,
-                  TemporalShiftInferShapeFunctor);
-REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad);
-REGISTER_OP_CPU_KERNEL(temporal_shift,
-                       ops::TemporalShiftKernel<float>,
-                       ops::TemporalShiftKernel<double>);
-REGISTER_OP_CPU_KERNEL(temporal_shift_grad,
-                       ops::TemporalShiftGradKernel<float>,
-                       ops::TemporalShiftGradKernel<double>);
diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml
index 6415dc4fc8cdcf..6c0184d7d0ced4 100644
--- a/paddle/phi/api/yaml/backward.yaml
+++ b/paddle/phi/api/yaml/backward.yaml
@@ -1863,6 +1863,17 @@
   inplace : (grad_x_grad_forward -> grad_out_forward_grad)
   optional : grad_out_new_grad, grad_out_grad_grad
 
+- backward_op : temporal_shift_grad
+  forward : temporal_shift(Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW") -> Tensor(out)
+  args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [out_grad]
+  kernel :
+    func : temporal_shift_grad
+    data_type : out_grad
+
 - backward_op : thresholded_relu_grad
   forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, float threshold)
diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index 178e8fcda94f2c..d810ad8bd9f032 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -1031,16 +1031,6 @@
   data_type : out_grad
   optional : reserve_space
 
-- backward_op : temporal_shift_grad
-  forward : temporal_shift(Tensor x, int seg_num, float shift_ratio, str data_format_str) -> Tensor(out)
-  args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format_str)
-  output : Tensor(x_grad)
-  infer_meta :
-    func : UnchangedInferMeta
-    param : [out_grad]
-  kernel :
-    func : temporal_shift_grad
-
 - backward_op : tile_double_grad
   forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x)
   args : (Tensor grad_x_grad, IntArray repeat_times)
diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml
index 
9ffbaa07bd076f..05abaf30445dab 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1229,15 +1229,6 @@ backward : sync_batch_norm_grad inplace : (mean -> mean_out), (variance -> variance_out) -- op : temporal_shift - args : (Tensor x, int seg_num, float shift_ratio, str data_format_str) - output : Tensor - infer_meta : - func : TemporalShiftInferMeta - kernel : - func : temporal_shift - backward : temporal_shift_grad - - op : tile args : (Tensor x, IntArray repeat_times = {}) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 471e257a6afdcb..ad183913832f90 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2347,3 +2347,10 @@ x : X outputs : out : Out + +- op: temporal_shift + backward: temporal_shift_grad + inputs : + x : X + outputs : + out : Out diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 328b0211450bf6..b482fcbcdcc347 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1885,6 +1885,16 @@ func : tanh_shrink backward : tanh_shrink_grad +- op : temporal_shift + args : (Tensor x, int seg_num, float shift_ratio = 0.25f, str data_format = "NCHW") + output : Tensor(out) + infer_meta : + func : TemporalShiftInferMeta + kernel : + func : temporal_shift + data_type : x + backward : temporal_shift_grad + - op : thresholded_relu args : (Tensor x, float threshold = 1.0) output : Tensor diff --git a/paddle/phi/ops/compat/temporal_shift_sig.cc b/paddle/phi/ops/compat/temporal_shift_sig.cc deleted file mode 100644 index a6eed22716ca7c..00000000000000 --- a/paddle/phi/ops/compat/temporal_shift_sig.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
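The slicing recipe spelled out in the docstrings of this commit translates almost literally into numpy. The sketch below assumes NCHW data and the default shift_ratio of 0.25, and is illustrative only — it is not the kernel implementation:

    import numpy as np

    def temporal_shift_nchw(x, seg_num, shift_ratio=0.25):
        nt, c, h, w = x.shape
        n, t = nt // seg_num, seg_num
        y = x.reshape(n, t, c, h, w)
        # pad the T dimension with one zero frame on each side
        y = np.pad(y, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)))
        c1, c2 = int(c * shift_ratio), int(c * 2 * shift_ratio)
        slice1 = y[:, :t, :c1]         # shifted by -1 in time
        slice2 = y[:, 2:t + 2, c1:c2]  # shifted by +1 in time
        slice3 = y[:, 1:t + 1, c2:]    # unshifted
        return np.concatenate([slice1, slice2, slice3], axis=2).reshape(nt, c, h, w)

    x = np.arange(96, dtype=np.float32).reshape(4, 4, 2, 3)  # N*T=4, C=4
    print(temporal_shift_nchw(x, seg_num=2).shape)  # (4, 4, 2, 3)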
-
-#include "paddle/phi/core/compat/op_utils.h"
-
-namespace phi {
-
-KernelSignature TemporalShiftOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("temporal_shift",
-                         {"X"},
-                         {"seg_num", "shift_ratio", "data_format"},
-                         {"Out"});
-}
-
-KernelSignature TemporalShiftGradOpArgumentMapping(
-    const ArgumentMappingContext& ctx) {
-  return KernelSignature("temporal_shift_grad",
-                         {"Out@GRAD"},
-                         {"seg_num", "shift_ratio", "data_format"},
-                         {"X@GRAD"});
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(temporal_shift, phi::TemporalShiftOpArgumentMapping);
-PD_REGISTER_ARG_MAPPING_FN(temporal_shift_grad,
-                           phi::TemporalShiftGradOpArgumentMapping);
diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py
index 67bc16ccddc6ac..c0e86267f34846 100644
--- a/python/paddle/nn/functional/extension.py
+++ b/python/paddle/nn/functional/extension.py
@@ -28,7 +28,6 @@
 from ...fluid.layer_helper import LayerHelper
 from ...framework import convert_np_dtype_to_dtype_, core
 from ...tensor.creation import assign
-from ...tensor.layer_function_generator import templatedoc
 
 __all__ = []
 
@@ -338,13 +337,44 @@ def gather_tree(ids, parents):
     return out
 
 
-@templatedoc()
 def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"):
     """
     **Temporal Shift Operator**
 
-    ${comment}
+    Calculate the temporal shifting features for Input(X).
+
+    Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while
+    N is the batch size, T is the temporal segment number specified by
+    :attr:`seg_num`, C is the channel number, H and W is the height and
+    width of features.
+
+    Temporal Shifting is calculated as follows when data format is NCHW:
+
+    Step 1: Reshape Input(X) to [N, T, C, H, W].
+
+    Step 2: Pad 0 to reshaping result in the 2nd(T) dimension with
+    padding width as 1 on each side, padding result will be in shape
+    of [N, T+2, C, H, W].
+
+    Step 3: Assume :attr:`shift_ratio` is :math:`1/4`, slice padding
+    result as follows:
+
+    $$
+    slice1 = x[:, :T, :C/4, :, :]
+    $$
+    $$
+    slice2 = x[:, 2:T+2, C/4:C/2, :, :]
+    $$
+    $$
+    slice3 = x[:, 1:T+1, C/2:, :, :]
+    $$
+
+    Step 4: Concatenate three slices along the 3rd(C) dimension and
+    reshape result to [N*T, C, H, W].
+
+    For details of temporal shifting, please refer to paper:
+    `Temporal Shift Module <https://arxiv.org/abs/1811.08383>`_ .
 
     Args:
         x(Tensor): ${x_comment}

From bd06be002f1c3196b035a34d04bfde82b196fe73 Mon Sep 17 00:00:00 2001
From: RedContritio
Date: Thu, 13 Apr 2023 15:42:26 +0800
Subject: [PATCH 132/156] support auto generate for op rmsprop optimizer
 (#52709)

---
 .../fluid/operators/optimizers/rmsprop_op.cc  | 112 ------------------
 .../optimizers/unity_build_rule.cmake         |   6 +-
 paddle/phi/api/yaml/legacy_ops.yaml           |  12 --
 paddle/phi/api/yaml/op_compat.yaml            |   6 +
 paddle/phi/api/yaml/ops.yaml                  |  12 ++
 paddle/phi/ops/compat/rmsprop_sig.cc          |  59 ---------
 6 files changed, 20 insertions(+), 187 deletions(-)
 delete mode 100644 paddle/fluid/operators/optimizers/rmsprop_op.cc
 delete mode 100644 paddle/phi/ops/compat/rmsprop_sig.cc

diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc
deleted file mode 100644
index 64be18ddee8c50..00000000000000
--- a/paddle/fluid/operators/optimizers/rmsprop_op.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/multiary.h"
-
-namespace paddle {
-namespace operators {
-
-class RmspropOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-};
-
-class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter value that has to be updated.");
-    AddInput("MeanSquare",
-             "(Tensor, default Tensor<float>)"
-             " The mean square value that gets updated.");
-    AddInput("MeanGrad",
-             "(Tensor, default Tensor<float>)"
-             " The moving average of gradient")
-        .AsDispensable();
-
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "The learning rate should be a tensor of size 1.");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter.");
-    AddInput("Moment",
-             "(Tensor, default Tensor<float>) The moment that gets updated.");
-    AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
-
-    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
-    AddOutput("MomentOut", "(Tensor) Output updated moment.");
-    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
-    AddOutput("MeanGradOut",
-              "(Tensor) Output moving average of gradient updated value.");
-    AddOutput("MasterParamOut",
-              "The updated FP32 master weight for AMP. "
-              "It shared memory with Input(MasterParam).")
-        .AsDispensable();
-
-    AddAttr<float>("epsilon",
-                   "(float, default 1e-10) Constant "
-                   "for numerical stability.")
-        .SetDefault(1.0e-10f);
-    AddAttr<float>("decay",
-                   "(float, default 0.9) "
-                   "Discounting factor for coming gradient.")
-        .SetDefault(0.9f);
-    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
-        .SetDefault(0.0f);
-    AddAttr<bool>("centered", "(bool, default false) use centered rmsprop.")
-        .SetDefault(false);
-    AddAttr<bool>("multi_precision",
-                  "(bool, default false) "
-                  "Whether to use multi-precision during weight updating.")
-        .SetDefault(false);
-    AddComment(R"DOC(
-Rmsprop Optimizer.
- -$$ -MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\ -MomentOut = momentum * Moment + - \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\ -ParamOut = Param - MomentOut -$$ - -if centered is true: - -mean_grad = decay * mean_square{t-1} + (1-decay) * gradient -mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2 -mom = momentum * mom{t-1} + learning_rate * g_t / - sqrt(mean_square - mean_grad**2 + epsilon) -param -= mom - -The original slides that proposed Rmsprop: Slide 29 of -http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(rmsprop, - RmspropInferShapeFunctor, - PD_INFER_META(phi::RmspropInferMeta)); -REGISTER_OP_WITHOUT_GRADIENT(rmsprop, - ops::RmspropOp, - ops::RmspropOpMaker, - RmspropInferShapeFunctor); diff --git a/paddle/fluid/operators/optimizers/unity_build_rule.cmake b/paddle/fluid/operators/optimizers/unity_build_rule.cmake index 6936175d8743b8..8f89abf1a09736 100644 --- a/paddle/fluid/operators/optimizers/unity_build_rule.cmake +++ b/paddle/fluid/operators/optimizers/unity_build_rule.cmake @@ -14,8 +14,7 @@ register_unity_group( proximal_gd_op.cc decayed_adagrad_op.cc adadelta_op.cc - dpsgd_op.cc - rmsprop_op.cc) + dpsgd_op.cc) register_unity_group( cu ftrl_op.cu @@ -27,5 +26,4 @@ register_unity_group( adam_op.cu decayed_adagrad_op.cu adadelta_op.cu - lamb_op.cu - rmsprop_op.cu) + lamb_op.cu) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 05abaf30445dab..ab84e0187757d6 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1070,18 +1070,6 @@ intermediate : xshape backward: reshape_grad -- op : rmsprop_ - args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon, float decay, float momentum, bool centered, bool multi_precision) - output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_out) - infer_meta : - func : RmspropInferMeta - kernel : - func : rmsprop {dense, dense, dense, dense, dense, dense, dense-> dense, dense, dense, dense, dense} - rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense, dense-> dense, dense, dense, dense, dense} - data_type : param - optional : mean_grad, master_param - inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param->master_param_out) - - op : rnn args: (Tensor x, Tensor[] pre_state, Tensor[] weight_list, Tensor sequence_length, Tensor dropout_state_in, float dropout_prob=0.0, bool is_bidirec=false, int input_size=10, int hidden_size=100, int num_layers=1, str mode="RNN_TANH", int seed=0, bool is_test=false) output: Tensor(out), Tensor(dropout_state_out), Tensor[](state){pre_state.size()}, Tensor(reserve) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index ad183913832f90..19069eeac9a58e 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -1790,6 +1790,12 @@ support_tensor : true manual_signature : [reverse] +- op : rmsprop_ + inputs : + {param: Param, mean_square: MeanSquare, mean_grad: MeanGrad, learning_rate: LearningRate, grad: Grad, moment: Moment, master_param: MasterParam} + outputs : + {param_out: ParamOut, moment_out: 
MomentOut, mean_square_out: MeanSquareOut, mean_grad_out: MeanGradOut, master_param_outs: MasterParamOut} + - op : roll backward : roll_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index b482fcbcdcc347..f1cc5d1b5395f7 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1511,6 +1511,18 @@ data_type : x backward : reverse_grad +- op : rmsprop_ + args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, Tensor master_param, float epsilon = 1.0e-10f, float decay = 0.9f, float momentum = 0.0f, bool centered = false, bool multi_precision = false) + output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out), Tensor(master_param_outs) + infer_meta : + func : RmspropInferMeta + kernel : + func : rmsprop {dense, dense, dense, dense, dense, dense, dense-> dense, dense, dense, dense, dense} + rmsprop_dense_param_sparse_grad {dense, dense, selected_rows, dense, dense, dense, dense-> dense, dense, dense, dense, dense} + data_type : param + optional : mean_grad, master_param, master_param_outs + inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out), (master_param->master_param_outs) + - op : roll args : (Tensor x, IntArray shifts={}, int64_t[] axis={}) output : Tensor(out) diff --git a/paddle/phi/ops/compat/rmsprop_sig.cc b/paddle/phi/ops/compat/rmsprop_sig.cc deleted file mode 100644 index b0027279fe6b1d..00000000000000 --- a/paddle/phi/ops/compat/rmsprop_sig.cc +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
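The update rule documented in the comment block of the file removed above (non-centered case) is compact enough to sanity-check in a few lines; this is an editorial sketch with made-up values, not the deleted kernel:

    import numpy as np

    decay, momentum, epsilon, lr = 0.9, 0.0, 1e-10, 0.1
    param = np.array([1.0, -2.0])
    grad = np.array([0.5, 0.5])
    mean_square = np.zeros_like(param)
    moment = np.zeros_like(param)

    # MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
    mean_square = decay * mean_square + (1 - decay) * grad * grad
    # MomentOut = momentum * Moment + lr * Grad / sqrt(MeanSquareOut + epsilon)
    moment = momentum * moment + lr * grad / np.sqrt(mean_square + epsilon)
    # ParamOut = Param - MomentOut
    param = param - moment
    print(param)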
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature RmspropOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature( - "rmsprop", - {"Param", - "MeanSquare", - "Grad", - "Moment", - "LearningRate", - "MeanGrad", - "MasterParam"}, - {"epsilon", "decay", "momentum", "centered", "multi_precision"}, - {"ParamOut", - "MomentOut", - "MeanSquareOut", - "MeanGradOut", - "MasterParamOut"}); - } else if (ctx.IsSelectedRowsInput("Grad")) { - return KernelSignature( - "rmsprop_dense_param_sparse_grad", - {"Param", - "MeanSquare", - "Grad", - "Moment", - "LearningRate", - "MeanGrad", - "MasterParam"}, - {"epsilon", "decay", "momentum", "centered", "multi_precision"}, - {"ParamOut", - "MomentOut", - "MeanSquareOut", - "MeanGradOut", - "MasterParamOut"}); - } - - return KernelSignature("unregistered", {}, {}, {}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(rmsprop, phi::RmspropOpArgumentMapping); From ea1c9b893c103ad5d1e81a059c0acd573fa95310 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 13 Apr 2023 16:21:36 +0800 Subject: [PATCH 133/156] refine force syncbn (#52860) --- paddle/fluid/eager/backward.cc | 39 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 2216b6b01427ee..0477c5d5f90d8b 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -372,32 +372,31 @@ std::vector RunBackward( auto add_next_node_func = [&node_in_degree_map, &queue](GradNodeBase* next_node) { - if (node_in_degree_map[next_node] == 0) { - if (dynamic_cast(next_node)) { - queue.push_front(std::move(next_node)); - } else { - queue.push_back(std::move(next_node)); - } + if (dynamic_cast(next_node)) { + queue.push_front(std::move(next_node)); + } else { + queue.push_back(std::move(next_node)); } }; - - if (force_sequential_nodes_set.count(next_node)) { - if (force_sequential_nodes_queue.front() == next_node) { - force_sequential_nodes_queue.pop_front(); - add_next_node_func(next_node); - while (ready_force_sequential_nodes.count( - force_sequential_nodes_queue.front())) { - ready_force_sequential_nodes.erase( - force_sequential_nodes_queue.front()); - add_next_node_func(force_sequential_nodes_queue.front()); + if (node_in_degree_map[next_node] == 0) { + if (force_sequential_nodes_set.count(next_node)) { + if (force_sequential_nodes_queue.front() == next_node) { force_sequential_nodes_queue.pop_front(); + add_next_node_func(next_node); + while (ready_force_sequential_nodes.count( + force_sequential_nodes_queue.front())) { + ready_force_sequential_nodes.erase( + force_sequential_nodes_queue.front()); + add_next_node_func(force_sequential_nodes_queue.front()); + force_sequential_nodes_queue.pop_front(); + } + } else { + ready_force_sequential_nodes.insert(next_node); + continue; } } else { - ready_force_sequential_nodes.insert(next_node); - continue; + add_next_node_func(next_node); } - } else { - add_next_node_func(next_node); } } } From 0f2dc4ca640688bd1e482abbc717039c4df504a4 Mon Sep 17 00:00:00 2001 From: csy0225 <78470701+csy0225@users.noreply.github.com> Date: Thu, 13 Apr 2023 16:44:39 +0800 Subject: [PATCH 134/156] Fix delete_isolated_node_pass problem (#52856) --- .../ir/xpu/delete_isolated_node_pass.cc | 23 ++++++------------- test/xpu/test_instance_norm_op_xpu.py | 5 +++- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git 
a/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc b/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc index b96fc8c31bb02b..c543045b5bc95a 100644 --- a/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc +++ b/paddle/fluid/framework/ir/xpu/delete_isolated_node_pass.cc @@ -50,8 +50,8 @@ class DeleteIsolatedNodePass : public Pass { std::unordered_set* delete_node_names) const; int UpdateControlFlowOp( + int current_graph_index, Graph* graph, - const std::map& block_id_graph_map, const std::unordered_set& delete_node_names) const; const std::map control_flow_op_input_map_{ @@ -86,20 +86,9 @@ void DeleteIsolatedNodePass::ApplyImpl(Graph* graph) const { LOG(INFO) << "--- delete " << delete_counts << " isolated nodes"; } - std::map block_id_graph_map; - for (size_t i = 0; i < graph->SubGraphsSize(); i++) { - auto* sub_graph = graph->GetSubGraph(i); - for (auto* node : sub_graph->Nodes()) { - if (node->IsVar()) { - block_id_graph_map[node->GetVarNodeBlockId()] = sub_graph; - break; - } - } - } int update_counts = 0; for (size_t i = 0; i < graph->SubGraphsSize(); i++) { - update_counts += UpdateControlFlowOp( - graph->GetSubGraph(i), block_id_graph_map, delete_node_names); + update_counts += UpdateControlFlowOp(i, graph, delete_node_names); } if (update_counts > 0) { LOG(INFO) << "--- update " << update_counts << " control flow ops"; @@ -129,6 +118,7 @@ int DeleteIsolatedNodePass::RemoveIsolatedNodes( for (auto* node : graph->Nodes()) { if (node->IsOp()) { block = node->Op()->Block(); + break; } } Scope& scope = graph->Get("__param_scope__"); @@ -160,11 +150,12 @@ int DeleteIsolatedNodePass::RemoveIsolatedNodes( } int DeleteIsolatedNodePass::UpdateControlFlowOp( + int current_graph_index, Graph* graph, - const std::map& block_id_graph_map, const std::unordered_set& delete_node_names) const { + auto* cur_graph = graph->GetSubGraph(current_graph_index); int update_counts = 0; - for (auto* node : graph->Nodes()) { + for (auto* node : cur_graph->Nodes()) { if (!node->IsOp()) continue; auto op_type = node->Op()->Type(); if (control_flow_op_input_map_.count(op_type) == 0) continue; @@ -181,7 +172,7 @@ int DeleteIsolatedNodePass::UpdateControlFlowOp( auto* sub_block = PADDLE_GET_CONST(framework::BlockDesc*, node->Op()->GetAttr("sub_block")); - auto* sub_graph = block_id_graph_map.at(sub_block->ID()); + auto* sub_graph = graph->GetSubGraph(sub_block->ID()); std::unordered_set sub_persistable_node_names; CollectReservedPersistableNodeNames(sub_graph, &sub_persistable_node_names); for (auto sub_name : sub_persistable_node_names) { diff --git a/test/xpu/test_instance_norm_op_xpu.py b/test/xpu/test_instance_norm_op_xpu.py index 5eb3e955deddf5..06714984a1b6d1 100644 --- a/test/xpu/test_instance_norm_op_xpu.py +++ b/test/xpu/test_instance_norm_op_xpu.py @@ -70,6 +70,9 @@ def setUp(self): self.epsilon = 1e-05 self.no_grad_set = None self.set_attrs() + self.atol = 1e-5 + if self.dtype == np.float16: + self.atol = 1e-2 np.random.seed(12345) epsilon = self.epsilon @@ -109,7 +112,7 @@ def set_attrs(self): pass def test_check_output(self): - self.check_output_with_place(paddle.XPUPlace(0)) + self.check_output_with_place(paddle.XPUPlace(0), atol=self.atol) def test_check_grad(self): self.check_grad_with_place( From 4341ebd9abbb6ca97d0c2e4a5f8129256414fc6f Mon Sep 17 00:00:00 2001 From: Ghost Screaming Date: Thu, 13 Apr 2023 16:53:51 +0800 Subject: [PATCH 135/156] Fix ignore index of c_softmax_with_cross_entropy_op. (#52835) * Fix bug of reduce_sum op. 
When input.numel() > INT32_MAX, its result is wrong. * Remove climits. * Fix bug of c_softmax_with_cross_entropy_op. Support ignore_index is negative number. --- .../c_softmax_with_cross_entropy_op.cu | 69 ++++++++++--------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index c37266a9b42fad..6a2dab9005a631 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -40,6 +40,7 @@ template __global__ void MaskLabelByIndex(T* predicted_logits, const T* logit, const IndexT* label, + const IndexT ignore_index, const int start_index, const int end_index, const int64_t N, @@ -47,13 +48,15 @@ __global__ void MaskLabelByIndex(T* predicted_logits, const int nranks) { CUDA_KERNEL_LOOP(i, N) { auto real_label = label[i]; - PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0), + PADDLE_ENFORCE(((real_label < D * nranks) && (real_label >= 0)) || + (real_label == ignore_index), "The index is out of bounds, " "please check whether the value of label and " "input meet the class number. It should " - "be less than [%d], but received [%d]", - D * nranks, - real_label); + "be less than [%ld] or equal to [%ld], but received [%ld]", + static_cast(D * nranks), + static_cast(ignore_index), + static_cast(real_label)); if (real_label >= start_index && real_label < end_index) { predicted_logits[i] = logit[i * D + real_label - start_index]; @@ -204,20 +207,22 @@ struct CSoftmaxWithCrossEntropyFunctor { const auto& label_type = framework::TransToProtoVarType(labels->dtype()); if (label_type == framework::proto::VarType::INT32) { - MaskLabelByIndex - <<>>(predicted_logits.data(), - softmax_2d.data(), - labels->data(), - start_index, - end_index, - N, - D, - nranks); + MaskLabelByIndex<<>>( + predicted_logits.data(), + softmax_2d.data(), + labels->data(), + static_cast(ignore_index), + start_index, + end_index, + N, + D, + nranks); } else if (label_type == framework::proto::VarType::INT64) { MaskLabelByIndex <<>>(predicted_logits.data(), softmax_2d.data(), labels->data(), + ignore_index, start_index, end_index, N, @@ -362,25 +367,27 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { const auto& label_type = framework::TransToProtoVarType(labels->dtype()); if (label_type == framework::proto::VarType::INT32) { - MaskLabelByIndex - <<>>(predicted_logits.data(), - softmax_2d.data(), - labels->data(), - start_index, - end_index, - N, - D, - nranks); + MaskLabelByIndex<<>>( + predicted_logits.data(), + softmax_2d.data(), + labels->data(), + static_cast(ignore_index), + start_index, + end_index, + N, + D, + nranks); } else if (label_type == framework::proto::VarType::INT64) { - MaskLabelByIndex - <<>>(predicted_logits.data(), - softmax_2d.data(), - labels->data(), - start_index, - end_index, - N, - D, - nranks); + MaskLabelByIndex<<>>( + predicted_logits.data(), + softmax_2d.data(), + labels->data(), + static_cast(ignore_index), + start_index, + end_index, + N, + D, + nranks); } in_out.clear(); From e05df02062100d22d057abb0732fc4ba9d3232bc Mon Sep 17 00:00:00 2001 From: cyber-pioneer <116002591+cyber-pioneer@users.noreply.github.com> Date: Thu, 13 Apr 2023 16:57:25 +0800 Subject: [PATCH 136/156] add batch_norm cinn case (#52815) --- test/prim/model/CMakeLists.txt | 3 + test/prim/model/test_prim_simplenet_cinn.py | 111 ++++++++++++++++++++ 2 files changed, 114 
insertions(+) create mode 100644 test/prim/model/test_prim_simplenet_cinn.py diff --git a/test/prim/model/CMakeLists.txt b/test/prim/model/CMakeLists.txt index 53a109cf41d7a5..908c5ce6254479 100644 --- a/test/prim/model/CMakeLists.txt +++ b/test/prim/model/CMakeLists.txt @@ -10,8 +10,11 @@ endforeach() set_tests_properties(test_resnet_prim_cinn PROPERTIES TIMEOUT 850) set_tests_properties(test_bert_prim_cinn PROPERTIES TIMEOUT 500) +set_tests_properties(test_prim_simplenet_cinn PROPERTIES TIMEOUT 120) if(WITH_CINN) set_tests_properties(test_resnet_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") set_tests_properties(test_bert_prim_cinn PROPERTIES LABELS "RUN_TYPE=CINN") + set_tests_properties(test_prim_simplenet_cinn PROPERTIES LABELS + "RUN_TYPE=CINN") endif() diff --git a/test/prim/model/test_prim_simplenet_cinn.py b/test/prim/model/test_prim_simplenet_cinn.py new file mode 100644 index 00000000000000..8a4a0861b0d864 --- /dev/null +++ b/test/prim/model/test_prim_simplenet_cinn.py @@ -0,0 +1,111 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +from paddle.fluid import core +from paddle.nn import BatchNorm + +np.random.seed(2023) + + +def apply_to_static(net, use_cinn): + build_strategy = paddle.static.BuildStrategy() + build_strategy.build_cinn_pass = use_cinn + return paddle.jit.to_static(net, build_strategy=build_strategy) + + +class PrimeNet(paddle.nn.Layer): + def __init__(self, shape): + super().__init__() + self.bn = BatchNorm(shape[-1], data_layout='NHWC', act="relu") + + def forward(self, data, dout): + y = self.bn(data) * dout + return y + + +class TestPrimForwardAndBackward(unittest.TestCase): + """ + Test PrimeNet with @to_static + prim forward + prim backward + cinn v.s Dygraph + """ + + def setUp(self): + self.data = None + self.dout = None + self.shape = None + + def train(self, use_prim): + paddle.seed(2022) + net = PrimeNet(self.shape) + sgd = paddle.optimizer.SGD( + learning_rate=1.0, parameters=net.parameters() + ) + core._set_prim_all_enabled(use_prim) + + net = paddle.amp.decorate(models=net, level='O2') + if use_prim: + net = apply_to_static(net, use_prim) + res = [] + with paddle.amp.auto_cast(level='O2'): + for _ in range(10): + out = net(self.data, self.dout) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_grad() + res.append(loss.numpy()) + self.check_prim(net, use_prim) + + return res + + def check_prim(self, net, use_prim): + if not use_prim: + return + fwd_ops = [ + op.type + for op in net.forward.get_concrete_program(self.data, self.dout)[1] + .train_program.block(0) + .ops + ] + + # Ensure that batch_norm is splitted into small ops + self.assertTrue('batch_norm' not in fwd_ops) + + def test_cinn_prim(self): + if paddle.device.get_device() == "cpu": + return + self.shape = (16, 112, 112, 64) + self.data = paddle.to_tensor( + np.random.random(self.shape).astype("float16") + ) + self.data.stop_gradient = False + self.dout = 
paddle.to_tensor(
+            np.random.random(self.shape).astype("float16")
+        )
+
+        dy2st_res = self.train(use_prim=False)
+        prim_res = self.train(use_prim=True)
+
+        for i in range(len(dy2st_res)):
+            np.testing.assert_allclose(
+                prim_res[i], dy2st_res[i], rtol=1e-3, atol=1e-3
+            )
+
+
+if __name__ == '__main__':
+    unittest.main()

From 802129b3c6b1185faca33ab656a0b4ca8ad1a154 Mon Sep 17 00:00:00 2001
From: Zman <35071129+Atlantisming@users.noreply.github.com>
Date: Thu, 13 Apr 2023 17:10:18 +0800
Subject: [PATCH 137/156] Add GaussianNLLLoss API. (#50843)

* Add GaussianNLLLoss API.

* Change `rtol` and `atol`. Check `var` in dynamic graph.

* remove assertTrue

* update unittest

* update unittest for ci-coverage. Add broadcast with same dim.

* Supply static error print.

* Repair note and example.

* Split unittest.

* empty commit.

* for standard commit.

* for standard commit.

* Add int dynamic graph test.

* Repair parameters name.

* Repair unittest parameters name.

* Repair unittest parameters name

* Repair unittest parameters name

* Repair unittest parameters name

* add square in code-block

* fit few notes.

* fit few notes.

* fit few notes.

* fit few notes.

* add few interpretations.

* add few interpretations.

* add few interpretations.

* fix import.

* fix space.

* empty commit for ci.

---
 .../tests/unittests/test_gaussian_nll_loss.py | 216 ++++++++++++++++++
 python/paddle/nn/__init__.py                  |   3 +
 python/paddle/nn/functional/__init__.py       |   3 +
 python/paddle/nn/functional/loss.py           | 161 +++++++++++++
 python/paddle/nn/layer/__init__.py            |   2 +
 python/paddle/nn/layer/loss.py                |  94 ++++++++
 6 files changed, 479 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/test_gaussian_nll_loss.py

diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_nll_loss.py b/python/paddle/fluid/tests/unittests/test_gaussian_nll_loss.py
new file mode 100644
index 00000000000000..1480c83eb26ae4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_nll_loss.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
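+
+# Note: `ref_gaussian_nll_loss` below is a NumPy reference implementation of
+#     0.5 * (log(max(var, eps)) + (input - label) ** 2 / max(var, eps))
+# (plus 0.5 * log(2 * pi) when `full` is True). The test cases compare both
+# paddle.nn.functional.gaussian_nll_loss and paddle.nn.GaussianNLLLoss against
+# this reference in dynamic and static graph modes, covering broadcasting,
+# reductions, and the expected error cases.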
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.fluid import core + +np.random.seed(10) + + +def ref_gaussian_nll_loss( + input, label, variance, full=False, eps=1e-6, reduction='none' +): + if variance.shape != input.shape: + if input.shape[:-1] == variance.shape: + variance = np.expand_dims(variance, -1) + elif ( + input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1 + ): + pass + else: + raise ValueError("variance is of incorrect size") + if reduction != 'none' and reduction != 'mean' and reduction != 'sum': + raise ValueError(reduction + " is not valid") + + if np.any(variance < 0): + raise ValueError("var has negative entry/entries") + + variance = variance.copy() + variance = np.clip(variance, a_min=eps, a_max=None) + + loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance) + if full: + loss += 0.5 * np.log(2 * np.pi) + + if reduction == 'none': + return loss + elif reduction == 'sum': + return [np.sum(loss)] + elif reduction == 'mean': + return [np.mean(loss)] + + +class TestGaussianNLLLossAPI(unittest.TestCase): + # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss + + def setUp(self, type=None): + self.shape = [10, 2] + if type in ['float16', 'float64', 'int32', 'int64']: + dtype = np.dtype(type) + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + elif type == 'broadcast1': + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + elif type == 'broadcast2': + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2, 1] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + else: + dtype = np.dtype('float32') + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + if type == 'test_err': + self.variance_np = -np.ones(self.shape).astype(np.float32) + + self.place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + + def test_dynamic_case(self, type=None, full=False, reduction='none'): + self.setUp(type) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + variance = paddle.to_tensor(self.variance_np) + if type in ['test_err', 'int32', 'int64']: + self.assertRaises( + ValueError, + paddle.nn.functional.gaussian_nll_loss, + input=input_x, + label=label, + variance=variance, + ) + else: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss( + full, reduction=reduction + ) + out2 = gaussian_nll_loss(input_x, label, variance) + + for r in [out1, out2]: + np.allclose(out_ref, r.numpy(), rtol=1e-5, atol=1e-5) + paddle.enable_static() + + def test_static_case(self, type=None, full=False, reduction='none'): + self.setUp(type) + paddle.enable_static() + with 
paddle.static.program_guard(paddle.static.Program()): + if type in ['int32', 'int64', 'float64']: + input_x = paddle.static.data('Input_x', self.shape, type) + label = paddle.static.data('Label', self.shape, type) + variance = paddle.static.data('Variance', self.shape, type) + elif type in ['broadcast1', 'broadcast2']: + input_x = paddle.static.data('Input_x', self.shape) + label = paddle.static.data('Label', self.shape) + variance = paddle.static.data('Variance', self.broadcast_shape) + else: + input_x = paddle.static.data('Input_x', self.shape, 'float32') + label = paddle.static.data('Label', self.shape, 'float32') + variance = paddle.static.data('Variance', self.shape, 'float32') + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss( + full, reduction=reduction + ) + out2 = gaussian_nll_loss(input_x, label, variance) + exe = paddle.static.Executor(self.place) + if type not in ['test_err', 'int32', 'int64']: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + res = exe.run( + feed={ + 'Input_x': self.input_np, + 'Label': self.label_np, + 'Variance': self.variance_np, + }, + fetch_list=[out1, out2], + ) + for r in res: + np.allclose(out_ref, r, rtol=1e-5, atol=1e-5) + else: + try: + res = exe.run( + feed={ + 'Input_x': self.input_np, + 'Label': self.label_np, + 'Variance': self.variance_np, + }, + fetch_list=[out1, out2], + ) + except ValueError: + pass + + def test_api(self): + self.test_dynamic_case() + self.test_static_case() + + def test_float64(self): + self.test_dynamic_case('float64') + self.test_static_case('float64') + + def test_broadcast(self): + self.test_dynamic_case('broadcast1') + self.test_static_case('broadcast1') + + def test_broadcast_with_same_dim(self): + self.test_dynamic_case('broadcast2') + self.test_static_case('broadcast2') + + def test_reduction(self): + self.test_dynamic_case(full=True, reduction='mean') + self.test_dynamic_case(full=True, reduction='sum') + self.test_static_case(full=True, reduction='mean') + + def test_error(self): + self.test_dynamic_case('test_err') + self.test_static_case('test_err') + + def test_int(self): + self.test_dynamic_case('int64') + self.test_dynamic_case('int32') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index fe7209ecf46dab..41d86c52980033 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -114,6 +114,8 @@ from .layer.loss import TripletMarginWithDistanceLoss from .layer.loss import TripletMarginLoss from .layer.loss import SoftMarginLoss +from .layer.loss import GaussianNLLLoss + from .layer.norm import BatchNorm # noqa: F401 from .layer.norm import SyncBatchNorm # noqa: F401 from .layer.norm import GroupNorm # noqa: F401 @@ -335,4 +337,5 @@ def weight_norm(*args): 'TripletMarginWithDistanceLoss', 'TripletMarginLoss', 'SoftMarginLoss', + 'GaussianNLLLoss', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 2a9d1390527c86..2eabd18d394b5c 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -99,6 +99,8 @@ from .loss import triplet_margin_with_distance_loss from .loss import triplet_margin_loss from .loss import soft_margin_loss +from .loss import gaussian_nll_loss + from .norm import batch_norm # noqa: F401 from .norm import instance_norm # noqa: F401 from 
.norm import layer_norm  # noqa: F401
@@ -248,4 +250,5 @@
     'triplet_margin_loss',
     'multi_margin_loss',
     'soft_margin_loss',
+    'gaussian_nll_loss',
 ]
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index ab11d302930a2f..e0f7d874be69c5 100644
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -18,6 +18,7 @@
 import paddle
 from paddle import _C_ops, _legacy_C_ops, fluid, in_dynamic_mode
 from paddle.framework import core
+from paddle.static.nn.control_flow import Assert
 from paddle.utils import deprecated
 
 from ...common_ops_import import Variable
@@ -4007,3 +4008,163 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
         return paddle.mean(out, name=name)
     else:
         return out
+
+
+def gaussian_nll_loss(
+    input,
+    label,
+    variance,
+    full=False,
+    epsilon=1e-6,
+    reduction='mean',
+    name=None,
+):
+    r"""Gaussian negative log likelihood loss.
+
+    Gaussian negative log likelihood loss among ``input``, ``variance`` and
+    ``label``. Note that the ``label`` is treated as samples from Gaussian distributions.
+    This function is used to train a neural network that predicts
+    the ``input`` and ``variance`` of a Gaussian distribution that ``label`` is assumed to
+    come from. This means ``input`` and ``variance`` should be functions (the neural network) of some inputs.
+
+    For a ``label`` following a Gaussian distribution with mean ``input`` and variance
+    ``variance`` predicted by the neural network, the loss is calculated as follows:
+
+    .. math::
+        \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
+        \ \text{epsilon}\right)\right) + \frac{\left(\text{input} - \text{label}\right)^2}
+        {\text{max}\left(\text{var}, \ \text{epsilon}\right)}\right) + \text{const.}
+
+    where :attr:`epsilon` is used for stability. By default, the constant term of
+    the loss function is omitted unless :attr:`full` is ``True``. If ``variance`` is not the same
+    size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension
+    of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting.
+
+    Args:
+        input (Tensor): input tensor, :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional
+            dimensions. Expectation of the Gaussian distribution, available dtype is float32, float64.
+        label (Tensor): target label tensor, :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input
+            but with one dimension equal to 1 (to allow for broadcasting). Sample from the Gaussian distribution, available dtype is float32, float64.
+        variance (Tensor): tensor of positive variance(s), :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but
+            with one dimension equal to 1, or same shape as the input but with one fewer
+            dimension (to allow for broadcasting). One for each of the expectations
+            in the input (heteroscedastic), or a single one (homoscedastic), available dtype is float32, float64.
+        full (bool, optional): include the constant term in the loss
+            calculation. Default: ``False``.
+        epsilon (float, optional): value used to clamp ``variance`` (see note below), for
+            stability. Default: 1e-6.
+        reduction (str, optional): specifies the reduction to apply to the
+            output: ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction
+            will be applied, ``'mean'``: the output is the average of all batch
+            member losses, ``'sum'``: the output is the sum of all batch member
+            losses. Default: ``'mean'``.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+
+        output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input``, else the shape of output is [1].
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+
+            input = paddle.randn([5, 2], dtype=paddle.float32)
+            label = paddle.randn([5, 2], dtype=paddle.float32)
+            variance = paddle.ones([5, 2], dtype=paddle.float32)
+
+            loss = F.gaussian_nll_loss(input, label, variance, reduction='none')
+            print(loss)
+
+            loss = F.gaussian_nll_loss(input, label, variance, reduction='mean')
+            print(loss)
+
+    Note:
+        The clamping of ``variance`` is ignored with respect to autograd, and so the
+        gradients are unaffected by it.
+    """
+
+    # Check variance shape
+    # If variance.shape == input.shape, the case is heteroscedastic and no further checks are needed.
+    # Otherwise:
+    if variance.shape != input.shape:
+        # If variance is one dimension short of input, but the shapes match otherwise, then this is a homoscedastic case.
+        # e.g. input.shape = (10, 2, 3), variance.shape = (10, 2)
+        # -> unsqueeze variance so that variance.shape = (10, 2, 1)
+        # this is done so that broadcasting can happen in the loss calculation
+        if input.shape[:-1] == variance.shape:
+            variance = paddle.unsqueeze(variance, -1)
+        # This checks if the shapes match up to the final dimension, and the final dimension of variance is of shape 1.
+        # This is also a homoscedastic case.
+        # e.g. input.shape = (10, 2, 3), variance.shape = (10, 2, 1)
+        elif (
+            input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1
+        ):  # Homoscedastic case
+            pass
+        # If none of the above pass, then the shape of variance is incorrect.
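+        # e.g. input.shape = (10, 2, 3) with variance.shape = (10, 3) (a
+        # hypothetical mismatch) falls through both branches above and is
+        # rejected here.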
+        else:
+            raise ValueError("variance is of incorrect shape")
+
+    # Check validity of reduction mode
+    if reduction != 'none' and reduction != 'mean' and reduction != 'sum':
+        raise ValueError(reduction + " is not valid")
+
+    check_variable_and_dtype(
+        input,
+        'Input',
+        ['float32', 'float64'],
+        'gaussian_nll_loss',
+    )
+    check_variable_and_dtype(
+        label,
+        'Label',
+        ['float32', 'float64'],
+        'gaussian_nll_loss',
+    )
+    check_variable_and_dtype(
+        variance,
+        'Variance',
+        ['float32', 'float64'],
+        'gaussian_nll_loss',
+    )
+    # Entries of variance must be non-negative
+    if not in_dygraph_mode():
+        condition = paddle.all(variance > 0)
+        Assert(condition, [variance], 6)
+    else:
+        if input.dtype not in [paddle.float32, paddle.float64]:
+            raise ValueError(
+                "The data type of input Variable must be 'float32' or 'float64'"
+            )
+        if label.dtype not in [
+            paddle.float32,
+            paddle.float64,
+        ]:
+            raise ValueError(
+                "The data type of label Variable must be 'float32', 'float64'"
+            )
+        if variance.dtype not in [paddle.float32, paddle.float64]:
+            raise ValueError(
+                "The data type of variance Variable must be 'float32', 'float64'"
+            )
+        if paddle.any(variance < 0):
+            raise ValueError("variance has negative entry/entries")
+
+    # Clamp for stability
+    variance = variance.clone()
+    with paddle.no_grad():
+        variance = paddle.clip(variance, min=epsilon)
+    # Calculate the loss
+    loss = 0.5 * (
+        paddle.log(variance) + paddle.square(input - label) / variance
+    )
+    if full:
+        loss += 0.5 * math.log(2 * math.pi)
+
+    if reduction == 'mean':
+        return paddle.mean(loss, name=name)
+    elif reduction == 'sum':
+        return paddle.sum(loss, name=name)
+    elif reduction == 'none':
+        return loss
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index 09b491b900d5ca..7bf1ab6b62ce44 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -85,6 +85,8 @@
 from .loss import TripletMarginLoss
 from .loss import SoftMarginLoss
 from .loss import MultiMarginLoss
+from .loss import GaussianNLLLoss
+
 from .norm import BatchNorm1D  # noqa: F401
 from .norm import BatchNorm2D  # noqa: F401
 from .norm import BatchNorm3D  # noqa: F401
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 57ee00608cce10..6fd186c882bd04 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -2046,3 +2046,97 @@ def forward(self, input, label):
             input, label, self.reduction, self.name
         )
         return out
+
+
+class GaussianNLLLoss(Layer):
+    r"""Create a callable object of 'GaussianNLLLoss' to calculate Gaussian negative log likelihood loss.
+
+    This class creates a callable object of Gaussian negative log likelihood loss among ``input``, ``variance`` and
+    ``label``. Note that the ``label`` is treated as samples from Gaussian distributions.
+    This class is used to train a neural network that predicts
+    the ``input`` and ``variance`` of a Gaussian distribution that ``label`` is assumed to
+    come from. This means ``input`` and ``variance`` should be functions (the neural network) of some inputs.
+
+    For a ``label`` following a Gaussian distribution with mean ``input`` and variance
+    ``variance`` predicted by the neural network, the loss is calculated as follows:
+
+    .. math::
+        \text{loss} = \frac{1}{2}\left(\log\left(\text{max}\left(\text{var},
+        \ \text{eps}\right)\right) + \frac{\left(\text{input} - \text{label}\right)^2}
+        {\text{max}\left(\text{var}, \ \text{eps}\right)}\right) + \text{const.}
+
+    where :attr:`epsilon` is used for stability. 
By default, the constant term of + the loss function is omitted unless :attr:`full` is ``True``. If ``variance`` is not the same + size as ``input`` (due to a homoscedastic assumption), it must either have a final dimension + of 1 or have one fewer dimension (with all other sizes being the same) for correct broadcasting. + + Args: + full (bool, optional): include the constant term in the loss + calculation. Default: ``False``, means omit the constant term. + epsilon (float, optional): value used to clamp ``variance`` (see note below), for + stability. Default: 1e-6. + reduction (str, optional): specifies the reduction to apply to the + output:``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction + will be applied, ``'mean'``: the output is the average of all batch + member losses, ``'sum'``: the output is the sum of all batch member + losses. Default: ``'mean'``. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - Input(Tensor): :math:`(N, *)` or :math:`(*)` where :math:`*` means any number of additional + dimensions. Available dtype is float32, float64. + - Label(Tensor): :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input + but with one dimension equal to 1 (to allow for broadcasting). Available dtype is float32, float64. + - Variance(Tensor): :math:`(N, *)` or :math:`(*)`, same shape as the input, or same shape as the input but + with one dimension equal to 1, or same shape as the input but with one fewer + dimension (to allow for broadcasting). Available dtype is float32, float64. + - Output: scalar if :attr:`reduction` is ``'mean'`` (default) or + ``'sum'``. If :attr:`reduction` is ``'none'``, then :math:`(N, *)`, same + shape as the input + + Returns: + A callable object of GaussianNLLLoss. + + Examples:: + .. code-block:: python + + import paddle + import paddle.nn as nn + + input = paddle.randn([5, 2], dtype=paddle.float32) + label = paddle.randn([5, 2], dtype=paddle.float32) + variance = paddle.ones([5, 2], dtype=paddle.float32) + + gs_nll_loss = nn.GaussianNLLLoss(full=False, epsilon=1e-6, reduction='none') + loss = gs_nll_loss(input, label, variance) + print(loss) + + Note: + The clamping of ``variance`` is ignored with respect to autograd, and so the + gradients are unaffected by it. + """ + + def __init__(self, full=False, epsilon=1e-6, reduction='mean', name=None): + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' in GaussianNLLLoss should be 'sum', 'mean' or 'none', but " + "received %s, which is not allowed." 
% reduction + ) + + super().__init__() + self.full = full + self.epsilon = epsilon + self.reduction = reduction + self.name = name + + def forward(self, input, label, variance): + out = F.gaussian_nll_loss( + input, + label, + variance, + self.full, + self.epsilon, + self.reduction, + self.name, + ) + return out From e64ce0bbbcface1f4af348790b957274ac8a5dfa Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 13 Apr 2023 17:14:33 +0800 Subject: [PATCH 138/156] move some function of cuda error from enforce.h to enforce.cc (#52828) --- paddle/phi/core/enforce.cc | 173 +++++++++++++++++++++++++++++++++++++ paddle/phi/core/enforce.h | 165 ++++------------------------------- 2 files changed, 188 insertions(+), 150 deletions(-) diff --git a/paddle/phi/core/enforce.cc b/paddle/phi/core/enforce.cc index 897ca5fe5c5ece..0fce1eee7005df 100644 --- a/paddle/phi/core/enforce.cc +++ b/paddle/phi/core/enforce.cc @@ -23,6 +23,10 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/utils/blank.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/phi/core/external_error.pb.h" +#endif // PADDLE_WITH_CUDA + DECLARE_int32(call_stack_level); namespace egr { @@ -177,5 +181,174 @@ std::string SimplifyErrorTypeFormat(const std::string& str) { return sout.str(); } +/**************************************************************************/ +/**************************** NVIDIA ERROR ********************************/ +#ifdef PADDLE_WITH_CUDA + +namespace details { + +template +struct ExternalApiProtoType {}; + +#define DEFINE_EXTERNAL_API_PROTO_TYPE(type, proto_type) \ + template <> \ + struct ExternalApiProtoType { \ + using Type = type; \ + static constexpr const char* kTypeString = #proto_type; \ + static constexpr phi::proto::ApiType kProtoType = \ + phi::proto::ApiType::proto_type; \ + } + +DEFINE_EXTERNAL_API_PROTO_TYPE(cudaError_t, CUDA); +DEFINE_EXTERNAL_API_PROTO_TYPE(curandStatus_t, CURAND); +DEFINE_EXTERNAL_API_PROTO_TYPE(cudnnStatus_t, CUDNN); +DEFINE_EXTERNAL_API_PROTO_TYPE(cublasStatus_t, CUBLAS); +DEFINE_EXTERNAL_API_PROTO_TYPE(cusparseStatus_t, CUSPARSE); +DEFINE_EXTERNAL_API_PROTO_TYPE(cusolverStatus_t, CUSOLVER); +DEFINE_EXTERNAL_API_PROTO_TYPE(cufftResult_t, CUFFT); +DEFINE_EXTERNAL_API_PROTO_TYPE(CUresult, CU); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +DEFINE_EXTERNAL_API_PROTO_TYPE(ncclResult_t, NCCL); +#endif + +#undef DEFINE_EXTERNAL_API_PROTO_TYPE + +} // namespace details + +template +inline const char* GetErrorMsgUrl(T status) { + using __CUDA_STATUS_TYPE__ = decltype(status); + phi::proto::ApiType proto_type = + details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kProtoType; + switch (proto_type) { + case phi::proto::ApiType::CUDA: + case phi::proto::ApiType::CU: + return "https://docs.nvidia.com/cuda/cuda-runtime-api/" + "group__CUDART__TYPES.html#group__CUDART__TYPES_" + "1g3f51e3575c2178246db0a94a430e0038"; + break; + case phi::proto::ApiType::CURAND: + return "https://docs.nvidia.com/cuda/curand/" + "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; + break; + case phi::proto::ApiType::CUDNN: + return "https://docs.nvidia.com/deeplearning/cudnn/api/" + "index.html#cudnnStatus_t"; + break; + case phi::proto::ApiType::CUBLAS: + return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; + break; + case phi::proto::ApiType::CUSOLVER: + return "https://docs.nvidia.com/cuda/cusolver/" + "index.html#cuSolverSPstatus"; + break; + case phi::proto::ApiType::NCCL: + return 
"https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" + "types.html#ncclresult-t"; + break; + case phi::proto::ApiType::CUFFT: + return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; + case phi::proto::ApiType::CUSPARSE: + return "https://docs.nvidia.com/cuda/cusparse/" + "index.html#cusparseStatus_t"; + break; + default: + return "Unknown type of External API, can't get error message URL!"; + break; + } +} + +template +std::string GetExternalErrorMsg(T status) { + std::ostringstream sout; + bool _initSucceed = false; + phi::proto::ExternalErrorDesc externalError; + if (externalError.ByteSizeLong() == 0) { + std::string filePath; +#if !defined(_WIN32) + Dl_info info; + if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { + std::string strModule(info.dli_fname); + const size_t last_slash_idx = strModule.find_last_of("/"); + std::string compare_path = strModule.substr(strModule.length() - 6); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.so") == 0) { + filePath = + strModule + + "/../include/third_party/externalError/data/externalErrorMsg.pb"; + } else { + filePath = strModule + + "/../../third_party/externalError/data/externalErrorMsg.pb"; + } + } +#else + char buf[512]; + MEMORY_BASIC_INFORMATION mbi; + HMODULE h_module = + (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) + ? (HMODULE)mbi.AllocationBase + : NULL; + GetModuleFileName(h_module, buf, 512); + std::string strModule(buf); + const size_t last_slash_idx = strModule.find_last_of("\\"); + std::string compare_path = strModule.substr(strModule.length() - 7); + if (std::string::npos != last_slash_idx) { + strModule.erase(last_slash_idx, std::string::npos); + } + if (compare_path.compare("avx.pyd") == 0) { + filePath = strModule + + "\\..\\include\\third_" + "party\\externalerror\\data\\externalErrorMsg.pb"; + } else { + filePath = + strModule + + "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; + } +#endif + std::ifstream fin(filePath, std::ios::in | std::ios::binary); + _initSucceed = externalError.ParseFromIstream(&fin); + } + using __CUDA_STATUS_TYPE__ = decltype(status); + phi::proto::ApiType proto_type = + details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kProtoType; + if (_initSucceed) { + for (int i = 0; i < externalError.errors_size(); ++i) { + if (proto_type == externalError.errors(i).type()) { + for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { + if (status == externalError.errors(i).messages(j).code()) { + sout << "\n [Hint: " + << externalError.errors(i).messages(j).message() << "]"; + return sout.str(); + } + } + } + } + } + + sout << "\n [Hint: Please search for the error code(" << status + << ") on website (" << GetErrorMsgUrl(status) + << ") to get Nvidia's official solution and advice about " + << details::ExternalApiProtoType<__CUDA_STATUS_TYPE__>::kTypeString + << " Error.]"; + return sout.str(); +} + +template std::string GetExternalErrorMsg(cudaError_t); +template std::string GetExternalErrorMsg(curandStatus_t); +template std::string GetExternalErrorMsg(cudnnStatus_t); +template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusparseStatus_t); +template std::string GetExternalErrorMsg(cusolverStatus_t); +template std::string GetExternalErrorMsg(cufftResult_t); +template std::string GetExternalErrorMsg(CUresult); +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +template std::string 
GetExternalErrorMsg(ncclResult_t); +#endif + +#endif // PADDLE_WITH_CUDA + } // namespace enforce } // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 96006fe83a42b8..d0b240e89417d2 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -33,8 +33,6 @@ limitations under the License. */ #include #include #include - -#include "paddle/phi/core/external_error.pb.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP @@ -90,7 +88,6 @@ limitations under the License. */ #endif // PADDLE_WITH_HIP // Note: these headers for simplify demangle type string -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -615,162 +612,30 @@ namespace details { template struct ExternalApiType {}; -#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - static constexpr const char* kTypeString = #proto_type; \ - static constexpr phi::proto::ApiType kProtoType = \ - phi::proto::ApiType::proto_type; \ +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ } -DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); -DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); -DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); -DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); -DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS, CUSPARSE); -DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); -DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); -DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS, CU); +DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess); +DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(cusparseStatus_t, CUSPARSE_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details template -inline const char* GetErrorMsgUrl(T status) { - using __CUDA_STATUS_TYPE__ = decltype(status); - phi::proto::ApiType proto_type = - details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; - switch (proto_type) { - case phi::proto::ApiType::CUDA: - case phi::proto::ApiType::CU: - return "https://docs.nvidia.com/cuda/cuda-runtime-api/" - "group__CUDART__TYPES.html#group__CUDART__TYPES_" - "1g3f51e3575c2178246db0a94a430e0038"; - break; - case phi::proto::ApiType::CURAND: - return "https://docs.nvidia.com/cuda/curand/" - "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; - break; - case phi::proto::ApiType::CUDNN: - return "https://docs.nvidia.com/deeplearning/cudnn/api/" - "index.html#cudnnStatus_t"; - break; - case phi::proto::ApiType::CUBLAS: - return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; - break; - case phi::proto::ApiType::CUSOLVER: - 
return "https://docs.nvidia.com/cuda/cusolver/" - "index.html#cuSolverSPstatus"; - break; - case phi::proto::ApiType::NCCL: - return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" - "types.html#ncclresult-t"; - break; - case phi::proto::ApiType::CUFFT: - return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; - case phi::proto::ApiType::CUSPARSE: - return "https://docs.nvidia.com/cuda/cusparse/" - "index.html#cusparseStatus_t"; - break; - default: - return "Unknown type of External API, can't get error message URL!"; - break; - } -} - -template -inline std::string GetExternalErrorMsg(T status) { - std::ostringstream sout; - bool _initSucceed = false; - phi::proto::ExternalErrorDesc externalError; - if (externalError.ByteSizeLong() == 0) { - std::string filePath; -#if !defined(_WIN32) - Dl_info info; - if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { - std::string strModule(info.dli_fname); - const size_t last_slash_idx = strModule.find_last_of("/"); - std::string compare_path = strModule.substr(strModule.length() - 6); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.so") == 0) { - filePath = - strModule + - "/../include/third_party/externalError/data/externalErrorMsg.pb"; - } else { - filePath = strModule + - "/../../third_party/externalError/data/externalErrorMsg.pb"; - } - } -#else - char buf[512]; - MEMORY_BASIC_INFORMATION mbi; - HMODULE h_module = - (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) - ? (HMODULE)mbi.AllocationBase - : NULL; - GetModuleFileName(h_module, buf, 512); - std::string strModule(buf); - const size_t last_slash_idx = strModule.find_last_of("\\"); - std::string compare_path = strModule.substr(strModule.length() - 7); - if (std::string::npos != last_slash_idx) { - strModule.erase(last_slash_idx, std::string::npos); - } - if (compare_path.compare("avx.pyd") == 0) { - filePath = strModule + - "\\..\\include\\third_" - "party\\externalerror\\data\\externalErrorMsg.pb"; - } else { - filePath = - strModule + - "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; - } -#endif - std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = externalError.ParseFromIstream(&fin); - } - using __CUDA_STATUS_TYPE__ = decltype(status); - phi::proto::ApiType proto_type = - details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; - if (_initSucceed) { - for (int i = 0; i < externalError.errors_size(); ++i) { - if (proto_type == externalError.errors(i).type()) { - for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { - if (status == externalError.errors(i).messages(j).code()) { - sout << "\n [Hint: " - << externalError.errors(i).messages(j).message() << "]"; - return sout.str(); - } - } - } - } - } - - sout << "\n [Hint: Please search for the error code(" << status - << ") on website (" << GetErrorMsgUrl(status) - << ") to get Nvidia's official solution and advice about " - << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString - << " Error.]"; - return sout.str(); -} - -template std::string GetExternalErrorMsg(cudaError_t); -template std::string GetExternalErrorMsg(curandStatus_t); -template std::string GetExternalErrorMsg(cudnnStatus_t); -template std::string GetExternalErrorMsg(cublasStatus_t); -template std::string GetExternalErrorMsg(cusparseStatus_t); -template std::string GetExternalErrorMsg(cusolverStatus_t); -template std::string GetExternalErrorMsg(cufftResult_t); 
-template std::string GetExternalErrorMsg(CUresult); -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -template std::string GetExternalErrorMsg(ncclResult_t); -#endif +std::string GetExternalErrorMsg(T status); /*************** CUDA ERROR ***************/ inline bool is_error(cudaError_t e) { return e != cudaSuccess; } From 2aaed989fbc6ec43a48af5fd0b67cb84223c4ac5 Mon Sep 17 00:00:00 2001 From: chenxujun Date: Thu, 13 Apr 2023 17:48:37 +0800 Subject: [PATCH 139/156] Add pixel_shuffle pixel_unshuffle fp16/bf16 (#52582) --- .../gpu/pixel_unshuffle_grad_kernel.cu | 4 +- .../phi/kernels/gpu/pixel_unshuffle_kernel.cu | 4 +- .../tests/unittests/test_pixel_shuffle_op.py | 62 ++++++++++++++++- .../tests/unittests/test_pixel_unshuffle.py | 67 ++++++++++++++++++- python/paddle/nn/functional/vision.py | 4 +- 5 files changed, 134 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu index d7d2cde4ebade0..830d91452ffd4f 100644 --- a/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pixel_unshuffle_grad_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(pixel_unshuffle_grad, ALL_LAYOUT, phi::PixelUnshuffleGradKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu index fcc53cbee1ecb3..cfe71b4f0f39be 100644 --- a/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu +++ b/paddle/phi/kernels/gpu/pixel_unshuffle_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(pixel_unshuffle, ALL_LAYOUT, phi::PixelUnshuffleKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle_op.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle_op.py index b12f9c19d501df..aa2ba1895a6cab 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle_op.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn.functional as F @@ -64,6 +64,7 @@ class TestPixelShuffleOp(OpTest): def setUp(self): self.op_type = "pixel_shuffle" self.python_api = paddle.nn.functional.pixel_shuffle + self.init_dtype() self.init_data_format() n, c, h, w = 2, 9, 4, 4 @@ -74,13 +75,16 @@ def setUp(self): up_factor = 3 - x = np.random.random(shape).astype("float64") + x = np.random.random(shape).astype(self.dtype) npresult = pixel_shuffle_np(x, up_factor, self.format) self.inputs = {'X': x} self.outputs = {'Out': npresult} self.attrs = {'upscale_factor': up_factor, "data_format": self.format} + def init_dtype(self): + self.dtype = np.float64 + def init_data_format(self): self.format = "NCHW" @@ -99,6 +103,60 @@ def init_data_format(self): self.format = "NHWC" +class TestPixelShuffleFP16Op(TestPixelShuffleOp): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestPixelShuffleBF16Op(OpTest): + def setUp(self): + self.op_type = "pixel_shuffle" + self.python_api = paddle.nn.functional.pixel_shuffle + self.init_dtype() + self.init_data_format() + n, c, h, w = 2, 9, 4, 4 + + if self.format == "NCHW": + shape = [n, c, h, w] + 
if self.format == "NHWC": + shape = [n, h, w, c] + + up_factor = 3 + + x = np.random.random(shape).astype(self.np_dtype) + npresult = pixel_shuffle_np(x, up_factor, self.format) + + self.inputs = {'X': x} + self.outputs = {'Out': npresult} + self.attrs = {'upscale_factor': up_factor, "data_format": self.format} + + self.place = core.CUDAPlace(0) + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def init_data_format(self): + self.format = "NCHW" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + ) + + class TestPixelShuffleAPI(unittest.TestCase): def setUp(self): self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") diff --git a/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py index 5d1f9907ecb9e5..b2cfd457603c42 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_unshuffle.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn.functional as F @@ -82,6 +82,7 @@ def setUp(self): self.op_type = "pixel_unshuffle" self.python_api = pixel_unshuffle_wrapper + self.init_dtype() self.init_data_format() n, c, h, w = 2, 1, 12, 12 @@ -92,7 +93,7 @@ def setUp(self): down_factor = 3 - x = np.random.random(shape).astype("float64") + x = np.random.random(shape).astype(self.dtype) npresult = pixel_unshuffle_np(x, down_factor, self.format) self.inputs = {"X": x} @@ -102,6 +103,9 @@ def setUp(self): "data_format": self.format, } + def init_dtype(self): + self.dtype = np.float64 + def init_data_format(self): '''init_data_format''' @@ -127,6 +131,65 @@ def init_data_format(self): self.format = "NHWC" +class TestPixelUnshuffleFP16Op(TestPixelUnshuffleOp): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestPixelUnshuffleBP16Op(OpTest): + '''TestPixelUnshuffleBP16Op''' + + def setUp(self): + self.op_type = "pixel_unshuffle" + self.python_api = pixel_unshuffle_wrapper + self.init_dtype() + self.init_data_format() + n, c, h, w = 2, 1, 12, 12 + + if self.format == "NCHW": + shape = [n, c, h, w] + if self.format == "NHWC": + shape = [n, h, w, c] + + down_factor = 3 + + x = np.random.random(shape).astype(self.np_dtype) + npresult = pixel_unshuffle_np(x, down_factor, self.format) + + self.inputs = {"X": x} + self.outputs = {"Out": npresult} + self.attrs = { + "downscale_factor": down_factor, + "data_format": self.format, + } + + self.place = core.CUDAPlace(0) + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def init_data_format(self): + self.format = "NCHW" + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + ) + + class TestPixelUnshuffleAPI(unittest.TestCase): '''TestPixelUnshuffleAPI''' diff --git 
a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 80e1a176b7662d..03d94b91abb7ff 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -443,7 +443,9 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): ) helper = LayerHelper("pixel_unshuffle", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_unshuffle') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64', 'uint16'], 'pixel_unshuffle' + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="pixel_unshuffle", From 28de4558c947f3c4d02ac0cffc73d63b21f8916b Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:05:37 +0800 Subject: [PATCH 140/156] Add TensorCheckerConfig for debugging tools (#51906) --- python/paddle/amp/debugging.py | 275 ++++++++++++++++++ .../fluid/tests/unittests/test_nan_inf_dir.py | 125 ++++++++ 2 files changed, 400 insertions(+) diff --git a/python/paddle/amp/debugging.py b/python/paddle/amp/debugging.py index 114edc2bd3cda1..1e0db8884e4965 100644 --- a/python/paddle/amp/debugging.py +++ b/python/paddle/amp/debugging.py @@ -13,8 +13,14 @@ # limitations under the License. import contextlib +import os +import random +from enum import Enum + +import numpy as np import paddle +from paddle.fluid import core from paddle.fluid.framework import dygraph_only __all__ = [ @@ -24,6 +30,217 @@ ] +class DebugMode(Enum): + CHECK_NAN_INF_AND_ABORT = 0 + CHECK_NAN_INF = 1 + CHECK_ALL_FOR_OVERFLOW = 2 + CHECK_ALL = 3 + CHECK_ALL_AND_ABORT = 4 + DUMP_ALL = 5 + + +class TensorCheckerConfig: + """ + Collect the config for checking nan and inf in module or op tensor. + + Args: + * enable: Whether to enable Tensor's value detection function. The default value is False, which means that these tools will never be used. + + * debug_mode: Debug mode,There are 6 kinds of debug mode. + CHECK_NAN_INF_AND_ABORT(default): Print or save Tensor key information with NaN/Inf and interrupt the program + CHECK_NAN_INF: Print or save Tensor critical information with NaN/Inf, but continue to run + CHECK_ALL_AND_ABORT: Print or save the output Tensor key information of all operators, and interrupt the program if NaN/Inf occurs + CHECK_ALL_FOR_OVERFLOW: Check the output of the FP32 operator, print or save key Tensor information that exceeds the FP16 representation range (overflow, underflow) + CHECK_ALL: Print or save output Tensor key information for all operators + DUMP_ALL: Saves all Tensor data. This mode does not print on the terminal + + * dump_dir: The collection data storage path. If it is None, it will be directly printed to the terminal + + * checked_op_list: A list of operators you want to check + + * skipped_op_list: A list of operators to skip checking + + * debug_step: The iteration scope of debugging + + * stack_height_limit: The maximum depth of the call stack, and supports printing the call stack at the error location. The specific scheme needs to be investigated + + * enable_traceback_filtering: Whether to filter the traceback. The main purpose is to filter out the internal code call stack of the framework and only display the user code call stack + + Examples: + .. 
code-block:: python + import paddle + + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT) + paddle.amp.debugging.enable_tensor_checker(checker_config) + + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + + paddle.autograd.backward(res, retain_graph=True) + paddle.amp.debugging.disable_tensor_checker() + + """ + + # For module debugging + Current_step_id = 0 + + def __init__( + self, + enable, + debug_mode=DebugMode.CHECK_NAN_INF_AND_ABORT, + dump_dir=None, + checked_op_list=None, + skipped_op_list=None, + debug_step=None, + stack_height_limit=3, + enable_traceback_filtering=False, + ): + + self.enable = enable + self.debug_mode = debug_mode + self.dump_dir = dump_dir + + self.checked_op_list = checked_op_list + self.skipped_op_list = skipped_op_list + + self.debug_step = debug_step + self.stack_height_limit = stack_height_limit + + self.enable_traceback_filtering = enable_traceback_filtering + + self.start_step = None + self.end_step = None + + self.seed = 123 + self.initial_seed = 123 + + # check debug_step + if debug_step is not None: + if isinstance(debug_step, (tuple, list)): + assert ( + len(self.debug_step) == 2 + and self.debug_step[1] > self.debug_step[0] + ) + self.start_step, self.end_step = self.debug_step + self.start_step = max(self.start_step, 0) + else: + raise ValueError("debug_step must be list or tuple") + + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + self.initial_seed = core.default_cuda_generator( + i + ).initial_seed() + elif core.is_compiled_with_xpu(): + for i in range(core.get_xpu_device_count()): + self.initial_seed = core.default_xpu_generator(i).initial_seed() + + self.initial_seed = core.default_cpu_generator().initial_seed() + + # check debug_mode + if self.debug_mode.name not in DebugMode.__members__: + raise ValueError( + "debug_mode in DebugMode", + self.debug_mode, + DebugMode.__members__, + ) + + # check checked_op_list + if self.checked_op_list is not None: + if isinstance(self.checked_op_list, (list, tuple)): + check_op_list = ",".join( + value for value in self.checked_op_list + ) + os.environ["Paddle_check_nan_inf_op_list"] = str(check_op_list) + else: + raise ValueError("checked_op_list must be list or tuple") + + # check skipped_op_list + if self.skipped_op_list is not None: + if isinstance(self.skipped_op_list, (list, tuple)): + skipped_op_list = ",".join( + value for value in self.skipped_op_list + ) + os.environ["Paddle_skip_nan_inf_op_list"] = str(skipped_op_list) + else: + raise ValueError("skipped_op_list must be list or tuple") + + if self.enable: + self._set_seed(self.enable) + + def keep_random(self, seed, flag): + # get random seed + self.seed = seed + paddle.seed(self.seed) + np.random.seed(self.seed) + random.seed(self.seed) + + # set cudnn and cpu + if core.is_compiled_with_cuda(): + paddle.set_flags({"FLAGS_cudnn_deterministic": flag}) + paddle.set_flags({"FLAGS_cpu_deterministic": flag}) + + # info + print("AMP Debugging TensorCheckerConfig: seed ", self.seed) + print( + "AMP Debugging TensorCheckerConfig: FLAGS_cudnn_deterministic is ", + flag, + ) + print( + "AMP Debugging TensorCheckerConfig: FLAGS_cpu_deterministic is ", + flag, + ) + + def _set_seed(self, enable): + if self.initial_seed != self.seed: + self.seed = self.initial_seed + if self.seed > 4294967295 or self.seed < 0: 
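+ # 4294967295 is 2**32 - 1: seeds outside the unsigned 32-bit range fall back to the default seed 123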
+ print("[Warning] Seed must be between 0 and 2**32 - 1") + self.seed = 123 + self.keep_random(self.seed, True) + + def _set_env(self, check_flag): + paddle.set_flags({"FLAGS_check_nan_inf": check_flag}) + if check_flag: + # set debug level + paddle.set_flags( + {"FLAGS_check_nan_inf_level": self.debug_mode.value} + ) + + # set output_dir + if self.dump_dir is not None: + paddle.fluid.core.set_nan_inf_debug_path(self.dump_dir) + + # set stack_height_limit + if isinstance(self.stack_height_limit, (int)): + paddle.set_flags( + {"FLAGS_call_stack_level": self.stack_height_limit} + ) + else: + raise ValueError("stack_height_limit must be int") + + def check(self): + if self.enable: + if self.start_step is not None and self.end_step is not None: + if ( + self.start_step > TensorCheckerConfig.Current_step_id + or TensorCheckerConfig.Current_step_id >= self.end_step + ): + return False + else: + TensorCheckerConfig.Current_step_id += 1 + return True + return False + + def run(self): + if self.enable: + self._set_env(self.enable) + + def end(self): + self._set_env(False) + + def _get_operator_stats_flag(): flags = paddle.get_flags(["FLAGS_low_precision_op_list"]) return flags["FLAGS_low_precision_op_list"] @@ -188,3 +405,61 @@ def collect_operator_stats(): enable_operator_stats_collection() yield disable_operator_stats_collection() + + +def enable_tensor_checker(checker_config): + """ + enable_tensor_checker(checker_config) enables model-level accuracy checking. It is used together with disable_tensor_checker() to achieve model-level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range. + + Attention: + + * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked; + + * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked + + Examples: + .. code-block:: python + import paddle + + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT) + paddle.amp.debugging.enable_tensor_checker(checker_config) + + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + paddle.autograd.backward(res, retain_graph=True) + + paddle.amp.debugging.disable_tensor_checker() + """ + if checker_config.check(): + checker_config.run() + else: + checker_config.end() + + +def disable_tensor_checker(): + """ + disable_tensor_checker() disables the accuracy checking. It is used together with enable_tensor_checker(config) to achieve model-level precision checking through the combination of these two APIs, checking the output Tensors of all operators within the specified range. + + Attention: + + * If disable_tensor_checker() is called before loss.backward(), the gradient operator is not checked; + + * If disable_tensor_checker() is called before optimizer.step(), the optimizer and other weight update related operators will not be checked + + Examples: + ..
code-block:: python + import paddle + + checker_config = paddle.amp.debugging.TensorCheckerConfig(enable=True, debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT) + paddle.amp.debugging.enable_tensor_checker(checker_config) + + x = paddle.to_tensor([1, 0, 3], place=paddle.CPUPlace(), dtype='float32', stop_gradient=False) + y = paddle.to_tensor([0.2, 0, 0.5], place=paddle.CPUPlace(), dtype='float32') + res = paddle.pow(x, y) + paddle.autograd.backward(res, retain_graph=True) + + paddle.amp.debugging.disable_tensor_checker() + + """ + paddle.set_flags({"FLAGS_check_nan_inf": 0}) diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py index 6d2fa6a84add86..425dc9a7e997e8 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf_dir.py @@ -100,6 +100,131 @@ def _check_num_nan_inf(use_cuda): x = paddle.to_tensor([2, 3, 4], 'float32') y = paddle.to_tensor([1, 5, 2], 'float32') z = paddle.add(x, y) + path = "" + paddle.fluid.core.set_nan_inf_debug_path(path) + + def test_nan_inf_op(self): + import paddle + + num_nan = 0 + num_inf = 0 + # check op list + x = paddle.to_tensor( + [1, 0, 1], + place=paddle.CPUPlace(), + dtype='float32', + stop_gradient=False, + ) + y = paddle.to_tensor( + [0.2, -1, 0.5], place=paddle.CPUPlace(), dtype='float32' + ) + try: + res = paddle.pow(x, y) + except Exception as e: + # Cannot catch the log in CUDA kernel.
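+ # so parse the num_nan/num_inf counters out of the exception message instead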
+ err_str_list = ( + str(e) + .replace("(", " ") + .replace(")", " ") + .replace(",", " ") + .split(" ") + ) + for err_str in err_str_list: + if "num_nan" in err_str: + num_nan = int(err_str.split("=")[1]) + elif "num_inf" in err_str: + num_inf = int(err_str.split("=")[1]) + print( + "[CHECK_NAN_INF_AND_ABORT] num_nan={}, num_inf={}".format( + num_nan, num_inf + ) + ) + paddle.amp.debugging.enable_tensor_checker(checker_config) + + def test_tensor_checker(self): + import paddle + + def _assert_flag(value): + flags = ['FLAGS_check_nan_inf', 'FLAGS_check_nan_inf_level'] + res = paddle.get_flags(flags) + assert res["FLAGS_check_nan_inf"] == value + + paddle.set_flags({"FLAGS_check_nan_inf": 0}) + paddle.seed(102) + checker_config = paddle.amp.debugging.TensorCheckerConfig( + enable=True, + debug_mode=paddle.amp.debugging.DebugMode.CHECK_NAN_INF_AND_ABORT, + checked_op_list=["elementwise_pow"], + skipped_op_list=["elementwise_add"], + debug_step=[0, 3], + ) + # check seed + assert checker_config.initial_seed == 102 + assert checker_config.seed == 102 + _assert_flag(False) + for index in range(5): + paddle.amp.debugging.enable_tensor_checker(checker_config) + if index <= 2: + _assert_flag(True) + assert ( + index + 1 + == paddle.amp.debugging.TensorCheckerConfig.Current_step_id + ) + assert 1 == self.test_nan_inf_op() + else: + assert ( + 3 + == paddle.amp.debugging.TensorCheckerConfig.Current_step_id + ) + _assert_flag(False) + assert 0 == self.test_nan_inf_op() + paddle.amp.debugging.disable_tensor_checker() + _assert_flag(False) if __name__ == '__main__': From 9dc7e5ef4f0293e47a34922ee230c9ff93708d62 Mon Sep 17 00:00:00 2001 From: Leo Guo <58431564+ZibinGuo@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:30:13 +0800 Subject: [PATCH 141/156] Fix the parameter check error in rmsprop_kernel_xpu. (#52866) --- paddle/phi/kernels/xpu/rmsprop_kernel.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/phi/kernels/xpu/rmsprop_kernel.cc b/paddle/phi/kernels/xpu/rmsprop_kernel.cc index e3bf5439c0b63b..5aac7a279c8300 100644 --- a/paddle/phi/kernels/xpu/rmsprop_kernel.cc +++ b/paddle/phi/kernels/xpu/rmsprop_kernel.cc @@ -41,12 +41,6 @@ void RmspropDenseKernel(const Context& dev_ctx, DenseTensor* mean_grad_out, DenseTensor* master_param_outs) { // copy learning_rate to cpu - PADDLE_ENFORCE_EQ( - learning_rate.dims().size(), - 1, - errors::InvalidArgument("learining rate should have dimension = 1." 
- " But received learning rate dim [%s] ", - learning_rate.dims().size())); T learning_rate_cpu = 0.0f; memory_utils::Copy(CPUPlace(), static_cast(&learning_rate_cpu), From dc8d6a1a6956afc41ba0d0cdf8e98b40a10094ce Mon Sep 17 00:00:00 2001 From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com> Date: Thu, 13 Apr 2023 18:33:49 +0800 Subject: [PATCH 142/156] [Paddle-TRT]fix bilinear_interp_v2 && some other bugs in trt 7011 (#52753) * fix bilinear_interp_v2 && some other bugs in trt 7011 * add version check in test_trt_convert_bilinear_interp_v2.py --- paddle/fluid/inference/tensorrt/engine.cc | 4 ++-- paddle/fluid/inference/tensorrt/op_teller.cc | 4 ++++ .../plugin/elementwiseadd_transpose_op_plugin.cu | 13 ++++++++++--- .../test_trt_convert_bilinear_interp_v2.py | 5 ++++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 6a116c6cb75901..cabad0bd2df311 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -236,8 +236,8 @@ void TensorRTEngine::FreezeNetwork() { LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (int i = 0; i < max_profile_num_; i++) { for (auto &input : min_input_shape_) { -#if IS_TRT_VERSION_LT(7000) - // trt6 will check all_of input > 0 +#if IS_TRT_VERSION_LT(7100) + // trt6/trt7011 will check all_of input > 0 if (!(std::all_of(input.second.begin(), input.second.end(), [](int x) { return x > 0; }) && diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 9ce57fe6aee912..d9605bb18e4508 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -917,6 +917,10 @@ struct SimpleOpTypeSetTeller : public Teller { } if (op_type == "bilinear_interp_v2") { + // trt 7011 result in test_solov2_trt_fp32.py TRT fp32 diff +#if IS_TRT_VERSION_LT(7100) + return false; +#endif std::vector attrs{"data_layout", "interp_method", "align_corners", diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu index f12c4d951cf71e..d2f373bca07de8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwiseadd_transpose_op_plugin.cu @@ -86,9 +86,16 @@ bool ElementwiseAddTransposePluginDynamic::supportsFormatCombination( } // output 0 if (pos == 2) { - return (in.type == in_out[0].type) && - (in.format == nvinfer1::TensorFormat::kLINEAR || - in.format == nvinfer1::TensorFormat::kHWC8); + // 7.0.0.11 test_pcpvt_base_trt_fp16.py failed if support C8. 
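+ // (kHWC8 is therefore only advertised when TRT >= 7.1; see the version guard below)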
+ // Only support linear format in lower versions of TRT +#if IS_TRT_VERSION_GE(7100) + bool support_format = in.format == nvinfer1::TensorFormat::kLINEAR || + in.format == nvinfer1::TensorFormat::kHWC8; +#else + bool support_format = in.format == nvinfer1::TensorFormat::kLINEAR; +#endif + + return (in.type == in_out[0].type) && (support_format); } } void ElementwiseAddTransposePluginDynamic::configurePlugin( diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py index f93b598de71b82..148776365c0911 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bilinear_interp_v2.py @@ -30,7 +30,10 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - + ver = paddle_infer.get_trt_compile_version() + # here is consistent with op_teller.cc + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7100: + return False return True def sample_program_configs(self): From e0e044c0137814f130b9945498b85a7490083d46 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:28:48 +0800 Subject: [PATCH 143/156] [AMP OP&Test] Support fp16&bf16 in reduce_max (#52862) * [AMP OP&Test] Support fp16&bf16 in reduce_max --- .../phi/kernels/gpu/reduce_max_grad_kernel.cu | 62 +++++++++++++++++- paddle/phi/kernels/kps/reduce_max_kernel.cu | 12 +++- paddle/phi/kernels/reduce_max_kernel.cc | 15 ++++- .../fluid/tests/unittests/test_reduce_op.py | 63 +++++++++++++------ python/paddle/tensor/math.py | 5 +- 5 files changed, 132 insertions(+), 25 deletions(-) diff --git a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu index b4ff277b5026ce..7b4472c5223182 100644 --- a/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_max_grad_kernel.cu @@ -16,7 +16,63 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" + +namespace phi { + +template +void ReduceMaxGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + dev_ctx.Alloc(x_grad, x.dtype()); + reduce_all = recompute_reduce_all(x, dims, reduce_all); + + // get reduce_dim + int dim_size = x.dims().size(); + auto reduce_dims = + funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + auto update_dims = vectorize(x.dims()); + for (auto i : reduce_dims) { + update_dims[i] = 1; + } + + // make new tensor of out and out_grad + phi::DenseTensor new_out(out.type()); + new_out.ShareDataWith(out); + new_out.Resize(phi::make_ddim(update_dims)); + + phi::DenseTensor new_out_grad(out_grad.type()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(x.dims()); + dev_ctx.template Alloc(equal_out); 
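+ // equal_out marks the positions where x equals the reduced max; only those positions receive gradient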
+ + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_out, &x}; + std::vector equal_outputs = {equal_out}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + + // 2. dx = dout * 1 + std::vector mul_inputs = {&new_out_grad, equal_out}; + std::vector mul_outputs = {x_grad}; + funcs::BroadcastKernel( + dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor()); + delete equal_out; +} +} // namespace phi PD_REGISTER_KERNEL(max_grad, GPU, @@ -25,4 +81,6 @@ PD_REGISTER_KERNEL(max_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu index 9c0fdb52c42790..a03035dcf1932d 100644 --- a/paddle/phi/kernels/kps/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -36,6 +36,14 @@ void MaxRawKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} #else -PD_REGISTER_KERNEL( - max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(max_raw, + KPS, + ALL_LAYOUT, + phi::MaxRawKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} #endif diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index 23da5bd4cd54ed..7892fc879c713c 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -34,7 +34,20 @@ void MaxKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(max, + GPU, + ALL_LAYOUT, + phi::MaxKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif + +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL( max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} #endif diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 01b25b543117c8..050879369244d4 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -251,18 +251,6 @@ def test_check_grad(self): only_check_prim=True, ) - def test_raise_error(self): - if core.is_compiled_with_cuda(): - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float16")} - place = core.CUDAPlace(0) - with self.assertRaises(RuntimeError) as cm: - self.check_output_with_place(place) - error_msg = str(cm.exception).split("\n")[-2].strip().split(".")[0] - self.assertEqual( - error_msg, - "NotFoundError: The kernel (reduce_max) with key (GPU, Undefined(AnyLayout), float16) is not found and GPU kernel cannot fallback to CPU one", - ) - class TestMaxOp_ZeroDim(OpTest): """Remove Max with subgradient from gradient check to confirm the success of CI.""" @@ -292,7 +280,7 @@ def test_check_grad(self): ) -class TestMaxOp_FP32(OpTest): +class TestMaxFP32Op(OpTest): """Remove Max with subgradient from gradient check to confirm the success of CI.""" def setUp(self): @@ -300,13 +288,19 @@ def setUp(self): self.prim_op_type = "prim" self.python_api = paddle.max self.public_python_api = paddle.max - self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")} + self.init_dtype() + if self.dtype == np.uint16: + x = 
np.random.random((5, 6, 10)).astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.random((5, 6, 10)).astype(self.dtype) + self.inputs = {'X': x} self.attrs = {'dim': [-1], 'keep_dim': True} - self.outputs = { - 'Out': self.inputs['X'].max( - axis=tuple(self.attrs['dim']), keepdims=True - ) - } + out = x.max(axis=tuple(self.attrs['dim']), keepdims=True) + if self.dtype == np.uint16: + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -320,6 +314,37 @@ def test_check_grad(self): only_check_prim=True, ) + def init_dtype(self): + self.dtype = np.float32 + + +class TestMaxFP16Op(TestMaxFP32Op): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestMaxBF16Op(TestMaxFP32Op): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def test_check_grad(self): + # only composite op support gradient check of reduce_max + self.check_grad_with_place( + core.CUDAPlace(0), + ['X'], + 'Out', + check_prim=True, + only_check_prim=True, + ) + @skip_check_grad_ci( reason="reduce_min is discontinuous non-derivable function," diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index fe41200378793d..0e6b55142bf70d 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2348,7 +2348,10 @@ def max(x, axis=None, keepdim=False, name=None): reduce_all, axis = _get_reduce_axis_with_tensor(axis, x) helper = LayerHelper('max', **locals()) check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'max' + x, + 'x', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + 'max', ) if not isinstance(axis, Variable) and paddle.utils._contain_var(axis): axis = paddle.utils._convert_to_tensor_list(axis) From 205094f03691ffa174393cfc920b8a87e5b589d0 Mon Sep 17 00:00:00 2001 From: Difer <707065510@qq.com> Date: Thu, 13 Apr 2023 19:40:16 +0800 Subject: [PATCH 144/156] =?UTF-8?q?=E3=80=90Hackathon=20No57=E3=80=91add?= =?UTF-8?q?=5Ffp16=5Fbf16=5Ffor=5Fdot=20&=20bf16=5Ffor=5Fcross=20(#52426)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add_fp_bf_for_dot & bf_for_cross * fix error * fix some error * fix some error * change something * fix magic number --- paddle/phi/kernels/gpu/cross_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/cross_kernel.cu | 1 + paddle/phi/kernels/gpu/dot_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/dot_kernel.cu | 6 +- .../fluid/tests/unittests/test_cross_op.py | 52 ++++- .../fluid/tests/unittests/test_dot_op.py | 199 +++++++++++++++++- 6 files changed, 259 insertions(+), 6 deletions(-) diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu index b3316ea875b906..58f53fcf3f3d22 100644 --- a/paddle/phi/kernels/gpu/cross_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -191,6 +191,7 @@ PD_REGISTER_KERNEL(cross_grad, ALL_LAYOUT, phi::CrossGradKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double, int, diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu index 60623cb8e3d747..461e3a219d5d6a 100644 --- a/paddle/phi/kernels/gpu/cross_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_kernel.cu 
@@ -168,6 +168,7 @@ PD_REGISTER_KERNEL(cross, ALL_LAYOUT, phi::CrossKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double, int, diff --git a/paddle/phi/kernels/gpu/dot_grad_kernel.cu b/paddle/phi/kernels/gpu/dot_grad_kernel.cu index 874d0f03b7dce3..0bd448339b661d 100644 --- a/paddle/phi/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_grad_kernel.cu @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/phi/kernels/dot_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" @@ -28,4 +30,6 @@ PD_REGISTER_KERNEL(dot_grad, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu index 144fc66e3837b9..5005f6390d2ac0 100644 --- a/paddle/phi/kernels/gpu/dot_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_kernel.cu @@ -15,6 +15,8 @@ #include "paddle/phi/kernels/dot_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -61,4 +63,6 @@ PD_REGISTER_KERNEL(dot, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/python/paddle/fluid/tests/unittests/test_cross_op.py b/python/paddle/fluid/tests/unittests/test_cross_op.py index bbfa19aa7ff044..1114bb0b69ffbd 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_op.py +++ b/python/paddle/fluid/tests/unittests/test_cross_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid -from paddle.fluid import Program, program_guard +from paddle.fluid import Program, core, program_guard class TestCrossOp(OpTest): @@ -65,6 +65,9 @@ def init_output(self): self.outputs = {'Out': np.array(z_list).reshape(self.shape)} +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) class TestCrossFP16Op(TestCrossOp): def initTestCase(self): self.shape = (2048, 3) @@ -77,6 +80,51 @@ def init_output(self): self.outputs = {'Out': np.array(z_list).reshape(self.shape)} +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestCrossBF16Op(OpTest): + def setUp(self): + self.op_type = "cross" + self.python_api = paddle.cross + self.initTestCase() + self.x = np.random.random(self.shape).astype(np.float32) + self.y = np.random.random(self.shape).astype(np.float32) + self.inputs = { + 'X': convert_float_to_uint16(self.x), + 'Y': convert_float_to_uint16(self.y), + } + self.init_output() + + def initTestCase(self): + self.attrs = {'dim': -2} + self.dtype = np.uint16 + self.shape = (1024, 3, 1) + + def init_output(self): + x = np.squeeze(self.x, 2) + y = np.squeeze(self.y, 2) + z_list = [] + for i in range(1024): + z_list.append(np.cross(x[i], y[i])) + out = np.array(z_list).astype(np.float32).reshape(self.shape) + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + if 
core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + class TestCrossAPI(unittest.TestCase): def input_data(self): self.data_x = np.array( diff --git a/python/paddle/fluid/tests/unittests/test_dot_op.py b/python/paddle/fluid/tests/unittests/test_dot_op.py index 4acf5f4ed14ef9..5cb061c368b900 100644 --- a/python/paddle/fluid/tests/unittests/test_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_dot_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid @@ -85,7 +85,7 @@ def test_check_grad_ingore_y(self): def init_input_output(self): self.x = np.random.uniform(0.1, 1, [121]).astype(self.dtype) self.y = np.random.uniform(1, 3, [121]).astype(self.dtype) - self.out = np.dot(self.x, self.y) + self.out = np.dot(self.x, self.y).astype(self.dtype) def init_dtype(self): self.dtype = np.float64 @@ -314,6 +314,201 @@ def test_check_grad_ingore_y(self): ) +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestDotFP16Op(OpTest): + def setUp(self): + self.op_type = "dot" + self.python_api = paddle.dot + self.init_dtype() + self.init_input_output() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), + } + self.outputs = {'Out': self.out} + self.attrs = {} + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=0.125) + + def test_check_grad_normal(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['Y'], 'Out', no_grad_set=set("X") + ) + + def test_check_grad_ingore_y(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Out', no_grad_set=set("Y") + ) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [121]).astype(self.dtype) + self.y = np.random.uniform(1, 3, [121]).astype(self.dtype) + self.out = np.dot(self.x, self.y) + + +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class DotFP16OpBatch(TestDotFP16Op): + def init_input_output(self): + self.x = ( + np.random.uniform(0.1, 1, [132]) + .astype(self.dtype) + .reshape([11, 12]) + ) + self.y = ( + np.random.uniform(1, 3, [132]).astype(self.dtype).reshape([11, 12]) + ) + self.out = np.sum(self.x * self.y, axis=1).reshape([11, 1]) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestDotBF16Op(OpTest): + def setUp(self): + self.op_type = "dot" + self.python_api = paddle.dot + self.init_dtype() + self.init_input_output() + + self.inputs = { + 'X': 
convert_float_to_uint16(self.x), + 'Y': convert_float_to_uint16(self.y), + } + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.attrs = {} + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place, atol=0.5) + + def test_check_grad_normal(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['X', 'Y'], + 'Out', + user_defined_grads=[self.inputs['Y'], self.inputs['X']], + ) + + def test_check_grad_ingore_x(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[self.inputs['X']], + ) + + def test_check_grad_ingore_y(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[self.inputs['Y']], + ) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [121]).astype(np.float32) + self.y = np.random.uniform(1, 3, [121]).astype(np.float32) + self.out = np.dot(self.x, self.y) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class DotBF16OpBatch(TestDotBF16Op): + def init_input_output(self): + self.x = ( + np.random.uniform(0.1, 1, [132]) + .astype(np.float32) + .reshape([11, 12]) + ) + self.y = ( + np.random.uniform(1, 3, [132]).astype(np.float32).reshape([11, 12]) + ) + self.out = np.sum(self.x * self.y, axis=1).reshape([11, 1]) + + def test_check_grad_normal(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['X', 'Y'], + 'Out', + user_defined_grads=[ + self.y / self.y.shape[0], + self.x / self.x.shape[0], + ], + ) + + def test_check_grad_ingore_x(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['Y'], + 'Out', + no_grad_set=set("X"), + user_defined_grads=[self.x / self.x.shape[0]], + ) + + def test_check_grad_ingore_y(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, + ['X'], + 'Out', + no_grad_set=set("Y"), + user_defined_grads=[self.y / self.y.shape[0]], + ) + + if __name__ == '__main__': paddle.enable_static() unittest.main() From 48ccb785c66951cee9fae2876f36a6a3bbc5c039 Mon Sep 17 00:00:00 2001 From: superwinner1 <82640284+superwinner1@users.noreply.github.com> Date: Thu, 13 Apr 2023 20:12:07 +0800 Subject: [PATCH 145/156] =?UTF-8?q?=E3=80=90Hackathon=20No.55=E3=80=91=20a?= =?UTF-8?q?dd=20channel=5Fshuffle=20FP16/BF16=20support=20and=20tests=20(#?= =?UTF-8?q?51884)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * No55 add channel_shuffle FP16/BF16 support and tests --- .../phi/kernels/channel_shuffle_grad_kernel.h | 1 - .../gpu/channel_shuffle_grad_kernel.cu | 4 +- .../phi/kernels/gpu/channel_shuffle_kernel.cu | 4 +- .../tests/unittests/test_channel_shuffle.py | 56 ++++++++++++++++++- 4 files changed, 60 insertions(+), 5 
deletions(-) diff --git a/paddle/phi/kernels/channel_shuffle_grad_kernel.h b/paddle/phi/kernels/channel_shuffle_grad_kernel.h index d75d887d0fcd81..7dfefdfd4fe67c 100644 --- a/paddle/phi/kernels/channel_shuffle_grad_kernel.h +++ b/paddle/phi/kernels/channel_shuffle_grad_kernel.h @@ -17,7 +17,6 @@ #include <string> #include "paddle/phi/core/dense_tensor.h" - namespace phi { template <typename T, typename Context> diff --git a/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu index a9f751bfbc0a97..10842d6d5c7bcb 100644 --- a/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/channel_shuffle_grad_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(channel_shuffle_grad, ALL_LAYOUT, phi::ChannelShuffleGradKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu index 6be09721952473..63ed127642c042 100644 --- a/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu +++ b/paddle/phi/kernels/gpu/channel_shuffle_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(channel_shuffle, ALL_LAYOUT, phi::ChannelShuffleKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/python/paddle/fluid/tests/unittests/test_channel_shuffle.py b/python/paddle/fluid/tests/unittests/test_channel_shuffle.py index f4a772105a55f1..9ae01074bf55e9 100644 --- a/python/paddle/fluid/tests/unittests/test_channel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_channel_shuffle.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn.functional as F @@ -45,6 +45,7 @@ def channel_shuffle_np(x, groups, data_format="NCHW"): class TestChannelShuffleOp(OpTest): def setUp(self): self.op_type = "channel_shuffle" + self.init_dtype() self.init_data_format() n, c, h, w = 2, 9, 4, 4 self.python_api = paddle.nn.functional.channel_shuffle @@ -56,13 +57,16 @@ def setUp(self): groups = 3 - x = np.random.random(shape).astype("float64") + x = np.random.random(shape).astype(self.dtype) npresult = channel_shuffle_np(x, groups, self.format) self.inputs = {'X': x} self.outputs = {'Out': npresult} self.attrs = {'groups': groups, "data_format": self.format} + def init_dtype(self): + self.dtype = 'float64' + def init_data_format(self): self.format = "NCHW" @@ -268,5 +272,53 @@ def error_data_format_layer(): self.assertRaises(ValueError, error_data_format_layer) +class TestChannelShuffleFP16OP(TestChannelShuffleOp): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestChannelShuffleBF16OP(OpTest): + def setUp(self): + self.op_type = "channel_shuffle" + self.init_data_format() + n, c, h, w = 2, 9, 4, 4 + self.python_api = paddle.nn.functional.channel_shuffle + self.dtype = np.uint16 + self.use_mkldnn = False + + if self.format == "NCHW": + shape = [n, c, h, w] + if self.format == "NHWC": + shape = [n, h, w, c] + + groups = 3 + + x = np.random.random(shape).astype('float32') + out = channel_shuffle_np(x, groups, self.format) + self.inputs = {'X': convert_float_to_uint16(x)} + self.attrs = {'groups': groups, "data_format": self.format} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_data_format(self):
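+ # only the NCHW layout is exercised in this BF16 test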
+ self.format = "NCHW" + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + ) + + if __name__ == '__main__': unittest.main() From 4a374c60d7cac708907005915cd6719e050e340d Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Thu, 13 Apr 2023 21:12:59 +0800 Subject: [PATCH 146/156] delete WITH_ASCEND_CL (#52825) * delete WITH_ASCEND_CL * delete NPU/ and WITH_MLU --- paddle/fluid/operators/controlflow/while_op_helper.cc | 6 +++--- paddle/phi/backends/context_pool.cc | 2 +- tools/coverage/paddle_coverage_new.sh | 2 -- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index bab4395803fa3c..38865a1c53e0b0 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -233,9 +233,9 @@ bool GetCondData(const phi::DenseTensor &cond) { framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "This version of PaddlePaddle does NOT support GPU/NPU/XPU but got " - "GPU/NPU/XPU tensor Cond in WhileOp. Please compile WITH_GPU or " - "WITH_ASCEND_CL or WITH_XPU option.")); + "This version of PaddlePaddle does NOT support GPU/XPU but got " + "GPU/XPU tensor Cond in WhileOp. Please compile WITH_GPU or " + "WITH_XPU option.")); #endif return cpu_cond->data()[0]; } diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index b05aa51205ac00..e295ac388d8928 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -72,7 +72,7 @@ phi::DeviceContext* DeviceContextPool::Get(const phi::Place& place) { if (it == ptr->end()) { PADDLE_THROW(phi::errors::Unimplemented( "Place %s is not supported. 
Please check that your paddle compiles " - "with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option " + "with WITH_GPU, WITH_XPU or WITH_IPU option " "or check " "that your train process set the correct device id if you use " "Executor.", diff --git a/tools/coverage/paddle_coverage_new.sh b/tools/coverage/paddle_coverage_new.sh index 8cf43664a4566e..656b3588ac670b 100644 --- a/tools/coverage/paddle_coverage_new.sh +++ b/tools/coverage/paddle_coverage_new.sh @@ -116,8 +116,6 @@ function gen_full_html_report_npu() { # if [ ${WITH_XPU:-OFF} == "ON" ]; then # gen_full_html_report_xpu || true -# elif [ ${WITH_ASCEND_CL:-OFF} == "ON" ]; then -# gen_full_html_report_npu || true # else # gen_full_html_report || true # fi From acf5501650ad750dd25925961da18e27cd19a91f Mon Sep 17 00:00:00 2001 From: jjyaoao <88936287+jjyaoao@users.noreply.github.com> Date: Thu, 13 Apr 2023 21:14:38 +0800 Subject: [PATCH 147/156] remove code with PADDLE_WITH_ASCEND (#52830) * remove code with PADDLE_WITH_ASCEND * try pass codestyle --- cmake/configure.cmake | 4 ---- cmake/external/ascend.cmake | 27 --------------------------- paddle/CMakeLists.txt | 1 - 3 files changed, 32 deletions(-) delete mode 100644 cmake/external/ascend.cmake diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 973e21fd55056d..ad789a53e830d6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -93,10 +93,6 @@ if(WITH_BOX_PS) add_definitions(-DPADDLE_WITH_BOX_PS) endif() -if(WITH_ASCEND) - add_definitions(-DPADDLE_WITH_ASCEND) -endif() - if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake deleted file mode 100644 index cbddf9496c24f4..00000000000000 --- a/cmake/external/ascend.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#NOTE: Logic is from -# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt -if(DEFINED ENV{ASCEND_CUSTOM_PATH}) - set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) -else() - set(ASCEND_DIR /usr/local/Ascend) -endif() - -if(EXISTS - ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) - # It means CANN 20.2 + - add_definitions(-DPADDLE_WITH_ASCEND_STRING) -endif() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index c4a4d2564580ba..35556347cb3392 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -5,7 +5,6 @@ set(PYTHON_TESTS_DIR add_subdirectory(utils) add_subdirectory(scripts) add_subdirectory(testing) - add_subdirectory(phi) add_subdirectory(fluid) From ef734e84500a7cf53bbd7650c280903841935ac0 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 13 Apr 2023 22:15:07 +0800 Subject: [PATCH 148/156] [Paddle-Trt] Replace fc mul matmul matmul_v2 with matrix_multiply (#52222) * Paddle-Trt: Replace fc mul matmul matmul_v2 with matrix_multiply --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../framework/ir/constant_folding_pass.cc | 2 +- .../delete_weight_dequant_linear_op_pass.cc | 4 +- .../framework/ir/graph_pattern_detector.cc | 28 +- .../framework/ir/graph_pattern_detector.h | 11 + .../ir/multihead_matmul_roformer_fuse_pass.cc | 58 +- .../ir/quant_conv2d_dequant_fuse_pass.cc | 133 +-- .../ir/quant_conv2d_dequant_fuse_pass.h | 2 +- .../ir/remove_padding_recover_padding_pass.cc | 122 ++- .../ir/remove_padding_recover_padding_pass.h | 21 +- .../trt_cross_multihead_matmul_fuse_pass.cc | 61 +- ...rt_delete_weight_dequant_linear_op_pass.cc | 71 -- .../trt_flash_multihead_matmul_fuse_pass.cc | 61 +- .../ir/trt_map_matmul_to_mul_pass.cc | 918 ------------------ .../framework/ir/trt_map_matmul_to_mul_pass.h | 130 --- .../ir/trt_map_ops_to_matrix_multiply_pass.cc | 125 +++ .../ir/trt_map_ops_to_matrix_multiply_pass.h | 39 + .../ir/trt_multihead_matmul_fuse_pass.cc | 210 +--- .../ir/trt_skip_layernorm_fuse_pass.cc | 9 +- .../framework/ir/vit_attention_fuse_pass.cc | 5 +- .../fluid/inference/api/analysis_predictor.cc | 4 +- .../inference/api/paddle_pass_builder.cc | 44 +- .../inference/tensorrt/convert/CMakeLists.txt | 4 +- .../fluid/inference/tensorrt/convert/fc_op.cc | 415 -------- .../inference/tensorrt/convert/matmul_op.cc | 190 ---- .../tensorrt/convert/matmul_v2_op.cc | 126 --- .../tensorrt/convert/matrix_multiply_op.cc | 273 ++++++ .../tensorrt/convert/multihead_matmul_op.cc | 147 ++- .../inference/tensorrt/convert/one_hot_op.cc | 2 + .../inference/tensorrt/convert/op_converter.h | 13 - .../tensorrt/convert/skip_layernorm.cc | 69 +- paddle/fluid/inference/tensorrt/engine.cc | 7 + paddle/fluid/inference/tensorrt/op_teller.cc | 123 +-- .../tensorrt/tensorrt_engine_op_test.cc | 92 +- .../unittests/ir/inference/CMakeLists.txt | 2 - .../ir/inference/test_fc_fuse_pass.py | 10 - ...est_multihead_matmul_roformer_fuse_pass.py | 5 +- .../inference/test_trt_convert_matmul_v2.py | 32 +- .../test_trt_convert_multihead_matmul.py | 171 +--- .../ir/inference/test_trt_fc_fuse_pass.py | 18 +- .../test_trt_flatten2_matmul_fuse_pass.py | 148 --- .../test_trt_matmul_quant_dequant.py | 18 +- 42 files changed, 1004 insertions(+), 2921 deletions(-) delete mode 100644 paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc delete mode 100644 paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.h create mode 100644 paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc create mode 100644 
paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h delete mode 100644 paddle/fluid/inference/tensorrt/convert/fc_op.cc delete mode 100644 paddle/fluid/inference/tensorrt/convert/matmul_op.cc delete mode 100644 paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc delete mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index df7454bf2cf569..12457c2e3c1d1e 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -132,7 +132,7 @@ pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) if(WITH_TENSORRT) - pass_library(trt_map_matmul_to_mul_pass inference) + pass_library(trt_map_ops_to_matrix_multiply_pass inference) pass_library(trt_multihead_matmul_fuse_pass inference) pass_library(trt_flash_multihead_matmul_fuse_pass inference) pass_library(trt_cross_multihead_matmul_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index 0bcd7a733dde73..74d8e4a29a73cb 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -64,7 +64,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { platform::errors::Fatal( "scope must not be null when applying constant floding.")); - std::vector blacklist{"feed"}; + std::vector blacklist{"feed", "matrix_multiply"}; auto op_node_sorted = framework::ir::TopologyVarientSort( *graph, static_cast(0)); diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 03b6c988950de3..0b09d1b30f40af 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -24,10 +24,10 @@ namespace ir { class Graph; void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { - std::unordered_set op_list = {"matmul_v2", + std::unordered_set op_list = {"matrix_multiply", + "matmul_v2", "matmul", "mul", - "fc", "depthwise_conv2d", "conv2d", "conv2d_transpose"}; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ffcf7f78c27e60..af23d69bd4be31 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2465,7 +2465,7 @@ PDNode *patterns::ConvElementwiseaddAct::operator()( PDNode *patterns::VitAttention::operator()(PDNode *in) { in->AsInput(); - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + std::unordered_set matmul_ops{"matrix_multiply"}; auto matmul0_op = pattern->NewNode(matmul0_op_repr())->assert_is_ops(matmul_ops); @@ -2504,13 +2504,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) { auto slice1_op = pattern->NewNode(slice1_op_repr())->assert_is_op("slice"); auto slice1_out = pattern->NewNode(slice1_out_repr()) ->assert_is_op_output("slice", "Out") - ->assert_is_op_input("matmul_v2", "Y") + ->assert_is_op_input("matrix_multiply", "Y") ->AsIntermediate(); auto slice2_op = pattern->NewNode(slice2_op_repr())->assert_is_op("slice"); auto slice2_out = pattern->NewNode(slice2_out_repr()) ->assert_is_op_output("slice", "Out") - 
->assert_is_op_input("matmul_v2", "X") + ->assert_is_op_input("matrix_multiply", "X") ->AsIntermediate(); auto slice3_op = pattern->NewNode(slice3_op_repr())->assert_is_op("slice"); @@ -2523,13 +2523,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) { pattern->NewNode(transpose2_op_repr())->assert_is_op("transpose2"); auto transpose2_out = pattern->NewNode(transpose2_out_repr()) ->assert_is_op_output("transpose2", "Out") - ->assert_is_op_input("matmul_v2", "Y") + ->assert_is_op_input("matrix_multiply", "Y") ->AsIntermediate(); auto matmul1_op = - pattern->NewNode(matmul1_op_repr())->assert_is_op("matmul_v2"); + pattern->NewNode(matmul1_op_repr())->assert_is_op("matrix_multiply"); auto matmul1_out = pattern->NewNode(matmul1_out_repr()) - ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_output("matrix_multiply", "Out") ->assert_is_op_input("scale", "X") ->AsIntermediate(); @@ -2543,13 +2543,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) { pattern->NewNode(softmax1_op_repr())->assert_is_op("softmax"); auto softmax1_out = pattern->NewNode(softmax1_out_repr()) ->assert_is_op_output("softmax", "Out") - ->assert_is_op_input("matmul_v2", "X") + ->assert_is_op_input("matrix_multiply", "X") ->AsIntermediate(); auto matmul2_op = - pattern->NewNode(matmul2_op_repr())->assert_is_op("matmul_v2"); + pattern->NewNode(matmul2_op_repr())->assert_is_op("matrix_multiply"); auto matmul2_out = pattern->NewNode(matmul2_out_repr()) - ->assert_is_op_output("matmul_v2", "Out") + ->assert_is_op_output("matrix_multiply", "Out") ->assert_is_op_input("transpose2", "X") ->AsIntermediate(); @@ -4452,6 +4452,16 @@ PDNode *patterns::FusedFeedForwardBwd::operator()( return out_grad; } +void patterns::MulMatmulMatmulV2::operator()( + const std::unordered_set &ops_type) { + auto ops = pattern->NewNode(ops_repr())->assert_is_ops(ops_type); + auto ops_out = pattern->NewNode(ops_out_repr()) + ->AsOutput() + ->assert_is_ops_output(ops_type, "Out"); + + ops->LinksTo({ops_out}); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 14d0d4e7b8e3a7..40a7439b06f5ad 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -2146,6 +2146,17 @@ struct MergeLayernormPattern : public PatternBase { PATTERN_DECL_NODE(layernorm_40_out); }; +// MulMatmulMatmulV2: ops(mul, matmul, matmul_v2) +// Forward pass for ops(mul, matmul, matmul_v2) convert to matrix_multiply. 
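+// The pattern records two nodes: the matrix op itself ("ops") and its output variable ("ops_out").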
+struct MulMatmulMatmulV2 : public PatternBase { + MulMatmulMatmulV2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "mul_matmul_matmul_v2") {} + + void operator()(const std::unordered_set& ops_type); + PATTERN_DECL_NODE(ops); + PATTERN_DECL_NODE(ops_out); +}; + // Add support int8 flag struct AddSupportInt8 : public PatternBase { AddSupportInt8(PDPattern* pattern, const std::string& name_scope) diff --git a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc index dcb7d1efa92707..7a28236a8a2037 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_roformer_fuse_pass.cc @@ -37,7 +37,7 @@ static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) { } PDNode* MultiHeadMatmulRoformerPattern::operator()() { - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + std::unordered_set matmul_ops{"matrix_multiply"}; auto* input0 = pattern->NewNode(input0_repr()); input0->assert_is_ops_input(matmul_ops); @@ -313,23 +313,6 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() { } // namespace patterns MultiHeadMatmulRoformerFusePass::MultiHeadMatmulRoformerFusePass() { - AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) - .IsTensor() - .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(2) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("elementwise_add")) .AddInput("X") // in bias, shape is (B, S, N*H), @@ -394,43 +377,6 @@ MultiHeadMatmulRoformerFusePass::MultiHeadMatmulRoformerFusePass() { // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") // QK(true) QKV(false) - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") // QK(true) QKV(false) - .IsType() - .End(); - AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -825,6 +771,4 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_roformer_fuse_pass) .EQ("reshape2", 0) .EQ("transpose2", 0) .EQ("scale", 0) - .LE("matmul", 1) - .EQ("matmul_v2", 0) .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index a4cb15dcf3f344..0dd8a79ac6a5bc 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -202,77 +202,6 @@ QuantDequantFusePass::QuantDequantFusePass() { .AddAttr("data_format") .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - 
AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") - .IsBoolEQ(false) - .End(); - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGE(0.99f) - .IsNumLE(1.01f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - .IsBoolEQ(false) - .End(); - AddOpCompat(OpCompat("fc")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("W") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("in_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("activation_type") - .IsStringIn({"relu", ""}) - .End(); AddOpCompat(OpCompat("conv2d_transpose")) .AddInput("Input") .IsTensor() @@ -379,10 +308,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || quantized_op_type == "depthwise_conv2d" || - quantized_op_type == "fc" || quantized_op_type == "conv2d_transpose" || - quantized_op_type == "mul" || quantized_op_type == "matmul" || - quantized_op_type == "matmul_v2") { + quantized_op_type == "matrix_multiply") { op_desc->SetAttr("Input_scale", scale_value); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -416,17 +343,14 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, quantized_op_type == "conv2d_transpose") { weight_name = "Filter"; input_name = "Input"; - } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" || - quantized_op_type == "matmul_v2") { + } else if (quantized_op_type == "matrix_multiply") { weight_name = "Y"; input_name = "X"; - } else if (quantized_op_type == "fc") { - weight_name = "W"; - input_name = "Input"; } else { PADDLE_THROW(platform::errors::Unimplemented( "QuantDequantFuse: We only support conv2d, conv2d_fusion, fused_conv2d," - "conv2d_transpose, fc, mul, matmul, matmul_v2 for now, but received: " + "conv2d_transpose, matrix_multiply(mul/matmul/matmul_v2) for now, but " + "received: " "%s.", quantized_op_type)); } @@ -514,16 +438,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, // re-write it again when this weight tensor is shared among many ops. 
if (!quantized_op_weight_node_set.count(quantized_op_weight_node)) { quantized_op_weight_node_set.insert(quantized_op_weight_node); - // If quantized op is fc, weight scale size = 1; + // If quantized op is matrix_multiply, weight scale size = 1; // If quantized op is conv2d, weight scale size = weight dims[0] // If quantized op is conv2d_transpose, weight scale size = weight dims[1] - if (quantized_op_type == "mul" || quantized_op_type == "matmul" || - quantized_op_type == "matmul_v2" || quantized_op_type == "fc") { + if (quantized_op_type == "matrix_multiply") { if (dequant_type == "fake_dequantize_max_abs") { PADDLE_ENFORCE_EQ(weight_scale.size(), 1, platform::errors::InvalidArgument( - "mul/matmul/matmul_v2 op weight dequantized by " + "matrix_multiply(mul/matmul/matmul_v2) op " + "weight dequantized by " "[fake_dequantize_max_abs] " "requires weight scale size = 1, but got %d.", weight_scale.size())); @@ -538,24 +462,27 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, quant_axis == 1, true, platform::errors::InvalidArgument( - "'quant_axis' of mul/matmul/fc/matmul_v2 op weight " + "'quant_axis' of matrix_multiply(mul/matmul/matmul_v2) op " + "weight " "dequantized by " "[fake_channel_wise_dequantize_max_abs]should be 1, but " "the received is %d", quant_axis)); } - PADDLE_ENFORCE_EQ(weight_scale.size(), - static_cast(w_dims[1]), - platform::errors::InvalidArgument( - "mul/matmul/matmul_v2 op weight dequantized by " - "[fake_channel_wise_dequantize_max_abs] " - "requires weight scale " - "size = 2nd dim of mul/matmul/matmul_v2's " - "weight, which is %d, " - "but got " - "%d.", - static_cast(w_dims[1]), - weight_scale.size())); + PADDLE_ENFORCE_EQ( + weight_scale.size(), + static_cast(w_dims[1]), + platform::errors::InvalidArgument( + "matrix_multiply(mul/matmul/matmul_v2) op weight dequantized " + "by " + "[fake_channel_wise_dequantize_max_abs] " + "requires weight scale " + "size = 2nd dim of matrix_multiply(mul/matmul/matmul_v2)'s " + "weight, which is %d, " + "but got " + "%d.", + static_cast(w_dims[1]), + weight_scale.size())); for (int j = 0; j < weight_tensor->numel(); j++) { quantized_weight_data[j] *= weight_scale[j % w_dims[1]]; } @@ -650,11 +577,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, quantized_op_type == "conv2d_transpose") { new_op_desc.SetInput("Input", {new_input}); new_op_desc.SetOutput("Output", {new_output}); - } else if (quantized_op_type == "fc") { - new_op_desc.SetInput("Input", {new_input}); - new_op_desc.SetOutput("Out", {new_output}); - } else if (quantized_op_type == "mul" || quantized_op_type == "matmul" || - quantized_op_type == "matmul_v2") { + } else if (quantized_op_type == "matrix_multiply") { new_op_desc.SetInput("X", {new_input}); new_op_desc.SetOutput("Out", {new_output}); } @@ -682,12 +605,9 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { std::unordered_set quantized_op_types = { "conv2d", "fused_conv2d", - "mul", - "matmul", + "matrix_multiply", "depthwise_conv2d", "conv2d_transpose", - "fc", - "matmul_v2", }; auto* scope = param_scope(); @@ -712,7 +632,6 @@ REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("conv2d", 1) - .EQ("fc", 0) .LE("conv2d_transpose", 2) .EQ("fake_quantize_abs_max", 0) .EQ("fake_quantize_range_abs_max", 0) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h index 1ddce86193632e..32ea38178ee757 100644 --- 
a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h @@ -22,7 +22,7 @@ namespace framework { namespace ir { /// -/// Fuse quant + conv2d/depthwise_conv2d/mul/fc + dequant +/// Fuse quant + conv2d/depthwise_conv2d/matrix_multiply + dequant /// class QuantDequantFusePass : public FusePassBase { public: diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc index 19c2e0541b1bce..429fc12d1df4fa 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -110,12 +110,14 @@ void MultiheadMatmul::operator()() { .LinksTo({multihead_matmul_out}); } -void Fc::operator()() { - // Create nodes for fc. - auto* fc_input = - pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input"); - auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc"); - fc_op->LinksFrom({fc_input}); +void MatrixMultiply::operator()() { + // Create nodes for matrix_multiply. + auto* matrix_multiply_input = + pattern->NewNode(matrix_multiply_input_repr()) + ->assert_is_op_input("matrix_multiply", "X"); + auto* matrix_multiply_op = pattern->NewNode(matrix_multiply_op_repr()) + ->assert_is_op("matrix_multiply"); + matrix_multiply_op->LinksFrom({matrix_multiply_input}); } void Activation::operator()() { @@ -146,6 +148,19 @@ void FusedTokenPrune::operator()() { fused_token_prune_op->LinksFrom({fused_token_prune_input}) .LinksTo({fused_token_prune_output}); } + +void ElementWise::operator()() { + // Create nodes for elementwise. + auto* elementwise_input = pattern->NewNode(elementwise_input_repr()) + ->assert_is_op_input("elementwise_add", "X"); + auto* elementwise_op = + pattern->NewNode(elementwise_op_repr())->assert_is_op("elementwise_add"); + auto* elementwise_out = pattern->NewNode(elementwise_out_repr()) + ->assert_is_op_output("elementwise_add"); + + // Add links for elementwise op. 
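+  //   elementwise_input (X) -> elementwise_op -> elementwise_out
+  // The handler registered later in this pass checks that X matches the
+  // [batch, seq_len, hidden] shape of the multihead_matmul input before
+  // wrapping the op with remove_padding / recover_padding.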
+ elementwise_op->LinksFrom({elementwise_input}).LinksTo({elementwise_out}); +} } // namespace patterns void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { @@ -400,38 +415,45 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { gpd2(graph, handler2); GraphPatternDetector gpd3; - patterns::Fc fc(gpd3.mutable_pattern(), - "remove_padding_recover_padding_pass"); - fc(); + patterns::MatrixMultiply matrix_multiply( + gpd3.mutable_pattern(), "remove_padding_recover_padding_pass"); + matrix_multiply(); auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - VLOG(3) << "remove_padding_recover_padding_pass for transformer: fc"; + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "matrix_multiply"; - GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc); - GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc); + GET_IR_NODE_FROM_SUBGRAPH( + matrix_multiply_input, matrix_multiply_input, matrix_multiply); + GET_IR_NODE_FROM_SUBGRAPH( + matrix_multiply_op, matrix_multiply_op, matrix_multiply); - std::vector fc_input_shape = fc_input->Var()->GetShape(); + std::vector matrix_multiply_input_shape = + matrix_multiply_input->Var()->GetShape(); check_flag = true; - if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) || - (fc_input_shape.size() != 3)) { + if ((matrix_multiply_input_shape.size() != + multihead_matmul_input_shape.size()) || + (matrix_multiply_input_shape.size() != 3)) { check_flag = false; VLOG(3) << "Transformer model remove_padding shape check failed, return " "remove_padding pass."; return; } - if (fc_input_shape[0] != multihead_matmul_input_shape[0]) { + if (matrix_multiply_input_shape[0] != multihead_matmul_input_shape[0]) { check_flag = false; } - if (fc_input_shape[1] != multihead_matmul_input_shape[1]) { + if (matrix_multiply_input_shape[1] != multihead_matmul_input_shape[1]) { check_flag = false; } - if ((fc_input_shape[2] != multihead_matmul_input_shape[2]) && - (fc_input_shape[2] != 4 * multihead_matmul_input_shape[2])) { + if ((matrix_multiply_input_shape[2] != multihead_matmul_input_shape[2]) && + (matrix_multiply_input_shape[2] != + 4 * multihead_matmul_input_shape[2])) { check_flag = false; } - if (PADDLE_GET_CONST(int, fc_op->Op()->GetAttr("in_num_col_dims")) != 2) { + if (PADDLE_GET_CONST( + int, matrix_multiply_op->Op()->GetAttr("x_num_col_dims")) != 2) { check_flag = false; } if (!check_flag) { @@ -439,8 +461,13 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { "remove_padding pass."; return; } - insert_remove_padding_op(fc_input, fc_op); - insert_recover_padding_op(fc_op, fc_op->outputs[0]); + + matrix_multiply_op->Op()->RemoveAttr("x_num_col_dims"); + matrix_multiply_op->Op()->SetAttr("x_num_col_dims", 1); + + insert_remove_padding_op(matrix_multiply_input, matrix_multiply_op); + insert_recover_padding_op(matrix_multiply_op, + matrix_multiply_op->outputs[0]); found_subgraph_count++; }; gpd3(graph, handler3); @@ -617,6 +644,57 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { }; gpd7(graph, handler7); + // Removed fc_add fuse, elementwise can be used by the optimized model + GraphPatternDetector gpd8; + patterns::ElementWise elementwise(gpd8.mutable_pattern(), + "remove_padding_recover_padding_pass"); + elementwise(); + + auto handler8 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: " + "elementwise"; + + GET_IR_NODE_FROM_SUBGRAPH( + 
elementwise_input, elementwise_input, elementwise); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, elementwise); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, elementwise); + + std::vector elementwise_input_shape = + elementwise_input->Var()->GetShape(); + check_flag = true; + if (elementwise_input_shape.size() != multihead_matmul_input_shape.size()) { + check_flag = false; + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + + if (elementwise_input_shape[0] != multihead_matmul_input_shape[0]) { + check_flag = false; + } + if (elementwise_input_shape[1] != multihead_matmul_input_shape[1]) { + check_flag = false; + } + if ((elementwise_input_shape[2] != multihead_matmul_input_shape[2]) && + (elementwise_input_shape[2] != 4 * multihead_matmul_input_shape[2])) { + check_flag = false; + } + if (!check_flag) { + VLOG(3) << "Transformer model remove_padding shape check failed, return " + "remove_padding pass."; + return; + } + + elementwise_op->Op()->RemoveAttr("axis"); + elementwise_op->Op()->SetAttr("axis", 1); + + insert_remove_padding_op(elementwise_input, elementwise_op); + insert_recover_padding_op(elementwise_op, elementwise_out); + found_subgraph_count++; + }; + gpd8(graph, handler8); + AddStatis(found_subgraph_count); } diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h index ff04dc55323ab4..6df73301b1c329 100644 --- a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -87,14 +87,14 @@ struct MultiheadMatmul : public PatternBase { PATTERN_DECL_NODE(multihead_matmul_out); }; -struct Fc : public PatternBase { - Fc(PDPattern *pattern, const std::string &name_scope) - : PatternBase(pattern, name_scope, "fc") {} +struct MatrixMultiply : public PatternBase { + MatrixMultiply(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "matrix_multiply") {} void operator()(); - PATTERN_DECL_NODE(fc_input); - PATTERN_DECL_NODE(fc_op); + PATTERN_DECL_NODE(matrix_multiply_input); + PATTERN_DECL_NODE(matrix_multiply_op); }; struct Activation : public PatternBase { @@ -118,6 +118,17 @@ struct FusedTokenPrune : public PatternBase { PATTERN_DECL_NODE(fused_token_prune_op); PATTERN_DECL_NODE(fused_token_prune_output); }; + +struct ElementWise : public PatternBase { + ElementWise(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "elementwise") {} + + void operator()(); + + PATTERN_DECL_NODE(elementwise_input); + PATTERN_DECL_NODE(elementwise_op); + PATTERN_DECL_NODE(elementwise_out); +}; } // namespace patterns class RemovePaddingRecoverPaddingPass : public FusePassBase { diff --git a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc index ae50563dfa4cad..3f94c97baa6d8e 100644 --- a/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_cross_multihead_matmul_fuse_pass.cc @@ -64,8 +64,8 @@ namespace patterns { // output PDNode* TrtCrossMultiHeadMatmulPattern::operator()() { - std::unordered_set mul_ops{"mul", "matmul_v2"}; - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + std::unordered_set mul_ops{"matrix_multiply"}; + std::unordered_set matmul_ops{"matrix_multiply"}; auto* input0 = pattern->NewNode(input0_repr()); auto* input1 = 
pattern->NewNode(input1_repr()); @@ -210,23 +210,6 @@ PDNode* TrtCrossMultiHeadMatmulPattern::operator()() { } // namespace patterns TrtCrossMultiHeadMatmulFusePass::TrtCrossMultiHeadMatmulFusePass() { - AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) - .IsTensor() - .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(2) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("reshape2")) .AddInput("X") .IsTensor() @@ -269,43 +252,6 @@ TrtCrossMultiHeadMatmulFusePass::TrtCrossMultiHeadMatmulFusePass() { // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") // QK(true) QKV(false) - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") // QK(true) QKV(false) - .IsType() - .End(); - AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -584,11 +530,8 @@ REGISTER_PASS(trt_cross_multihead_matmul_fuse_pass, REGISTER_PASS_CAPABILITY(trt_cross_multihead_matmul_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("mul", 0) .LE("elementwise_add", 1) .EQ("reshape2", 0) .EQ("transpose2", 0) .EQ("scale", 0) - .LE("matmul", 1) - .EQ("matmul_v2", 0) .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc index 7614a9eda8e53c..9c51254029c586 100644 --- a/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/trt_delete_weight_dequant_linear_op_pass.cc @@ -156,77 +156,6 @@ TrtDeleteWeightQuantDequantLinearOpPass:: .AddAttr("data_format") .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") - .IsBoolEQ(false) - .End(); - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGE(0.99f) - .IsNumLE(1.01f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - .IsBoolEQ(false) - .End(); - AddOpCompat(OpCompat("fc")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("W") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("in_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("activation_type") - .IsStringIn({"relu", ""}) - 
.End(); AddOpCompat(OpCompat("conv2d_transpose")) .AddInput("Input") .IsTensor() diff --git a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc index 2cafb36e93a266..4fba50fc559c96 100644 --- a/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_flash_multihead_matmul_fuse_pass.cc @@ -65,8 +65,8 @@ namespace patterns { // output PDNode* TrtFlashMultiHeadMatmulPattern::operator()() { - std::unordered_set mul_ops{"mul", "matmul_v2"}; - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + std::unordered_set mul_ops{"matrix_multiply"}; + std::unordered_set matmul_ops{"matrix_multiply"}; auto* input0 = pattern->NewNode(input0_repr()); input0->assert_is_ops_input(mul_ops); VLOG(5) << "Start match TrtFlashMultiHeadMatmulPattern"; @@ -209,23 +209,6 @@ PDNode* TrtFlashMultiHeadMatmulPattern::operator()() { } // namespace patterns TrtFlashMultiHeadMatmulFusePass::TrtFlashMultiHeadMatmulFusePass() { - AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) - .IsTensor() - .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(2) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("reshape2")) .AddInput("X") .IsTensor() @@ -268,43 +251,6 @@ TrtFlashMultiHeadMatmulFusePass::TrtFlashMultiHeadMatmulFusePass() { // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") // QK(true) QKV(false) - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") // QK(true) QKV(false) - .IsType() - .End(); - AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -578,11 +524,8 @@ REGISTER_PASS(trt_flash_multihead_matmul_fuse_pass, REGISTER_PASS_CAPABILITY(trt_flash_multihead_matmul_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("mul", 0) .LE("elementwise_add", 1) .EQ("reshape2", 0) .EQ("transpose2", 0) .EQ("scale", 0) - .LE("matmul", 1) - .EQ("matmul_v2", 0) .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc deleted file mode 100644 index dceacef0010ef5..00000000000000 --- a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.cc +++ /dev/null @@ -1,918 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.h" - -#include -#include - -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -TrtMapMatmul2MulPass::TrtMapMatmul2MulPass() { - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGE(0.99f) - .IsNumLE(1.01f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - .IsType() - .End(); - - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); -} - -TrtMapMatmulV2ToMulPass::TrtMapMatmulV2ToMulPass() { - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") - .IsType() - .End(); - - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); -} - -TrtMapMatmulV2ToMatmulPass::TrtMapMatmulV2ToMatmulPass() { - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsType() - .End() - .AddAttr("trans_y") - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumEQ(1.0f) - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("transpose_X") - .IsType() - .End() - .AddAttr("transpose_Y") - .IsType() - .End(); -} - -TrtFlatten2MatmulFusePass::TrtFlatten2MatmulFusePass() { - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGE(0.99f) - .IsNumLE(1.01f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - .IsBoolEQ(false) - .End(); - - AddOpCompat(OpCompat("flatten2")) - .AddInput("X") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); - - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumGE(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); -} - -TrtSqueeze2MatmulFusePass::TrtSqueeze2MatmulFusePass() { - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGE(0.99f) - .IsNumLE(1.01f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - 
.IsBoolEQ(false) - .End(); - - AddOpCompat(OpCompat("squeeze2")) - .AddInput("X") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsTensor() - .End() - .AddAttr("axes") - .IsType>() - .End(); - - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); -} - -void TrtMapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string name_scope = "trt_map_matmul_to_mul_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - patterns::Matmul matmul_pattern(gpd.mutable_pattern(), name_scope); - matmul_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "trt map matmul to mul"; - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, matmul_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, matmul_pattern); - bool flag = true; - - bool transpose_X = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_X")); - float alpha = PADDLE_GET_CONST(float, matmul_op->Op()->GetAttr("alpha")); - flag = flag && !transpose_X && std::abs(alpha - 1.0) < 1e-5; - - std::vector x_shape = matmul_in_x->Var()->GetShape(); - std::vector y_shape = matmul_in_y->Var()->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - flag = flag && x_rank >= 2 && y_rank == 2; - - if (flag) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtMapMatmul2MulPass in op compat failed."; - return; - } - OpDesc desc(matmul_op->Op()->Block()); - desc.SetType("mul"); - desc.SetInput("X", {matmul_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); - desc.SetAttr("y_num_col_dims", 1); - desc.SetAttr("transpose_Y", matmul_op->Op()->GetAttr("transpose_Y")); - if (matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag = false; - bool outscale_flag = false; - - if (matmul_op->Op()->HasAttr("X")) { - desc.SetAttr("X", matmul_op->Op()->GetAttr("X")); - inscale_flag = true; - } - if (matmul_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag && outscale_flag); - - auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(matmul_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - GraphSafeRemoveNodes(graph, {matmul_op}); - ++found_count; - - if (!IsCompat(desc)) { - LOG(WARNING) << "TrtMapMatmul2MulPass in out mul op compat failed."; - return; - } - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -void TrtMapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string name_scope = 
"trt_map_matmul_v2_to_mul_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - patterns::MatmulV2Weight matmul_v2_weight_pattern(gpd.mutable_pattern(), - name_scope); - matmul_v2_weight_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(3) << "trt map matmul_v2 to mul"; - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_in_x, matmul_v2_in_x, matmul_v2_weight_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_in_y, matmul_v2_in_y, matmul_v2_weight_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_op, matmul_v2_op, matmul_v2_weight_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_out, matmul_v2_out, matmul_v2_weight_pattern); - - bool flag = true; - bool trans_x = - PADDLE_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_x")); - flag = flag && !trans_x; - - std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); - std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); - size_t x_rank = x_shape.size(); - size_t y_rank = y_shape.size(); - flag = flag && x_rank >= 2 && y_rank == 2; - - if (flag) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtMapMatmulV2ToMulPass in op compat failed."; - return; - } - OpDesc desc(matmul_v2_op->Op()->Block()); - desc.SetType("mul"); - desc.SetInput("X", {matmul_v2_in_x->Name()}); - desc.SetInput("Y", {matmul_v2_in_y->Name()}); - desc.SetOutput("Out", {matmul_v2_out->Name()}); - desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); - desc.SetAttr("y_num_col_dims", 1); - desc.SetAttr("transpose_Y", matmul_v2_op->Op()->GetAttr("trans_y")); - if (matmul_v2_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_v2_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_v2_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag = false; - bool outscale_flag = false; - if (matmul_v2_op->Op()->HasAttr("X")) { - desc.SetAttr("X", matmul_v2_op->Op()->GetAttr("X")); - inscale_flag = true; - } - if (matmul_v2_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_v2_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag && outscale_flag); - - auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(matmul_v2_in_x, mul_node); - IR_NODE_LINK_TO(matmul_v2_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_v2_out); - GraphSafeRemoveNodes(graph, {matmul_v2_op}); - ++found_count; - - if (!IsCompat(desc)) { - LOG(WARNING) << "TrtMapMatmulV2ToMulPass in out mul op compat failed."; - return; - } - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -void TrtMapMatmulV2ToMatmulPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string name_scope = "trt_map_matmul_v2_to_matmul_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - patterns::MatmulV2 matmul_v2_pattern(gpd.mutable_pattern(), name_scope); - matmul_v2_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "trt map matmul_v2 to matmul"; - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_in_x, matmul_v2_in_x, matmul_v2_pattern); - GET_IR_NODE_FROM_SUBGRAPH( - matmul_v2_in_y, matmul_v2_in_y, matmul_v2_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, matmul_v2_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, 
matmul_v2_pattern); - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtMapMatmulV2ToMatmulPass in op compat failed."; - return; - } - - std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); - std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); - if (x_shape.size() != y_shape.size()) { - LOG(WARNING) - << "matmul op not support broadcast, please check inputs'shape. "; - return; - } - uint64_t dims = 2; - for (size_t i = 0; i < x_shape.size() - dims; ++i) { - if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { - LOG(WARNING) << "matmul op not support broadcast, please check " - "inputs'shape[i]. "; - return; - } - } - - OpDesc desc(matmul_v2_op->Op()->Block()); - desc.SetType("matmul"); - desc.SetInput("X", {matmul_v2_in_x->Name()}); - desc.SetInput("Y", {matmul_v2_in_y->Name()}); - desc.SetOutput("Out", {matmul_v2_out->Name()}); - desc.SetAttr("transpose_X", matmul_v2_op->Op()->GetAttr("trans_x")); - desc.SetAttr("transpose_Y", matmul_v2_op->Op()->GetAttr("trans_y")); - desc.SetAttr("alpha", 1.0f); - if (matmul_v2_op->Op()->HasAttr("use_mkldnn")) { - desc.SetAttr("use_mkldnn", matmul_v2_op->Op()->GetAttr("use_mkldnn")); - } - if (matmul_v2_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_v2_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_v2_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag = false; - bool outscale_flag = false; - if (matmul_v2_op->Op()->HasAttr("X")) { - desc.SetAttr("X", matmul_v2_op->Op()->GetAttr("X")); - inscale_flag = true; - } - if (matmul_v2_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_v2_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag && outscale_flag); - - auto matmul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(matmul_v2_in_x, matmul_node); - IR_NODE_LINK_TO(matmul_v2_in_y, matmul_node); - IR_NODE_LINK_TO(matmul_node, matmul_v2_out); - GraphSafeRemoveNodes(graph, {matmul_v2_op}); - ++found_count; - - if (!IsCompat(desc)) { - LOG(WARNING) - << "TrtMapMatmulV2ToMatmulPass in out matmul op compat failed."; - return; - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -void TrtSqueeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string name_scope = "trt_squeeze2_matmul_fuse_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - patterns::Squeeze2Matmul fuse_pattern(gpd.mutable_pattern(), name_scope); - fuse_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "trt fuse squeeze2+matmul to mul"; - GET_IR_NODE_FROM_SUBGRAPH(squeeze2_in_x, squeeze2_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(squeeze2_op, squeeze2_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, fuse_pattern); - bool flag = true; - - size_t squeeze2_in_x_rank = (squeeze2_in_x->Var()->GetShape()).size(); - std::vector squeeze2_op_axes = - PADDLE_GET_CONST(std::vector, squeeze2_op->Op()->GetAttr("axes")); - flag = flag && squeeze2_in_x_rank == 4 && - squeeze2_op_axes == std::vector{2, 3} && - 
(matmul_in_x->outputs).size() == 1 && - matmul_in_y->Var()->Persistable(); - - bool transpose_X = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_X")); - bool transpose_Y = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_Y")); - float alpha = PADDLE_GET_CONST(float, matmul_op->Op()->GetAttr("alpha")); - size_t matmul_in_x_rank = (matmul_in_x->Var()->GetShape()).size(); - size_t matmul_in_y_rank = (matmul_in_y->Var()->GetShape()).size(); - flag = flag && !transpose_X && !transpose_Y && - std::abs(alpha - 1.0) < 1e-5 && matmul_in_x_rank == 2 && - matmul_in_y_rank == 2; - - std::vector& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; - - if (flag) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtSqueeze2MatmulFusePass in op compat failed."; - return; - } - OpDesc desc(matmul_op->Op()->Block()); - desc.SetType("mul"); - desc.SetInput("X", {squeeze2_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); - desc.SetAttr("y_num_col_dims", 1); - if (matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag_x = false; - bool outscale_flag = false; - - if (squeeze2_op->Op()->HasAttr("X")) { - desc.SetAttr("X", squeeze2_op->Op()->GetAttr("X")); - inscale_flag_x = true; - } - if (matmul_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag_x && outscale_flag); - - auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(squeeze2_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - GraphSafeRemoveNodes(graph, {squeeze2_op, matmul_in_x, matmul_op}); - ++found_count; - if (!IsCompat(desc)) { - LOG(WARNING) - << "TrtSqueeze2MatmulFusePass in out mul op compat failed."; - return; - } - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -TrtReshape2MatmulFusePass::TrtReshape2MatmulFusePass() { - AddOpCompat(OpCompat("reshape2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Shape") - .IsTensor() - .IsOptional() - .End() - .AddInput("ShapeTensor") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddOutput("XShape") - .IsTensor() - .End() - .AddAttr("shape") // ints - .IsType>() - .End(); - - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumGT(0.99999f) - .IsNumLT(1.00001f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") - .IsBoolEQ(false) - .End(); - - AddOpCompat(OpCompat("mul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(1) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); -} - -void TrtReshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string name_scope = "trt_reshape2_matmul_fuse_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - 
patterns::Reshape2Matmul fuse_pattern(gpd.mutable_pattern(), name_scope); - fuse_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "trt fuse reshape2+matmul to mul"; - GET_IR_NODE_FROM_SUBGRAPH(reshape2_in_x, reshape2_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(reshape2_op, reshape2_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, fuse_pattern); - bool flag = true; - - size_t reshape2_in_nums = reshape2_op->inputs.size(); - auto reshape2_in_x_shape = reshape2_in_x->Var()->GetShape(); - size_t reshape2_in_x_rank = reshape2_in_x_shape.size(); - std::vector reshape2_op_shape = - PADDLE_GET_CONST(std::vector, reshape2_op->Op()->GetAttr("shape")); - flag = flag && reshape2_in_nums == 1 && reshape2_in_x_rank == 4 && - reshape2_in_x_shape[2] == 1 && reshape2_in_x_shape[3] == 1 && - reshape2_op_shape.size() == 2 && (matmul_in_x->outputs).size() == 1; - - bool transpose_X = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_X")); - bool transpose_Y = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_Y")); - float alpha = PADDLE_GET_CONST(float, matmul_op->Op()->GetAttr("alpha")); - size_t matmul_in_x_rank = (matmul_in_x->Var()->GetShape()).size(); - size_t matmul_in_y_rank = (matmul_in_y->Var()->GetShape()).size(); - flag = flag && !transpose_X && !transpose_Y && - std::abs(alpha - 1.0) < 1e-5 && matmul_in_x_rank == 2 && - matmul_in_y_rank == 2 && matmul_in_y->Var()->Persistable(); - - std::vector& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; - - if (flag) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtReshape2MatmulFusePass in op compat failed."; - return; - } - OpDesc desc(matmul_op->Op()->Block()); - desc.SetType("mul"); - desc.SetInput("X", {reshape2_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); - desc.SetAttr("y_num_col_dims", 1); - if (matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag_x = false; - bool outscale_flag = false; - - if (reshape2_op->Op()->HasAttr("X")) { - desc.SetAttr("X", reshape2_op->Op()->GetAttr("X")); - inscale_flag_x = true; - } - if (matmul_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag_x && outscale_flag); - - if (!IsCompat(desc)) { - LOG(WARNING) - << "TrtReshape2MatmulFusePass in out mul op compat failed."; - return; - } - auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(reshape2_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - GraphSafeRemoveNodes(graph, {reshape2_op, matmul_in_x, matmul_op}); - ++found_count; - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -void TrtFlatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - std::string 
name_scope = "trt_flatten2_matmul_fuse_pass"; - FusePassBase::Init(name_scope, graph); - - GraphPatternDetector gpd; - patterns::Flatten2Matmul fuse_pattern(gpd.mutable_pattern(), name_scope); - fuse_pattern(); - - int found_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "trt fuse flatten2+matmul to mul"; - GET_IR_NODE_FROM_SUBGRAPH(flatten2_in_x, flatten2_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(flatten2_op, flatten2_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, fuse_pattern); - GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, fuse_pattern); - bool pattern_found = true; - - size_t flatten2_in_nums = flatten2_op->inputs.size(); - auto flatten2_in_x_shape = flatten2_in_x->Var()->GetShape(); - size_t flatten2_in_x_rank = flatten2_in_x_shape.size(); - int flatten2_axis = - PADDLE_GET_CONST(int, flatten2_op->Op()->GetAttr("axis")); - // only convert matmul to mul when the flatten2 has a single input - // and the rank of input is 4 and the size of the output of matmul - // is 1. - pattern_found = pattern_found && flatten2_in_nums == 1 && - flatten2_in_x_rank == 4 && - (matmul_in_x->outputs).size() == 1; - - bool transpose_X = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_X")); - bool transpose_Y = - PADDLE_GET_CONST(bool, matmul_op->Op()->GetAttr("transpose_Y")); - float alpha = PADDLE_GET_CONST(float, matmul_op->Op()->GetAttr("alpha")); - size_t matmul_in_x_rank = (matmul_in_x->Var()->GetShape()).size(); - size_t matmul_in_y_rank = (matmul_in_y->Var()->GetShape()).size(); - pattern_found = pattern_found && !transpose_X && !transpose_Y && - std::abs(alpha - 1.0) < 1e-5 && matmul_in_x_rank == 2 && - matmul_in_y_rank == 2 && matmul_in_y->Var()->Persistable(); - - std::vector& next_ops = matmul_out->outputs; - // we further require the matmul op is followed by one elementwise - // add op. 
- pattern_found = pattern_found && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; - - if (pattern_found) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "TrtFlatten2MatmulFusePass in op compat failed."; - return; - } - OpDesc desc(matmul_op->Op()->Block()); - desc.SetType("mul"); - desc.SetInput("X", {flatten2_in_x->Name()}); - desc.SetInput("Y", {matmul_in_y->Name()}); - desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", flatten2_axis); - desc.SetAttr("y_num_col_dims", 1); - if (matmul_op->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); - desc.SetAttr("Input_scale", matmul_op->Op()->GetAttr("Input_scale")); - desc.SetAttr("out_threshold", - matmul_op->Op()->GetAttr("out_threshold")); - } - - bool inscale_flag_x = false; - bool outscale_flag = false; - - if (flatten2_op->Op()->HasAttr("X")) { - desc.SetAttr("X", flatten2_op->Op()->GetAttr("X")); - inscale_flag_x = true; - } - if (matmul_op->Op()->HasAttr("Out")) { - desc.SetAttr("Out", matmul_op->Op()->GetAttr("Out")); - outscale_flag = true; - } - desc.SetAttr("support_int8", inscale_flag_x && outscale_flag); - - auto mul_node = g->CreateOpNode(&desc); - IR_NODE_LINK_TO(flatten2_in_x, mul_node); - IR_NODE_LINK_TO(matmul_in_y, mul_node); - IR_NODE_LINK_TO(mul_node, matmul_out); - GraphSafeRemoveNodes(graph, {flatten2_op, matmul_in_x, matmul_op}); - ++found_count; - - if (!IsCompat(desc)) { - LOG(WARNING) - << "TrtFlatten2MatmulFusePass in out mul op compat failed."; - return; - } - } - }; - - gpd(graph, handler); - AddStatis(found_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(trt_map_matmul_to_mul_pass, - paddle::framework::ir::TrtMapMatmul2MulPass); -REGISTER_PASS_CAPABILITY(trt_map_matmul_to_mul_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("matmul", 1) - .EQ("mul", 0)); - -REGISTER_PASS(trt_map_matmul_v2_to_mul_pass, - paddle::framework::ir::TrtMapMatmulV2ToMulPass); -REGISTER_PASS_CAPABILITY(trt_map_matmul_v2_to_mul_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("matmul_v2", 0) - .EQ("mul", 0)); - -REGISTER_PASS(trt_map_matmul_v2_to_matmul_pass, - paddle::framework::ir::TrtMapMatmulV2ToMatmulPass); -REGISTER_PASS_CAPABILITY(trt_map_matmul_v2_to_matmul_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("matmul_v2", 0) - .LE("matmul", 1)); - -REGISTER_PASS(trt_squeeze2_matmul_fuse_pass, - paddle::framework::ir::TrtSqueeze2MatmulFusePass); -REGISTER_PASS_CAPABILITY(trt_squeeze2_matmul_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("matmul", 1) - .EQ("squeeze2", 0) - .EQ("mul", 0)); - -REGISTER_PASS(trt_reshape2_matmul_fuse_pass, - paddle::framework::ir::TrtReshape2MatmulFusePass); -REGISTER_PASS_CAPABILITY(trt_reshape2_matmul_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("matmul", 1) - .EQ("reshape2", 0) - .EQ("mul", 0)); - -REGISTER_PASS(trt_flatten2_matmul_fuse_pass, - paddle::framework::ir::TrtFlatten2MatmulFusePass); -REGISTER_PASS_CAPABILITY(trt_flatten2_matmul_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("matmul", 1) - .EQ("flatten2", 0) - .EQ("mul", 0)); diff --git a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.h 
b/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.h deleted file mode 100644 index c382837f10a6c1..00000000000000 --- a/paddle/fluid/framework/ir/trt_map_matmul_to_mul_pass.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -class Graph; - -class TrtMapMatmul2MulPass : public FusePassBase { - public: - TrtMapMatmul2MulPass(); - virtual ~TrtMapMatmul2MulPass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -/* - * Map matmul_v2 to mul, the same as TrtMapMatmul2MulPass. - */ -class TrtMapMatmulV2ToMulPass : public FusePassBase { - public: - TrtMapMatmulV2ToMulPass(); - virtual ~TrtMapMatmulV2ToMulPass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -/* - * Map matmul_v2 to matmul, not supoort broadcast. - */ -class TrtMapMatmulV2ToMatmulPass : public FusePassBase { - public: - TrtMapMatmulV2ToMatmulPass(); - virtual ~TrtMapMatmulV2ToMatmulPass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -/* - * Fuse squeeze2+matmul to mul, so the optimization can use fc_fuse_pass. - * The squeeze2 op must satisfy the following conditions: - * 1. the rank of input X is 4 - * 2. the axis attr is [2, 3] - * 3. the next op is only matmul - * - * The matmul op must satisfy the following conditions: - * 1. the transpose_X and transpose_Y attrs are false - * 2. the alpha attr is 1.0 - * 3. the rank of input X and Y is 2 - * 4. the next op of matmul is only elementwise_add - * - * Notice: - * the rank of input activation is obtained from var_desc, - * it maybe change in runtime. Therefore, the pass considers - * the above passes to reduce the impact on other models. - */ - -class TrtSqueeze2MatmulFusePass : public FusePassBase { - public: - TrtSqueeze2MatmulFusePass(); - virtual ~TrtSqueeze2MatmulFusePass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -/* - * Fuse reshape2+matmul to mul, so the optimization can use fc_fuse_pass. - * The reshape2 op must satisfy the following conditions: - * 1. reshape2 has one input node, which means it don't - * have Shape or ShapeTensor input - * 2. the rank of input X is 4 and the last two dims of input X is 1 - * 3. the rank of shape attr is 2 - * 4. the next op is only matmul - * - * The matmul op must satisfy the following conditions: - * 1. the transpose_X and transpose_Y attrs are false - * 2. the alpha attr is 1.0 - * 3. the rank of input X and Y is 2 - * 4. the next op of matmul is only elementwise_add - * - * Notice: - * the shape and rank of input activation is obtained from var_desc, - * they maybe change in runtime. 
Therefore, the pass considers - * the above passes to reduce the impact on other models. - */ - -class TrtReshape2MatmulFusePass : public FusePassBase { - public: - TrtReshape2MatmulFusePass(); - virtual ~TrtReshape2MatmulFusePass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -class TrtFlatten2MatmulFusePass : public FusePassBase { - public: - TrtFlatten2MatmulFusePass(); - virtual ~TrtFlatten2MatmulFusePass() {} - - protected: - void ApplyImpl(Graph* graph) const override; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc new file mode 100644 index 00000000000000..a616329de430e0 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h" + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +TrtMapOpsToMatrixMultiplyPass::TrtMapOpsToMatrixMultiplyPass() {} + +void TrtMapOpsToMatrixMultiplyPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "trt_map_ops_to_matrix_multiply_pass"; + FusePassBase::Init(name_scope, graph); + + std::unordered_set ops_type = {"mul", "matmul", "matmul_v2"}; + GraphPatternDetector gpd; + patterns::MulMatmulMatmulV2 mul_matmul_matmul_v2(gpd.mutable_pattern(), + name_scope); + mul_matmul_matmul_v2(ops_type); + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!with_dynamic_shape) { + VLOG(3) + << "TrtMapOpsToMatrixMultiplyPass need with_dynamic_shape, stop this " + "pass." + "Please reconfig 'SetTRTDynamicShapeInfo'. 
You can refer to the " + "https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/" + "master/c%2B%2B/gpu/resnet50/resnet50_test.cc"; + return; + } + VLOG(4) << "trt map some ops to matrix_multiply"; + GET_IR_NODE_FROM_SUBGRAPH(ops, ops, mul_matmul_matmul_v2); + GET_IR_NODE_FROM_SUBGRAPH(ops_out, ops_out, mul_matmul_matmul_v2); + OpDesc desc(ops->Op()->Block()); + desc.SetType("matrix_multiply"); + desc.SetInput("X", {ops->Op()->Input("X").front()}); + desc.SetInput("Y", {ops->Op()->Input("Y").front()}); + desc.SetOutput("Out", {ops_out->Name()}); + + if (ops->Op()->HasAttr("transpose_X") || ops->Op()->HasAttr("trans_x")) { + if (ops->Op()->HasAttr("transpose_X")) { + desc.SetAttr("transpose_x", ops->Op()->GetAttr("transpose_X")); + } else { + desc.SetAttr("transpose_x", ops->Op()->GetAttr("trans_x")); + } + } else { + desc.SetAttr("transpose_x", false); + } + + if (ops->Op()->HasAttr("transpose_Y") || ops->Op()->HasAttr("trans_y")) { + if (ops->Op()->HasAttr("transpose_Y")) { + desc.SetAttr("transpose_y", ops->Op()->GetAttr("transpose_Y")); + } else { + desc.SetAttr("transpose_y", ops->Op()->GetAttr("trans_y")); + } + } else { + desc.SetAttr("transpose_y", false); + } + + if (ops->Op()->HasAttr("out_threshold")) { + desc.SetAttr("out_threshold", ops->Op()->GetAttr("out_threshold")); + } + + // Todo: remove attr(x_num_col_dims, y_num_col_dims, alpha) + if (ops->Op()->HasAttr("x_num_col_dims")) { + desc.SetAttr("x_num_col_dims", ops->Op()->GetAttr("x_num_col_dims")); + } else { + int32_t x_num_col_dims = -1; + desc.SetAttr("x_num_col_dims", x_num_col_dims); + } + + // op_teller: Only support y_num_col_dims == y.rank - 1; + int32_t y_num_col_dims = -1; + desc.SetAttr("y_num_col_dims", y_num_col_dims); + + float alpha = 1; + if (ops->Op()->HasAttr("alpha")) { + alpha = PADDLE_GET_CONST(float, ops->Op()->GetAttr("alpha")); + } + desc.SetAttr("alpha", alpha); + + auto matrix_multiply_node = g->CreateOpNode(&desc); + for (auto node : ops->inputs) { + IR_NODE_LINK_TO(node, matrix_multiply_node); + } + IR_NODE_LINK_TO(matrix_multiply_node, ops_out); + GraphSafeRemoveNodes(graph, {ops}); + ++found_count; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trt_map_ops_to_matrix_multiply_pass, + paddle::framework::ir::TrtMapOpsToMatrixMultiplyPass); diff --git a/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h new file mode 100644 index 00000000000000..efe051c55653ee --- /dev/null +++ b/paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
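+//
+// TrtMapOpsToMatrixMultiplyPass rewrites mul / matmul / matmul_v2 ops into a
+// single canonical matrix_multiply op (dynamic-shape TensorRT only), so that
+// later fuse passes match one op type. Attribute mapping, as implemented in
+// the .cc file:
+//   transpose_X / trans_x -> transpose_x  (default false)
+//   transpose_Y / trans_y -> transpose_y  (default false)
+//   x_num_col_dims        -> kept if present, otherwise -1
+//   y_num_col_dims        -> always -1
+//                            (op_teller only supports y_num_col_dims == y.rank - 1)
+//   alpha                 -> kept if present, otherwise 1.0f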
+ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class TrtMapOpsToMatrixMultiplyPass : public FusePassBase { + public: + TrtMapOpsToMatrixMultiplyPass(); + virtual ~TrtMapOpsToMatrixMultiplyPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index cf42775c2bdde9..11705c33121b69 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -257,18 +257,16 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { } PDNode* TrtMultiHeadMatmulPattern::operator()() { - std::unordered_set mul_ops{"mul", "matmul_v2"}; - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_ops_input(mul_ops); + input0->assert_is_op_input("matrix_multiply"); // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(mul_ops); + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matrix_multiply"); auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) ->AsInput() - ->assert_is_ops_input(mul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_ops_output(mul_ops); + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul0) eltadd0; decltype(mul0) eltadd0_b_var; @@ -301,12 +299,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); auto* scale_out_var = pattern->NewNode(scale_out_repr())->assert_is_op_output("scale"); - scale_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + scale_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply"); auto* matmul_qk = - pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); - auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(matmul_qk_repr())->assert_is_op("matrix_multiply"); + auto* matmul_qk_out_var = pattern->NewNode(matmul_qk_out_repr()) + ->assert_is_op_output("matrix_multiply"); matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); auto* eltadd_qk = @@ -322,12 +320,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); - softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply"); auto* matmul_qkv = - pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); - auto* matmul_qkv_out_var = - pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matrix_multiply"); + auto* matmul_qkv_out_var = pattern->NewNode(matmul_qkv_out_repr()) + ->assert_is_op_output("matrix_multiply"); matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); auto* transpose2_qkv = @@ -340,15 +338,14 @@ 
PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_ops_input(mul_ops); // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(mul_ops); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matrix_multiply"); auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) ->AsInput() - ->assert_is_ops_input(mul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_ops_output(mul_ops); + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul1) eltadd1; decltype(mul1) eltadd1_b_var; @@ -375,16 +372,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( - matmul_ops); // link to matmul qk + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matrix_multiply"); // link to matmul qk // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(mul_ops); + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matrix_multiply"); auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) ->AsInput() - ->assert_is_ops_input(mul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_ops_output(mul_ops); + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul2) eltadd2; decltype(mul2) eltadd2_b_var; @@ -411,8 +408,8 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( - matmul_ops); // link to matmul qkv + transpose2_2_out_var->AsIntermediate()->assert_is_op_input( + "matrix_multiply"); // link to matmul qkv // Q path mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); @@ -449,17 +446,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { } PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_ops_input(matmul_ops); + input0->assert_is_op_input("matrix_multiply"); // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops); + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matrix_multiply"); auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) ->AsInput() - ->assert_is_ops_input(matmul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul0) eltadd0; decltype(mul0) eltadd0_b_var; @@ -487,12 +483,13 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X"); + 
transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply", + "X"); auto* matmul_qk = - pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); - auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(matmul_qk_repr())->assert_is_op("matrix_multiply"); + auto* matmul_qk_out_var = pattern->NewNode(matmul_qk_out_repr()) + ->assert_is_op_output("matrix_multiply"); matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); auto* eltadd_qk = @@ -508,12 +505,12 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); - softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply"); auto* matmul_qkv = - pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); - auto* matmul_qkv_out_var = - pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matrix_multiply"); + auto* matmul_qkv_out_var = pattern->NewNode(matmul_qkv_out_repr()) + ->assert_is_op_output("matrix_multiply"); matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); auto* transpose2_qkv = @@ -526,14 +523,13 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_ops_input(matmul_ops); // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matrix_multiply"); auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) ->AsInput() - ->assert_is_ops_input(matmul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul1) eltadd1; decltype(mul1) eltadd1_b_var; @@ -560,16 +556,16 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( - matmul_ops, "Y"); // link to matmul qk + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matrix_multiply", "Y"); // link to matmul qk // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops); + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matrix_multiply"); auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) ->AsInput() - ->assert_is_ops_input(matmul_ops, "Y"); + ->assert_is_op_input("matrix_multiply", "Y"); auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matrix_multiply"); decltype(mul2) eltadd2; decltype(mul2) eltadd2_b_var; @@ -596,8 +592,8 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) ->assert_is_op_output("transpose2"); - 
transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( - matmul_ops); // link to matmul qkv + transpose2_2_out_var->AsIntermediate()->assert_is_op_input( + "matrix_multiply"); // link to matmul qkv // Q path mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); @@ -642,23 +638,6 @@ void TrtMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { } TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() { - AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) - .IsTensor() - .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(2) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("elementwise_add")) .AddInput("X") // in bias, shape is (B, S, N*H), @@ -738,45 +717,6 @@ TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() { .IsType() .End(); - // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) - // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsNumEQ(1.0f) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") // QK(true) QKV(false) - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsType() - .End() - .AddAttr("trans_y") - .IsType() - .End(); - AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -1187,23 +1127,6 @@ void TrtMultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { } TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() { - AddOpCompat(OpCompat("mul")) - .AddInput("X") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddInput("Y") // the shape shoule be (N*H, N*H) - .IsTensor() - .End() - .AddOutput("Out") // the shape shoule be (B, S, N*H) - .IsTensor() - .End() - .AddAttr("x_num_col_dims") - .IsNumEQ(2) - .End() - .AddAttr("y_num_col_dims") - .IsNumEQ(1) - .End(); - AddOpCompat(OpCompat("elementwise_add")) .AddInput("X") // in bias, shape is (B, S, N*H), @@ -1266,45 +1189,6 @@ TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() { .IsType>() .End(); - // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) - // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) - AddOpCompat(OpCompat("matmul")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("alpha") - .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) - .End() - .AddAttr("transpose_X") - .IsBoolEQ(false) - .End() - .AddAttr("transpose_Y") // QK(true) QKV(false) - .IsType() - .End(); - - AddOpCompat(OpCompat("matmul_v2")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("trans_x") - .IsBoolEQ(false) - .End() - .AddAttr("trans_y") // QK(true) QKV(false) - .IsType() - .End(); - AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -1672,12 +1556,10 @@ REGISTER_PASS(trt_multihead_matmul_fuse_pass_v3, REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("mul", 0) .LE("elementwise_add", 1) .EQ("reshape2", 0) .EQ("transpose2", 0) 
.EQ("scale", 0) - .LE("matmul", 1) .EQ("softmax", 0)); REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3) @@ -1687,6 +1569,4 @@ REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3) .EQ("reshape2", 0) .EQ("transpose2", 0) .EQ("scale", 0) - .LE("matmul", 1) - .EQ("matmul_v2", 0) .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc index 4661d2cbf27566..02ae3e29595c3b 100644 --- a/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_skip_layernorm_fuse_pass.cc @@ -176,10 +176,17 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Bias", {layer_norm_bias->Name()}); if (layer_norm->Op()->HasAttr("out_threshold")) { - new_desc.SetAttr("enable_int8", true); new_desc.SetAttr("out_threshold", layer_norm->Op()->GetAttr("out_threshold")); } + if (subgraph.at(x)->inputs[0]->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr( + "X", subgraph.at(x)->inputs[0]->Op()->GetAttr("out_threshold")); + } + if (subgraph.at(y)->inputs[0]->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr( + "Y", subgraph.at(y)->inputs[0]->Op()->GetAttr("out_threshold")); + } if (layer_norm->Op()->HasAttr("smooth_scale")) { new_desc.SetAttr("smooth_scale", diff --git a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc index 3ff91e0bcb76c1..c27c7430c2af60 100644 --- a/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc +++ b/paddle/fluid/framework/ir/vit_attention_fuse_pass.cc @@ -79,7 +79,7 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const { auto* scope = param_scope(); // pattern - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + std::unordered_set matmul_ops{"matrix_multiply"}; PDNode* x = gpd.mutable_pattern() ->NewNode("x") ->assert_is_ops_input(matmul_ops, "X") @@ -173,5 +173,4 @@ REGISTER_PASS_CAPABILITY(vit_attention_fuse_pass) .EQ("transpose2", 0) .EQ("slice", 0) .EQ("scale", 0) - .EQ("softmax", 0) - .EQ("matmul_v2", 0)); + .EQ("softmax", 0)); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d718ab936b5302..45e4d763e6bd9f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2552,13 +2552,11 @@ USE_TRT_CONVERTER(transpose); USE_TRT_CONVERTER(transpose2); USE_TRT_CONVERTER(flatten); USE_TRT_CONVERTER(flatten_contiguous_range); -USE_TRT_CONVERTER(matmul); -USE_TRT_CONVERTER(matmul_v2); +USE_TRT_CONVERTER(matrix_multiply); USE_TRT_CONVERTER(bmm); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); USE_TRT_CONVERTER(sigmoid); -USE_TRT_CONVERTER(fc); USE_TRT_CONVERTER(pool2d); USE_TRT_CONVERTER(softmax); USE_TRT_CONVERTER(batch_norm); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index cd46398c66fcc4..b30241cd2a83a7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -86,17 +86,17 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ "trt_support_nhwc_pass", - "adaptive_pool2d_convert_global_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_fill_constant_op_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // - "trt_delete_weight_dequant_linear_op_pass", // 
- "delete_quant_dequant_linear_op_pass", // - "identity_scale_op_clean_pass", // - "add_support_int8_pass", // - // "fc_fuse_pass", // + "adaptive_pool2d_convert_global_pass", // + "trt_map_ops_to_matrix_multiply_pass", // + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_fill_constant_op_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // + "trt_delete_weight_dequant_linear_op_pass", // + "delete_quant_dequant_linear_op_pass", // + "identity_scale_op_clean_pass", // + "add_support_int8_pass", // "simplify_with_basic_ops_pass", // "trt_embedding_eltwise_layernorm_fuse_pass", // "preln_embedding_eltwise_layernorm_fuse_pass", // @@ -119,18 +119,12 @@ const std::vector kTRTSubgraphPasses({ "trt_skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // #endif - "preln_residual_bias_fuse_pass", // - "preln_layernorm_x_fuse_pass", // - "reverse_roll_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // + "preln_residual_bias_fuse_pass", // + "preln_layernorm_x_fuse_pass", // + "reverse_roll_fuse_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "conv_elementwise_add_fuse_pass", // #if defined _WIN32 // Windows CI is TensorRT7.0. Remove this after upgrading. #else "trans_layernorm_fuse_pass", // @@ -216,10 +210,6 @@ const std::vector kTrtLowerPrecisionPasses{ // "conv_eltwiseadd_bn_fuse_pass", "trt_embedding_eltwise_layernorm_fuse_pass", "trt_skip_layernorm_fuse_pass", - "trt_map_matmul_v2_to_mul_pass", - "trt_map_matmul_v2_to_matmul_pass", - "trt_map_matmul_to_mul_pass", - "fc_fuse_pass", "tensorrt_subgraph_pass", }; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 1793e1207771e2..13c0137c7d895b 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -2,11 +2,9 @@ list( APPEND CONVERT_FILES - matmul_op.cc - matmul_v2_op.cc + matrix_multiply_op.cc bmm_op.cc conv2d_op.cc - fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc deleted file mode 100644 index fc68662d327160..00000000000000 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" - -namespace paddle { -namespace inference { -namespace tensorrt { -namespace { -template -void tranpose_weight(const T* src, T* dst, int m, int n) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - dst[j * m + i] = src[i * n + j]; - } - } -} -} // namespace - -/* - * FC converter convert a MUL op in Fluid to a FC layer in TRT. - */ -class FcOpConverter : public OpConverter { - public: - nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc, - nvinfer1::Dims x_dim, - int x_num_col_dims, - std::string output_name) { - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_num_col_dims + 3; - // padding shape "* x q x 1 x 1" - - nvinfer1::ITensor* filal_reshape_before_fc_shape_tensor = nullptr; - - if (!engine_->with_dynamic_shape()) { - for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 1; - } - for (int i = 0; i < x_dim.nbDims; i++) { - if (i < x_num_col_dims) { - reshape_before_fc_dim.d[i] = 0; - } else { - reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; - } - } - } else { - std::vector reshape_before_fc_shape_tensor; - nvinfer1::ITensor* input_shape_tensor = Shape(before_fc); - - for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { - reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1)); - } - for (int i = 0; i < x_dim.nbDims; i++) { - if (i < x_num_col_dims) { - reshape_before_fc_shape_tensor[i] = - GetEleTensorOfShape(input_shape_tensor, i); - } else { - reshape_before_fc_shape_tensor[x_num_col_dims] = - Prod(GetEleTensorOfShape(input_shape_tensor, i), - reshape_before_fc_shape_tensor[x_num_col_dims]); - } - } - filal_reshape_before_fc_shape_tensor = - Concat(reshape_before_fc_shape_tensor); - } - - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc); - if (!engine_->with_dynamic_shape()) { - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - } else { - reshape_before_fc_layer->setInput(1, - *filal_reshape_before_fc_shape_tensor); - } - - reshape_before_fc_layer->setName( - ("fc_op_reshape_before_fc: Shuffle (Output: " + output_name + ")") - .c_str()); - return reshape_before_fc_layer; - } - - nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc, - nvinfer1::Dims x_dim, - int x_num_col_dims) { - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - reshape_after_fc_dim.nbDims = x_num_col_dims + 1; - - nvinfer1::ITensor* filal_reshape_after_fc_shape_tensor = nullptr; - - if (!engine_->with_dynamic_shape()) { - for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { - reshape_after_fc_dim.d[i] = 0; - } - } else { - std::vector gather_indices(x_num_col_dims + 1); - std::iota(gather_indices.begin(), gather_indices.end(), 0); - filal_reshape_after_fc_shape_tensor = - Gather(Shape(after_fc), gather_indices); - } - - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc); - if (!engine_->with_dynamic_shape()) { - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); - } else { - reshape_after_fc_layer->setInput(1, *filal_reshape_after_fc_shape_tensor); - } - - return reshape_after_fc_layer; - } - - void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope, - bool test_mode) override { - VLOG(3) << "convert a fc op to tensorrt fc layer without bias"; - framework::OpDesc op_desc(op, nullptr); - auto output_name = op_desc.Output("Out").front(); - auto 
input_names = op_desc.InputNames(); - bool with_bias = input_names.size() >= 3; - std::string w_name = "Y"; - std::string i_name = "X"; - if (with_bias) { - w_name = "W"; - i_name = "Input"; - } - // Declare inputs - auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); - auto x_dim = X->getDimensions(); - // Declare weights - auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); - PADDLE_ENFORCE_NOT_NULL( - Y_v, - platform::errors::NotFound( - "Can not find %s presistale var of fc in scope.", w_name)); - auto* Y_t = Y_v->GetMutable(); - int x_num_col_dims = - op_desc.HasAttr("x_num_col_dims") - ? PADDLE_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) - : (op_desc.HasAttr("in_num_col_dims") - ? PADDLE_GET_CONST(int, op_desc.GetAttr("in_num_col_dims")) - : 1); - const std::string activation_type = - op_desc.HasAttr("activation_type") - ? PADDLE_GET_CONST(std::string, op_desc.GetAttr("activation_type")) - : ""; - - bool enable_int8 = op_desc.HasAttr("enable_int8"); - bool support_int8 = false; - if (op_desc.HasAttr("support_int8")) { - support_int8 = PADDLE_GET_CONST(bool, op_desc.GetAttr("support_int8")); - } - float in_scale = 0; - if (enable_int8 || support_int8) { - if (enable_int8) { - in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale")); - } else { - in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X")); - } - engine_->SetTensorDynamicRange(X, in_scale); - } - - PADDLE_ENFORCE_EQ(Y_t->dims().size(), - 2UL, - platform::errors::InvalidArgument( - "The fc's weight should be a matrix with 2 dims, but " - "it's %d-dimensional.", - Y_t->dims().size())); // a matrix - int m = Y_t->dims()[0]; - int n = Y_t->dims()[1]; - - auto regist_fc = [&](nvinfer1::ITensor* inputs, - int n_output, - TensorRTEngine::Weight& weight, - TensorRTEngine::Weight& bias) { - if (enable_int8 || support_int8) { - // add conv layer - float out_scale = 0; - if (enable_int8) { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("out_threshold"), - true, - platform::errors::InvalidArgument( - "must have out threshold in fc layers in int8 mode")); - out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold")); - } else { - out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out")); - } - nvinfer1::DimsHW nv_ksize(1, 1); - auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, - *inputs, - n_output, - nv_ksize, - weight.get(), - bias.get()); - fc_layer_int8->setName( - ("fc_op_int8_conv1x1: Convolution (Output: " + output_name + ")") - .c_str()); - engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); - auto* fc_after_reshape_int8 = reshape_after_fc( - fc_layer_int8->getOutput(0), x_dim, x_num_col_dims); - if (activation_type == "relu") { - fc_after_reshape_int8->setName( - ("int8_reshape_after_fc: Shuffle (Output: " + output_name + ")") - .c_str()); - engine_->SetTensorDynamicRange(fc_after_reshape_int8->getOutput(0), - out_scale); - nvinfer1::IActivationLayer* relu_layer_int8 = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_after_reshape_int8->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_int8, - "relu_after_fc_shuffle", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput(fc_after_reshape_int8, - "fc_op_int8_reshape_after_fc: Shuffle", - {output_name}, - test_mode); - } - } else { - // add fc layer - auto* fc_layer_float = TRT_ENGINE_ADD_LAYER(engine_, - FullyConnected, - *inputs, - n_output, - weight.get(), - bias.get()); - fc_layer_float->setName( - ("fc_op_float: FullyConnected (Output: " + 
output_name + ")") - .c_str()); - auto* fc_after_reshape_float = reshape_after_fc( - fc_layer_float->getOutput(0), x_dim, x_num_col_dims); - if (activation_type == "relu") { - fc_after_reshape_float->setName( - ("float_reshape_after_fc: Shuffle (Output: " + output_name + ")") - .c_str()); - nvinfer1::IActivationLayer* relu_layer_float = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_after_reshape_float->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_float, - "relu_after_fc_shuffle", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput(fc_after_reshape_float, - "shuffle_after_fc", - {output_name}, - test_mode); - } - } - }; - - bool transpose_y = false; - if (op_desc.HasAttr("transpose_Y")) { - transpose_y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); - } - int weight_w, weight_h; - auto weight = engine_->GetTrtWeight(op_desc.Input(w_name).front(), *Y_t); - - if (!transpose_y) { - if (weight.get().type == nvinfer1::DataType::kFLOAT) { - std::vector weight_data_tmp; - weight_data_tmp.reserve(Y_t->numel()); - memcpy(weight_data_tmp.data(), - weight.get().values, - Y_t->numel() * sizeof(float)); - tranpose_weight( - weight_data_tmp.data(), - const_cast(static_cast(weight.get().values)), - m, - n); - } else if (weight.get().type == nvinfer1::DataType::kHALF) { - std::vector weight_data_tmp; - weight_data_tmp.reserve(Y_t->numel()); - memcpy(weight_data_tmp.data(), - weight.get().values, - Y_t->numel() * sizeof(float16)); - tranpose_weight(weight_data_tmp.data(), - const_cast( - static_cast(weight.get().values)), - m, - n); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Paddle-TRT fc convert not supporte dtype, now only support fp32 " - "and fp16.")); - } - weight_w = n; - weight_h = m; - } else { - weight_w = m; - weight_h = n; - } - size_t n_output = weight_w; - weight.dims.assign({weight_w, weight_h}); - - TensorRTEngine::Weight bias{weight.get().type, nullptr, 0}; - if (with_bias) { - auto* b_v = scope.GetVar(op_desc.Input("Bias").front()); - auto* b_t = b_v->GetMutable(); - bias = engine_->GetTrtWeight(op_desc.Input("Bias").front(), *b_t); - } - - // Running the TRT Static Shape mode: x_num_col_dims-1 - if (!engine_->with_dynamic_shape()) { - x_num_col_dims--; - } - // If use tensorrt'oss, the x_dim and x_num_col_dims need change, and can - // not add Shuffle layer in ernie's multihead. 
- if (x_dim.nbDims == 4 && x_dim.d[2] == 1 && x_dim.d[3] == 1) { - if (enable_int8 || support_int8) { - // add conv1x1 layer - nvinfer1::DimsHW nv_ksize(1, 1); - auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_, - Convolution, - *X, - n_output, - nv_ksize, - weight.get(), - bias.get()); - if (activation_type == "relu") { - fc_layer_int8->setName( - ("ernie_fc_op_int8: Convolution (Output: " + output_name + ")") - .c_str()); - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("out_threshold"), - true, - platform::errors::InvalidArgument( - "must have out threshold in fc layers in int8 mode")); - float out_scale = 0; - if (enable_int8) { - out_scale = - PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold")); - } else { - out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out")); - } - engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), - out_scale); - nvinfer1::IActivationLayer* relu_layer_int8 = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_int8->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_int8, - "relu_after_ernie_fc_int8", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput(fc_layer_int8, - "ernie_fc_op_int8: Convolution", - {output_name}, - test_mode); - } - } else { - // add fc layer - auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *X, n_output, weight.get(), bias.get()); - if (activation_type == "relu") { - fc_layer_float->setName( - ("ernie_fc_op_float: (Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer_float = - TRT_ENGINE_ADD_LAYER(engine_, - Activation, - *(fc_layer_float->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer_float, - "relu_after_ernie_fc_float", - {output_name}, - test_mode); - } else { - RreplenishLayerAndOutput( - fc_layer_float, "ernie_fc_op_float", {output_name}, test_mode); - } - } - } else { // need reshape input before and after fc - PADDLE_ENFORCE_GT( - x_dim.nbDims, - x_num_col_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_dim.nbDims > x_num_col_dims, but " - "x_dim.nbDims : %d, x_num_col_dims : %d.", - x_dim.nbDims, - x_num_col_dims)); - auto* reshape_before_fc_layer = - reshape_before_fc(X, x_dim, x_num_col_dims, output_name); - auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); - if (enable_int8 || support_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - regist_fc(reshape_itensor, n_output, weight, bias); - } - } -}; - -} // namespace tensorrt -} // namespace inference -} // namespace paddle - -REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc deleted file mode 100644 index f2bf84b8429e0f..00000000000000 --- a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -/* - * MatMulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. - */ -class MatMulOpConverter : public OpConverter { - public: - void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope, - bool test_mode) override { - VLOG(3) << "convert a matmul op to tensorrt matmul layer "; - framework::OpDesc op_desc(op, nullptr); - nvinfer1::ILayer* layer = nullptr; - - // Declare inputs - auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); - - nvinfer1::Dims dims_x = input1->getDimensions(); - nvinfer1::Dims dims_y = input2->getDimensions(); - - bool transpose_X = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_X")); - bool transpose_Y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); - - auto output_name = op_desc.Output("Out")[0]; - float alpha = 1; - if (op_desc.HasAttr("alpha")) { - float alpha_tem = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha")); - alpha = alpha_tem; - } - nvinfer1::MatrixOperation matrix_operation_X = - transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - nvinfer1::MatrixOperation matrix_operation_Y = - transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - - if (op_desc.HasAttr("support_int8") && - PADDLE_GET_CONST(bool, op_desc.GetAttr("support_int8")) && - engine_->precision() == AnalysisConfig::Precision::kInt8 && - platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >= - 75) { - if (engine_->with_dynamic_shape()) { - VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT " - "MatmulPluginLayer"; - plugin::MatmulPluginDynamic* plugin = - new plugin::MatmulPluginDynamic(transpose_X, transpose_Y, alpha); - std::vector inputs{input1, input2}; - layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin); - RreplenishLayerAndOutput( - layer, "matmul_op_int8_dynamic", {output_name}, test_mode); - } else { - VLOG(3) << "Convert a fluid matmul_op_int8_static to TensorRT " - "MatmulPluginLayer"; - plugin::MatmulPlugin* plugin = new plugin::MatmulPlugin( - dims_x, dims_y, transpose_X, transpose_Y, alpha); - std::vector inputs{input1, input2}; - layer = engine_->AddPluginV2IOExt(inputs.data(), inputs.size(), plugin); - RreplenishLayerAndOutput( - layer, "matmul_op_int8_static", {output_name}, test_mode); - } - } else { - VLOG(3) << "Convert a fluid matmul_op_float to TensorRT "; - layer = TRT_ENGINE_ADD_LAYER(engine_, - MatrixMultiply, - *input1, - matrix_operation_X, - *input2, - matrix_operation_Y); - if (alpha == 1) { - RreplenishLayerAndOutput( - layer, "matmul_op_float_no_alpha", {output_name}, test_mode); - } else { - layer->setName( - ("matmul_op_float_has_alpha: MatrixMultiplyLayer (Output: " + - output_name + ")") - .c_str()); - // IScaleLayer requires the input must have at least - // three dimensions in static shape mode and at least - // four dimensions in dynamic shape mode. 
- auto* matmul_out = layer->getOutput(0); - nvinfer1::Dims out_shape = matmul_out->getDimensions(); - const int out_dims = out_shape.nbDims; - bool need_change_dim = false; - - if (engine_->with_dynamic_shape()) { - if (out_dims == 3) { - need_change_dim = true; - } - } else { - if (out_dims == 2) { - need_change_dim = true; - } - } - - if (need_change_dim) { - nvinfer1::Dims reshape_dim; - reshape_dim.nbDims = out_dims + 1; - reshape_dim.d[out_dims] = 1; - for (int i = 0; i < out_dims; i++) { - reshape_dim.d[i] = out_shape.d[i]; - } - - auto* reshape_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out); - reshape_layer->setReshapeDimensions(reshape_dim); - matmul_out = reshape_layer->getOutput(0); - reshape_layer->setName(("matmul_op_float_has_alpha_reshape_before: " - "ShuffleLayer (Output: " + - output_name + ")") - .c_str()); - } - - auto create_weights = [&](float data, - const std::string& type) -> float* { - std::unique_ptr tmp_tensor(new phi::DenseTensor()); - tmp_tensor->Resize({1}); - auto* tmp_data = - tmp_tensor->mutable_data(platform::CPUPlace()); - tmp_data[0] = data; - engine_->SetWeights(output_name + "_add_scale_op_" + type, - std::move(tmp_tensor)); - return tmp_data; - }; - float* alpha_data = create_weights(alpha, "alpha"); - float* shift_data = create_weights(0.0, "shift"); - float* power_data = create_weights(1.0, "power"); - TensorRTEngine::Weight nv_alpha{ - nvinfer1::DataType::kFLOAT, static_cast(alpha_data), 1}; - TensorRTEngine::Weight nv_shift{ - nvinfer1::DataType::kFLOAT, static_cast(shift_data), 1}; - TensorRTEngine::Weight nv_power{ - nvinfer1::DataType::kFLOAT, static_cast(power_data), 1}; - auto* scale_layer = TRT_ENGINE_ADD_LAYER(engine_, - Scale, - *matmul_out, - nvinfer1::ScaleMode::kUNIFORM, - nv_shift.get(), - nv_alpha.get(), - nv_power.get()); - auto* scale_out = scale_layer->getOutput(0); - scale_layer->setName( - ("matmul_op_float_has_alpha: ScaleLayer (Output: " + output_name + - ")") - .c_str()); - - if (need_change_dim) { - auto* reshape_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out); - reshape_layer->setReshapeDimensions(out_shape); - scale_out = reshape_layer->getOutput(0); - reshape_layer->setName(("matmul_op_float_has_alpha_reshape_after: " - "ShuffleLayer (Output: " + - output_name + ")") - .c_str()); - } - engine_->SetITensor(output_name, scale_out); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); - } - } - } - } -}; - -} // namespace tensorrt -} // namespace inference -} // namespace paddle - -REGISTER_TRT_OP_CONVERTER(matmul, MatMulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc deleted file mode 100644 index 3dad8fc1d2c245..00000000000000 --- a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h" - -namespace paddle { -namespace inference { -namespace tensorrt { - -/* - * MatMulV2Op, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. - */ -class MatMulV2OpConverter : public OpConverter { - public: - void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope, - bool test_mode) override { - VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer "; - framework::OpDesc op_desc(op, nullptr); - nvinfer1::IMatrixMultiplyLayer* layer = nullptr; - - // Declare inputs - auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); - - nvinfer1::Dims dims_x = input1->getDimensions(); - nvinfer1::Dims dims_y = input2->getDimensions(); - - bool transpose_X = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_x")); - bool transpose_Y = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_y")); - - auto output_name = op_desc.Output("Out")[0]; - - nvinfer1::MatrixOperation matrix_operation_X = - transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - nvinfer1::MatrixOperation matrix_operation_Y = - transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE - : nvinfer1::MatrixOperation::kNONE; - - int one_num = 0; - bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2; - nvinfer1::ITensor* new_shape_tensor = nullptr; - if (dims_x.nbDims < dims_y.nbDims && all_matrix) { - one_num = dims_y.nbDims - dims_x.nbDims; - new_shape_tensor = Shape(input1); - std::vector one_vec(one_num, 1); - auto* one_tensor = Add1DConstantLayer(one_vec); - new_shape_tensor = - Concat(std::vector{one_tensor, new_shape_tensor}); - - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1); - reshape_layer->setInput(1, *new_shape_tensor); - - layer = TRT_ENGINE_ADD_LAYER(engine_, - MatrixMultiply, - *reshape_layer->getOutput(0), - matrix_operation_X, - *input2, - matrix_operation_Y); - - } else if (dims_x.nbDims > dims_y.nbDims && all_matrix) { - one_num = dims_x.nbDims - dims_y.nbDims; - new_shape_tensor = Shape(input2); - std::vector one_vec(one_num, 1); - auto* one_tensor = Add1DConstantLayer(one_vec); - new_shape_tensor = - Concat(std::vector{one_tensor, new_shape_tensor}); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2); - reshape_layer->setInput(1, *new_shape_tensor); - - layer = TRT_ENGINE_ADD_LAYER(engine_, - MatrixMultiply, - *input1, - matrix_operation_X, - *reshape_layer->getOutput(0), - matrix_operation_Y); - - } else { - layer = TRT_ENGINE_ADD_LAYER(engine_, - MatrixMultiply, - *input1, - matrix_operation_X, - *input2, - matrix_operation_Y); - } - if (dims_x.nbDims == 1) - layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR); - if (dims_y.nbDims == 1) - layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR); - nvinfer1::ILayer* final_layer = static_cast(layer); - // When vec * vec, trt produces a scalar, so to be consistent with paddle, - // we need add a reshape. 
- if (dims_x.nbDims == 1 && dims_y.nbDims == 1) { - auto reshape_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); - nvinfer1::Dims reshape_dim; - reshape_dim.nbDims = 1; - reshape_dim.d[0] = 1; - reshape_layer->setReshapeDimensions(reshape_dim); - final_layer = static_cast(reshape_layer); - } - VLOG(3) << "Convert a matmul_v2_op to TensorRT "; - - RreplenishLayerAndOutput( - final_layer, "matmul_v2_op", {output_name}, test_mode); - } -}; - -} // namespace tensorrt -} // namespace inference -} // namespace paddle - -REGISTER_TRT_OP_CONVERTER(matmul_v2, MatMulV2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc new file mode 100644 index 00000000000000..85ffaa9f0768ff --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/matrix_multiply_op.cc @@ -0,0 +1,273 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * After trt_map_ops_to_matrix_multiply_pass(mul, matmul, matmul_v2 -> + * matrix_multiply), use MatrixMultiply layer, ElementWiseOperation::kPROD + * layer. + */ +class MatrixMultiplyOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(3) + << "convert a matrix_multiply op to TensorRT MatrixMultiply layer + " + "ElementWiseOperation::kPROD layer(if alpha != 1)."; + + // Input: X, Y + // Output: Out + // Attributes: transpose_x, transpose_y, x_num_col_dims, y_num_col_dims, + // alpha. extra Attributes(for quant dequant): X, Y, Out, Input_scale, + // out_threshold. 
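+    // Illustrative walk-through (hypothetical shapes, not from the tests):
+    // a matmul_v2 op with X: [B, S, H], Y: [H, H], trans_x = trans_y = false
+    // reaches this converter as matrix_multiply with transpose_x =
+    // transpose_y = false, x_num_col_dims = -1, y_num_col_dims = -1 and
+    // alpha = 1.0f, and lowers to a single IMatrixMultiplyLayer. When alpha
+    // is not 1, an ElementWiseOperation::kPROD with a broadcast alpha
+    // constant is appended (see the tail of this method).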
+    framework::OpDesc op_desc(op, nullptr);
+
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
+
+    bool enable_int8 =
+        (engine_->precision() == AnalysisConfig::Precision::kInt8);
+    float x_scale = 0;
+    float y_scale = 0;
+    float out_scale = 0;
+
+    if (enable_int8) {
+      if (op_desc.HasAttr("Input_scale")) {
+        x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale"));
+        engine_->SetTensorDynamicRange(input1, x_scale);
+      }
+      if (op_desc.HasAttr("X")) {
+        x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X"));
+        engine_->SetTensorDynamicRange(input1, x_scale);
+      }
+
+      if (op_desc.HasAttr("Y")) {
+        y_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Y"));
+        engine_->SetTensorDynamicRange(input2, y_scale);
+      }
+
+      if (op_desc.HasAttr("out_threshold")) {
+        out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
+      }
+      if (op_desc.HasAttr("Out")) {
+        out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out"));
+      }
+    }
+
+    auto output_name = op_desc.Output("Out")[0];
+
+    nvinfer1::Dims dims_x = input1->getDimensions();
+    int32_t x_rank = dims_x.nbDims;
+    nvinfer1::Dims dims_y = input2->getDimensions();
+    int32_t y_rank = dims_y.nbDims;
+
+    int32_t x_num_col_dims =
+        PADDLE_GET_CONST(int32_t, op_desc.GetAttr("x_num_col_dims"));
+    if (x_num_col_dims < 0) {
+      x_num_col_dims += x_rank;
+    }
+
+    // Temporary workaround for the reformat problem in matrix
+    // multiplication: pad the input to rank 4. A proper fix may be possible
+    // in TRT 8.7.
+    if (x_rank == 2 && x_num_col_dims == 1 && engine_->use_varseqlen()) {
+      VLOG(3) << "Temporary workaround for the reformat problem in matrix "
+                 "multiplication: pad the input to rank 4.";
+      auto* reshape_before_matrix =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
+      std::vector<nvinfer1::ITensor*> reshape_before_tensor;
+      reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 0));
+      reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 1));
+      reshape_before_tensor.push_back(Add1DConstantLayer(1));
+      reshape_before_tensor.push_back(Add1DConstantLayer(1));
+
+      reshape_before_matrix->setInput(1, *Concat(reshape_before_tensor));
+      reshape_before_matrix->setName(
+          ("reshape_before_matrix(Output: " + output_name + ")").c_str());
+      input1 = reshape_before_matrix->getOutput(0);
+      dims_x = input1->getDimensions();
+      x_rank = dims_x.nbDims;
+
+      if (enable_int8) {
+        if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
+          engine_->SetTensorDynamicRange(input1, x_scale);
+        }
+      }
+    }
+
+    if (x_num_col_dims != x_rank - 1) {
+      std::vector<nvinfer1::ITensor*> before_shape_tensors;
+      nvinfer1::ITensor* input_shape_tensor = Shape(input1);
+      for (int i = 0; i < x_num_col_dims; ++i) {
+        before_shape_tensors.push_back(
+            GetEleTensorOfShape(input_shape_tensor, i));
+      }
+      nvinfer1::ITensor* producted = Add1DConstantLayer(1);
+      for (int i = x_num_col_dims; i < x_rank; ++i) {
+        producted = Prod(producted, GetEleTensorOfShape(input_shape_tensor, i));
+      }
+      before_shape_tensors.push_back(producted);
+      nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
+      auto* reshape_before_layer =
+          TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
+      reshape_before_layer->setInput(1, *before_shape_tensor);
+      reshape_before_layer->setName(
+          ("reshape_x_before_matrix_multiply: Shuffle (Output: " + output_name +
+           ")")
+              .c_str());
+      input1 = reshape_before_layer->getOutput(0);
+
+      if (enable_int8) {
+        if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
+          engine_->SetTensorDynamicRange(input1, x_scale);
+        }
+      }
+
+      x_rank = x_num_col_dims + 1;
+    }
+
+    int32_t y_num_col_dims =
+        PADDLE_GET_CONST(int32_t, op_desc.GetAttr("y_num_col_dims"));
+    if (y_num_col_dims < 0) {
+      y_num_col_dims += y_rank;
+    }
+    PADDLE_ENFORCE_EQ(
+        y_num_col_dims,
+        y_rank - 1,
+        platform::errors::InvalidArgument(
+            "The matrix_multiply op's y_num_col_dims should be equal "
+            "to y's rank - 1, but got y_num_col_dims = %d and y_rank = %d",
+            y_num_col_dims,
+            y_rank - 1));
+
+    if (x_rank != 1 && y_rank != 1 && x_rank != y_rank) {
+      if (x_rank < y_rank) {
+        std::vector<nvinfer1::ITensor*> before_shape_tensors;
+        nvinfer1::ITensor* input_shape_tensor = Shape(input1);
+        for (int i = 0; i < y_rank - x_rank; ++i) {
+          before_shape_tensors.push_back(Add1DConstantLayer(1));
+        }
+        for (int i = 0; i < x_rank; ++i) {
+          before_shape_tensors.push_back(
+              GetEleTensorOfShape(input_shape_tensor, i));
+        }
+        nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
+        auto* reshape_before_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
+        reshape_before_layer->setInput(1, *before_shape_tensor);
+        reshape_before_layer->setName(
+            ("full_x_before_matrix_multiply: Shuffle (Output: " + output_name +
+             ")")
+                .c_str());
+        input1 = reshape_before_layer->getOutput(0);
+
+        if (enable_int8) {
+          if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
+            engine_->SetTensorDynamicRange(input1, x_scale);
+          }
+        }
+        x_rank = y_rank;
+      } else {
+        std::vector<nvinfer1::ITensor*> before_shape_tensors;
+        nvinfer1::ITensor* input_shape_tensor = Shape(input2);
+
+        for (int i = 0; i < x_rank - y_rank; ++i) {
+          before_shape_tensors.push_back(Add1DConstantLayer(1));
+        }
+        for (int i = 0; i < y_rank; ++i) {
+          before_shape_tensors.push_back(
+              GetEleTensorOfShape(input_shape_tensor, i));
+        }
+        nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
+        auto* reshape_before_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2);
+        reshape_before_layer->setInput(1, *before_shape_tensor);
+        reshape_before_layer->setName(
+            ("full_y_before_matrix_multiply: Shuffle (Output: " + output_name +
+             ")")
+                .c_str());
+        input2 = reshape_before_layer->getOutput(0);
+
+        if (enable_int8) {
+          if (op_desc.HasAttr("Y")) {
+            engine_->SetTensorDynamicRange(input2, y_scale);
+          }
+        }
+      }
+      y_rank = x_rank;
+    }
+
+    nvinfer1::MatrixOperation matrix_operation_x;
+    nvinfer1::MatrixOperation matrix_operation_y;
+
+    if (x_rank == 1) {
+      matrix_operation_x = nvinfer1::MatrixOperation::kVECTOR;
+    } else {
+      bool transpose_x = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_x"));
+      matrix_operation_x = transpose_x ? nvinfer1::MatrixOperation::kTRANSPOSE
+                                       : nvinfer1::MatrixOperation::kNONE;
+    }
+
+    if (y_rank == 1) {
+      matrix_operation_y = nvinfer1::MatrixOperation::kVECTOR;
+    } else {
+      bool transpose_y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_y"));
+      matrix_operation_y = transpose_y ?
nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + } + + nvinfer1::ILayer* layer = nullptr; + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + matrix_operation_x, + *input2, + matrix_operation_y); + + if (enable_int8) { + if (op_desc.HasAttr("out_threshold") || op_desc.HasAttr("Out")) { + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); + } + } + + float alpha = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha")); + if (alpha < 0.999 || alpha > 1.001) { + auto* alpha_tensor = Add1DConstantLayer(alpha); + std::vector alpha_shape_tensors; + for (int i = 0; i < layer->getOutput(0)->getDimensions().nbDims; i++) { + alpha_shape_tensors.push_back(Add1DConstantLayer(1)); + } + auto* reshape_alpha = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *alpha_tensor); + reshape_alpha->setInput(1, *Concat(alpha_shape_tensors)); + layer = TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *layer->getOutput(0), + *reshape_alpha->getOutput(0), + nvinfer1::ElementWiseOperation::kPROD); + } + RreplenishLayerAndOutput( + layer, "matrix_multiply_op", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(matrix_multiply, MatrixMultiplyOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 6eafca682efe8d..da9376d2b83868 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -71,14 +71,6 @@ class MultiheadMatMulOpConverter : public OpConverter { int hidden_out = weight_dims[2]; // channels_out int m = hidden_in; int n = three * hidden_out; - auto tranpose_weight = [](const float* src, float* dst, int m, int n) { - for (int i = 0; i < m; i++) { - for (int j = 0; j < n; j++) { - dst[j * m + i] = src[i * n + j]; - } - } - }; - tranpose_weight(weight_data_tmp.data(), weight_data, m, n); int head_number = PADDLE_GET_CONST(int, op_desc.GetAttr("head_number")); @@ -102,7 +94,6 @@ class MultiheadMatMulOpConverter : public OpConverter { nvinfer1::ITensor* mask_tensor; nvinfer1::ITensor* pos_id_tensor; nvinfer1::ITensor* max_seqlen_tensor; - auto* new_input = input; if (flag_varseqlen) { mask_tensor = engine_->GetITensor("qkv_plugin_mask"); pos_id_tensor = engine_->GetITensor("pos_id"); @@ -188,7 +179,11 @@ class MultiheadMatMulOpConverter : public OpConverter { nvinfer1::ILayer* transformer_input_layer = engine_->AddDynamicPlugin( inputs_transformer.data(), inputs_transformer.size(), plugin); - new_input = transformer_input_layer->getOutput(0); + input = transformer_input_layer->getOutput(0); + if (op_desc.HasAttr("Input_scale")) { + in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale")); + engine_->SetTensorDynamicRange(input, in_scale); + } mask_tensor = transformer_input_layer->getOutput(1); pos_id_tensor = transformer_input_layer->getOutput(2); max_seqlen_tensor = transformer_input_layer->getOutput(3); @@ -204,7 +199,7 @@ class MultiheadMatMulOpConverter : public OpConverter { float dp_probs = 1.0 / 127.0; nvinfer1::DimsHW nv_ksize(1, 1); fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, Convolution, *new_input, n, nv_ksize, weight, bias); + engine_, Convolution, *input, n, nv_ksize, weight, bias); fc_layer->setName( ("Multihead: Convolution/FullyConnected: (Output: " + output_name + ")") @@ -261,22 +256,42 @@ class MultiheadMatMulOpConverter : public OpConverter { 
RreplenishLayerAndOutput( plugin_layer, "multihead_matmul", {output_name}, test_mode); } else { + auto* reshape_before_matrix = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + + std::vector reshape_before_tensor_matrix; + reshape_before_tensor_matrix.push_back( + GetEleTensorOfShape(Shape(input), 0)); + reshape_before_tensor_matrix.push_back( + GetEleTensorOfShape(Shape(input), 1)); + + reshape_before_matrix->setInput( + 1, *Concat(reshape_before_tensor_matrix)); + reshape_before_matrix->setName( + ("reshape_before_matrix(Output: " + output_name + ")").c_str()); + auto* input = reshape_before_matrix->getOutput(0); + if (op_desc.HasAttr("Input_scale")) { + in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale")); + engine_->SetTensorDynamicRange(input, in_scale); + } int head_size = hidden_out / head_number; - // [3, head_number, head_size, hidden_in] -> [head_number, 3, - // head_size, - // hidden_in] + // [hidden_in, 3, head_number, head_size] -> [hidden_in, head_number, + // 3, head_size] auto transpose_weight_v2 = [](const float* src, float* dst, int three, int head_number, int head_size, int hidden_in) { - const int HH = head_size * hidden_in; - for (int i = 0; i < three; ++i) { - for (int n = 0; n < head_number; ++n) { - for (int hh = 0; hh < HH; ++hh) { - dst[n * three * HH + i * HH + hh] = - src[i * head_number * HH + n * HH + hh]; + for (int i = 0; i < hidden_in; ++i) { + for (int j = 0; j < three; ++j) { + for (int n = 0; n < head_number; ++n) { + for (int m = 0; m < head_size; ++m) { + dst[i * head_number * three * head_size + + n * three * head_size + j * head_size + m] = + src[i * three * head_number * head_size + + j * head_number * head_size + n * head_size + m]; + } } } } @@ -309,16 +324,61 @@ class MultiheadMatMulOpConverter : public OpConverter { transpose_bias_v2( bias_data_tmp.data(), bias_data, head_number, head_size); - nvinfer1::ILayer* fc_layer = nullptr; float dp_probs = 1.0 / 127.0; - if (op_desc.HasAttr("Input_scale")) { - nvinfer1::DimsHW nv_ksize(1, 1); - fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, Convolution, *new_input, n, nv_ksize, weight, bias); - } else { - fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *new_input, n, weight, bias); - } + + nvinfer1::Dims trt_dims_weight; + trt_dims_weight.nbDims = 2; + trt_dims_weight.d[0] = m; + trt_dims_weight.d[1] = n; + auto* weight_tensor = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_weight, weight) + ->getOutput(0); + + bool transpose_x = false; + bool transpose_y = false; + + nvinfer1::MatrixOperation matrix_operation_x = + transpose_x ? nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + nvinfer1::MatrixOperation matrix_operation_y = + transpose_y ? 
nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + + auto* matrix_layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input, + matrix_operation_x, + *weight_tensor, + matrix_operation_y); + + nvinfer1::Dims trt_dims_bias; + trt_dims_bias.nbDims = 2; + trt_dims_bias.d[0] = 1; + trt_dims_bias.d[1] = n; + auto* bias_tensor = + TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_bias, bias) + ->getOutput(0); + auto* add_layer = + TRT_ENGINE_ADD_LAYER(engine_, + ElementWise, + *matrix_layer->getOutput(0), + *bias_tensor, + nvinfer1::ElementWiseOperation::kSUM); + auto* reshape_before_multihead_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *add_layer->getOutput(0)); + + std::vector reshape_tensor; + reshape_tensor.push_back( + GetEleTensorOfShape(Shape(matrix_layer->getOutput(0)), 0)); + reshape_tensor.push_back( + GetEleTensorOfShape(Shape(matrix_layer->getOutput(0)), 1)); + reshape_tensor.push_back(Add1DConstantLayer(1)); + reshape_tensor.push_back(Add1DConstantLayer(1)); + + reshape_before_multihead_layer->setInput(1, *Concat(reshape_tensor)); + reshape_before_multihead_layer->setName( + ("reshape_before_multihead_mamul(Output: " + output_name + ")") + .c_str()); if (op_desc.HasAttr("fc_out_threshold")) { PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"), @@ -328,12 +388,19 @@ class MultiheadMatMulOpConverter : public OpConverter { "in int8 mode")); float out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); - engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + + engine_->SetTensorDynamicRange(matrix_layer->getOutput(0), + out_scale); + engine_->SetTensorDynamicRange(add_layer->getOutput(0), out_scale); + engine_->SetTensorDynamicRange( + reshape_before_multihead_layer->getOutput(0), out_scale); + if (qkv2context_plugin_int8) { dp_probs = PADDLE_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0; } } + auto creator = GetPluginRegistry()->getPluginCreator( "CustomQKVToContextPluginDynamic", "2"); assert(creator != nullptr); @@ -375,7 +442,8 @@ class MultiheadMatMulOpConverter : public OpConverter { free(plugin_collection); std::vector plugin_inputs; - plugin_inputs.emplace_back(fc_layer->getOutput(0)); + plugin_inputs.emplace_back( + reshape_before_multihead_layer->getOutput(0)); plugin_inputs.emplace_back(mask_tensor); plugin_inputs.emplace_back(pos_id_tensor); plugin_inputs.emplace_back( @@ -389,7 +457,8 @@ class MultiheadMatMulOpConverter : public OpConverter { if (!flag_varseqlen) { std::vector output_transformer; output_transformer.emplace_back(plugin_layer->getOutput(0)); - output_transformer.emplace_back(input); + output_transformer.emplace_back( + engine_->GetITensor(op_desc.Input("Input").front())); output_transformer.emplace_back(pos_id_tensor); plugin::TransformerOutputConvertPlugin* plugin = new plugin::TransformerOutputConvertPlugin(); @@ -401,9 +470,23 @@ class MultiheadMatMulOpConverter : public OpConverter { transformer_output_layer->getOutput(0)); } else { engine_->SetITensor(output_name, plugin_layer->getOutput(0)); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), + out_scale); + } } } } else { + auto tranpose_weight = [](const float* src, float* dst, int m, int n) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + dst[j * m + i] = src[i * n + j]; + } + } + }; + tranpose_weight(weight_data_tmp.data(), weight_data, m, n); if (input_dims.d[1] <= 384 && 
!bias_qk_attr && engine_->precision() != AnalysisConfig::Precision::kFloat32 && platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >= diff --git a/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc b/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc index 59cad9039a5d69..0627be308e9b06 100644 --- a/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/one_hot_op.cc @@ -56,6 +56,8 @@ class OneHotOpConverter : public OpConverter { if (dtype == 6) { // int64 VLOG(3) << "trt not support float64, so it is converted to float32."; } + } else { + PADDLE_THROW(platform::errors::Fatal("one_hot is not supported")); } auto depth_name = op_desc.Input("depth_tensor"); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index e2dfe4d5ba304c..ee8cc0c8681c0c 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -59,19 +59,6 @@ class OpConverter { auto op_converter_type_map = OpTeller::Global().GetOpConverterTypeMap(); switch (op_converter_type_map.at(op_desc.Type())) { case OpConverterType::Default: - if (op_desc.Type() == "mul") { - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), - 1UL, - platform::errors::InvalidArgument( - "The input op mul's Input(\"Y\")." - "size() should equal to 1, but reveceid " - "Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); - std::string Y = op_desc.Input("Y")[0]; - if (parameters.count(Y)) { - it = Registry::Global().Lookup("fc"); - } - } if (op_desc.Type().find("elementwise") != std::string::npos) { static std::unordered_set add_tensor_op_set{ "add", "mul", "sub", "div", "max", "min", "pow", "mod"}; diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 681f5798c1da09..d70e380b170ea4 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -31,6 +31,7 @@ class SkipLayerNormOpConverter : public OpConverter { platform::errors::InvalidArgument( "Skip_layernorm must run the dynamic shape mode.")); framework::OpDesc op_desc(op, nullptr); + auto output_name = op_desc.Output("Out")[0]; auto GetWeight = [&](const std::string& arg_name) -> TensorRTEngine::Weight { std::string var_name = op_desc.Input(arg_name).front(); @@ -42,15 +43,72 @@ class SkipLayerNormOpConverter : public OpConverter { // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + + bool enable_int8 = + (engine_->precision() == AnalysisConfig::Precision::kInt8); + float x_scale = 0; + float y_scale = 0; + + if (enable_int8) { + if (op_desc.HasAttr("X")) { + x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X")); + engine_->SetTensorDynamicRange(input1, x_scale); + } + if (op_desc.HasAttr("Y")) { + y_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Y")); + engine_->SetTensorDynamicRange(input2, y_scale); + } + } + + nvinfer1::Dims dims_x = input1->getDimensions(); + int32_t x_rank = dims_x.nbDims; + nvinfer1::Dims dims_y = input2->getDimensions(); + int32_t y_rank = dims_y.nbDims; + + if ((x_rank == 2 && y_rank == 4) || (y_rank == 2 && x_rank == 4)) { + if (x_rank == 2 && y_rank == 4) { + auto* reshape_before_skiplayn = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1); + std::vector reshape_before_tensor; + 
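          // (The concatenated shape built below is [d0, d1, 1, 1]: a rank-2
          //  activation such as [B * S, H] gains two trailing unit dims so
          //  both skip_layernorm operands reach rank 4 before the plugin
          //  consumes them.)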
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 0)); + reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 1)); + reshape_before_tensor.push_back(Add1DConstantLayer(1)); + reshape_before_tensor.push_back(Add1DConstantLayer(1)); + reshape_before_skiplayn->setInput(1, *Concat(reshape_before_tensor)); + reshape_before_skiplayn->setName( + ("reshape_before_skiplayn(Output: " + output_name + ")").c_str()); + input1 = reshape_before_skiplayn->getOutput(0); + + if (enable_int8) { + if (op_desc.HasAttr("X")) { + engine_->SetTensorDynamicRange(input1, x_scale); + } + } + } else { + auto* reshape_before_skiplayn = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2); + std::vector reshape_before_tensor; + reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input2), 0)); + reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input2), 1)); + reshape_before_tensor.push_back(Add1DConstantLayer(1)); + reshape_before_tensor.push_back(Add1DConstantLayer(1)); + reshape_before_skiplayn->setInput(1, *Concat(reshape_before_tensor)); + reshape_before_skiplayn->setName( + ("reshape_before_skiplayn(Output: " + output_name + ")").c_str()); + input2 = reshape_before_skiplayn->getOutput(0); + + if (enable_int8) { + if (op_desc.HasAttr("Y")) { + engine_->SetTensorDynamicRange(input2, y_scale); + } + } + } + } + std::vector inputs; inputs.push_back(input1); inputs.push_back(input2); - bool enable_int8 = false; - if (op_desc.HasAttr("enable_int8")) { - enable_int8 = PADDLE_GET_CONST(bool, op_desc.GetAttr("enable_int8")); - } - std::vector smooth_scale; bool use_smooth = false; if (op_desc.HasAttr("smooth_scale")) { @@ -199,7 +257,6 @@ class SkipLayerNormOpConverter : public OpConverter { layer = plugin_layer; } } - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index cabad0bd2df311..4710966030d676 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -157,6 +157,13 @@ void TensorRTEngine::FreezeNetwork() { #else infer_builder_config_->setMaxWorkspaceSize(max_workspace_); #endif + +#if IS_TRT_VERSION_GE(8500) + infer_builder_config_->setPreviewFeature( + nvinfer1::PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805, true); +#else +#endif + bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf); if (enable_fp16) { bool support_fp16 = infer_builder_->platformHasFastFp16(); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index d9605bb18e4508..b17aca9e8cb4da 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -393,62 +393,6 @@ struct SimpleOpTypeSetTeller : public Teller { return false; #endif } - - if (op_type == "matmul_v2") { - if (!with_dynamic_shape) { - return false; - } - auto* block = desc.Block(); - if (block == nullptr) { - VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " - "Developers need to check whether block_desc is passed in " - "the pass."; - return false; - } - return true; - } - - if (op_type == "matmul") { - auto* block = desc.Block(); - if (block == nullptr) { - VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" - "Developers need to check whether block_desc is passed in " - "the pass."; - return false; - } - - // not support broadcast - auto* x_var_desc = block->FindVar(desc.Input("X")[0]); - auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); - const auto x_shape = x_var_desc->GetShape(); - const auto y_shape = y_var_desc->GetShape(); - if (x_shape.size() != y_shape.size()) { - VLOG(3) - << "matmul op not support broadcast, please check inputs'shape. "; - return false; - } - uint64_t dims = 2; - for (size_t i = 0; i < x_shape.size() - dims; ++i) { - if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { - VLOG(3) << "matmul op not support broadcast, please check " - "inputs'shape[i]. "; - return false; - } - } - - for (auto& param_name : desc.Inputs()) { - for (auto& var_name : param_name.second) { - auto* var_desc = block->FindVar(var_name); - const auto shape = var_desc->GetShape(); - if (shape.size() < 3) { - VLOG(3) - << "matmul op dims < 3 not supported in tensorrt, but got dims " - << shape.size() << ", so jump it."; - return false; - } - } - } - } if (op_type == "softmax") { auto* block = desc.Block(); if (block == nullptr) { @@ -2158,63 +2102,6 @@ struct SimpleOpTypeSetTeller : public Teller { } } - if (op_type == "fc") { - auto* block = desc.Block(); - if (block == nullptr) { - VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " - "Developers need to check whether block_desc is passed in " - "the pass."; - return false; - } - - // y'shapes == 2 - auto fc_inputs = desc.Inputs(); - std::string fc_y = ""; - if (fc_inputs.find("Y") != fc_inputs.end()) { - fc_y = "Y"; - } else if (fc_inputs.find("W") != fc_inputs.end()) { - fc_y = "W"; - } else { - VLOG(3) << " input_y(fc_op) must be Y or W "; - return false; - } - - // There is currently no input: Y(weight) more than two dimensions - /* - auto* y_var_desc = block->FindVar(desc.Input(fc_y)[0]); - const auto y_shape = y_var_desc->GetShape(); - if (y_shape.size() != 2) { - VLOG(3) - << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = - " - << y_shape.size(); - return false; - } - // y_num_col_dims ==1 - if (desc.HasAttr("y_num_col_dims")) { - int y_num_col_dims = - PADDLE_GET_CONST(int, desc.GetAttr("y_num_col_dims")); - if (y_num_col_dims != 1) { - VLOG(3) << " fc_op'y_num_col_dims must be 1, but y_num_col_dims = " - << y_num_col_dims; - return false; - } - } - */ - int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? PADDLE_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? PADDLE_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims < 1) { - VLOG(3) << "fc_op expects x_num_col_dims >= 1, " - "but x_num_col_dims = " - << x_num_col_dims; - return false; - } - } - if (op_type == "reshape" || op_type == "reshape2") { if (!desc.HasAttr("shape")) { return false; @@ -2798,9 +2685,7 @@ struct SimpleOpTypeSetTeller : public Teller { private: // use this set for no calib int8. 
std::unordered_set int8_teller_set{ - "mul", - "matmul", - "matmul_v2", + "matrix_multiply", "bmm", "range", "conv2d", @@ -2869,7 +2754,6 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_transpose", "depthwise_conv2d_transpose", "leaky_relu", - "fc", "shuffle_channel", "where", "bitwise_not", @@ -2958,9 +2842,7 @@ struct SimpleOpTypeSetTeller : public Teller { "cumsum"}; std::unordered_set teller_set{ - "mul", - "matmul", - "matmul_v2", + "matrix_multiply", "bmm", "range", "conv2d", @@ -3029,7 +2911,6 @@ struct SimpleOpTypeSetTeller : public Teller { "conv2d_transpose", "depthwise_conv2d_transpose", "leaky_relu", - "fc", "shuffle_channel", "where", "bitwise_not", diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index af243e1f8df7f5..712e130825a691 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -72,31 +72,39 @@ void DynamicShapeTest(bool allow_build_at_runtime) { LOG(INFO) << "create block desc"; framework::BlockDesc block_desc(&program, block_); - LOG(INFO) << "create fc op"; - auto* fc0 = block_desc.AppendOp(); - fc0->SetType("fc"); - fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 - fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 - fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 - - LOG(INFO) << "create fc op"; - auto* fc1 = block_desc.AppendOp(); - fc1->SetType("fc"); - fc1->SetInput("X", std::vector({"z"})); - fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 - fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 + LOG(INFO) << "create elementwise_add op"; + auto* elementwise_add0 = block_desc.AppendOp(); + elementwise_add0->SetType("elementwise_add"); + elementwise_add0->SetInput("X", + std::vector({"x"})); // 2 x 4 x 4 x 4 + elementwise_add0->SetInput("Y", + std::vector({"y"})); // 1 x 4 x 1 x 1 + elementwise_add0->SetOutput( + "Out", std::vector({"z"})); // 2 x 4 x 4 x 4 + elementwise_add0->SetAttr("axis", static_cast(0)); + + LOG(INFO) << "create elementwise_add op"; + auto* elementwise_add1 = block_desc.AppendOp(); + elementwise_add1->SetType("elementwise_add"); + elementwise_add1->SetInput("X", + std::vector({"z"})); // 2 x 4 x 4 x 4 + elementwise_add1->SetInput( + "Y", std::vector({"y0"})); // 1 x 4 x 4 x 4 + elementwise_add1->SetOutput( + "Out", std::vector({"z0"})); // 2 x 4 x 4 x 4 + elementwise_add1->SetAttr("axis", static_cast(0)); // Set inputs' variable shape in BlockDesc - // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} - AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); - AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); - AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); - AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); - AddTensorToBlockDesc(block_, "z0", std::vector({8, 1, 1})); + // the batch size is 2, so the dims of 'x' is {2, 4} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 4, 4})); + AddTensorToBlockDesc(block_, "y", std::vector({1, 4, 1, 1})); + AddTensorToBlockDesc(block_, "y0", std::vector({1, 4, 4, 4})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 4, 4, 4})); + AddTensorToBlockDesc(block_, "z0", std::vector({2, 4, 4, 4})); // It is wired, need to copy manually. 
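  // (AppendOp only attaches the new op to the BlockDesc wrapper; the raw
  //  proto that tensorrt_engine_op later parses still needs an explicit
  //  copy of each op, which is what the add_ops() assignments below do.)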
- *block_->add_ops() = *fc0->Proto(); - *block_->add_ops() = *fc1->Proto(); + *block_->add_ops() = *elementwise_add0->Proto(); + *block_->add_ops() = *elementwise_add1->Proto(); ASSERT_EQ(block_->ops_size(), 2); @@ -132,9 +140,9 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("use_static_engine", true); engine_op_desc.SetAttr("dynamic_shape_names", std::vector{"x"}); engine_op_desc.SetAttr("dynamic_shape_lens", std::vector{4}); - engine_op_desc.SetAttr("min_input_shape", std::vector{1, 4, 1, 1}); - engine_op_desc.SetAttr("max_input_shape", std::vector{2, 4, 1, 1}); - engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 1, 1}); + engine_op_desc.SetAttr("min_input_shape", std::vector{1, 1, 1, 1}); + engine_op_desc.SetAttr("max_input_shape", std::vector{16, 16, 16, 16}); + engine_op_desc.SetAttr("opt_input_shape", std::vector{2, 4, 4, 4}); engine_op_desc.SetAttr("model_precision", static_cast(phi::DataType::FLOAT32)); @@ -151,26 +159,22 @@ void DynamicShapeTest(bool allow_build_at_runtime) { ctx.PartialInitWithAllocator(); // Prepare variables. if (allow_build_at_runtime) - CreateCUDATensor(&scope, "x", std::vector({3, 4, 1, 1})); + CreateCUDATensor(&scope, "x", std::vector({32, 4, 4, 4})); else - CreateCUDATensor(&scope, "x", std::vector({2, 4, 1, 1})); - CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "x", std::vector({2, 4, 4, 4})); + CreateCUDATensor(&scope, "y", std::vector({1, 4, 1, 1})); - CreateCUDATensor(&scope, "y0", std::vector({6, 8})); - CreateCUDATensor(&scope, "z0", std::vector({2, 8})); + CreateCUDATensor(&scope, "y0", std::vector({1, 4, 4, 4})); + CreateCUDATensor(&scope, "z0", std::vector({2, 4, 4, 4})); // Execute them. LOG(INFO) << "engine_op run"; inference::tensorrt::OpTeller::Global().SetOpConverterType( - "fc", inference::tensorrt::OpConverterType::Default); + "elementwise_add", inference::tensorrt::OpConverterType::Default); engine_op->Run(scope, place); } -TEST(TensorRTEngineOp, manual) { - DynamicShapeTest(false); - DynamicShapeTest(true); -} - +TEST(TensorRTEngineOp, manual) { DynamicShapeTest(false); } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { framework::ProgramDesc program; framework::Scope scope; @@ -197,12 +201,12 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { const shape_t& x_shape, const shape_t& y_shape, const shape_t& z_shape) { - LOG(INFO) << "create fc op"; - auto* fc = block_desc.AppendOp(); - fc->SetType("mul"); - fc->SetInput("X", std::vector({x_name})); - fc->SetInput("Y", std::vector({y_name})); - fc->SetOutput("Out", std::vector({z_name})); + LOG(INFO) << "create matrix_multiply op"; + auto* matrix_multiply = block_desc.AppendOp(); + matrix_multiply->SetType("matrix_multiply"); + matrix_multiply->SetInput("X", std::vector({x_name})); + matrix_multiply->SetInput("Y", std::vector({y_name})); + matrix_multiply->SetOutput("Out", std::vector({z_name})); // Set inputs' variable shape in BlockDesc if (!x_created) { @@ -222,7 +226,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { CreateCUDATensor(&scope, z_name, std::vector(z_shape)); // It is wired, need to copy manually. - *block_->add_ops() = *fc->Proto(); + *block_->add_ops() = *matrix_multiply->Proto(); }; // Test with 4 layer FC @@ -293,9 +297,9 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { } // Test with a larger FC layer. 
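// (If re-enabled, Execute(40, 28, 28) would exercise the same chain as the
//  4-layer test above, with each layer a [40, 28] x [28, 28]
//  matrix_multiply.)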
-// TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); } +// TEST(TensorRTEngineOp, matrix_multiply) { Execute(40, 28, 28); } } // namespace operators } // namespace paddle -USE_TRT_CONVERTER(fc) +USE_TRT_CONVERTER(elementwise_add_weight) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index fdcf83f6afa46e..b3bb181ea4a812 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -236,8 +236,6 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240) set_tests_properties(test_preln_layernorm_x_fuse_pass PROPERTIES TIMEOUT 240) - set_tests_properties(test_trt_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT - 240) set_tests_properties(test_shuffle_channel_detect_pass PROPERTIES TIMEOUT 120) if(WIN32) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py index 038535c91553e6..237faff87149e7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py @@ -19,8 +19,6 @@ from auto_scan_test import IgnoreReasons, PassAutoScanTest from program_config import OpConfig, ProgramConfig, TensorConfig -import paddle.inference as paddle_infer - class TestFcFusePass(PassAutoScanTest): r""" @@ -45,14 +43,6 @@ def sample_predictor_configs(self, program_config): # trt static_shape config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - max_batch_size=8, - workspace_size=102400, - min_subgraph_size=0, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False, - ) yield config, ['fc'], (1e-5, 1e-5) def add_ignore_pass_case(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_multihead_matmul_roformer_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_multihead_matmul_roformer_fuse_pass.py index a39d0047985077..506141ed92c1d4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_multihead_matmul_roformer_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_multihead_matmul_roformer_fuse_pass.py @@ -54,7 +54,10 @@ def sample_predictor_configs(self, program_config): "sin_input": [1, 12, 128, 64], }, ) - yield config, ["multihead_matmul_roformer", "matmul"], (1e-2, 1e-3) + yield config, ["multihead_matmul_roformer", "matrix_multiply"], ( + 1e-2, + 1e-3, + ) def sample_program_config(self, draw): def generate_mul_input(): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py index 1d9b2268b4ab0c..f3dae2b3160d71 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py @@ -19,7 +19,7 @@ import numpy as np from program_config import ProgramConfig, TensorConfig -from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from trt_layer_auto_scan_test import SkipReasons, TrtLayerAutoScanTest import paddle.inference as paddle_infer @@ -91,17 +91,14 @@ def generate_dynamic_shape(attrs): ] # The output has little diff between gpu and trt in CI-Windows-Inference - tol_fp32 = 1e-5 - tol_half = 1e-5 - if os.name == 'nt': - tol_fp32 = 
1e-3 - tol_half = 1e-3 + tol_fp32 = 1e-3 + tol_half = 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), tol_fp32 + yield self.create_inference_config(), (1, 3), (tol_fp32, tol_fp32) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), tol_half + yield self.create_inference_config(), (1, 3), (tol_half, tol_half) def add_skip_trt_case(self): pass @@ -185,9 +182,9 @@ def generate_dynamic_shape(attrs): # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), tol_fp32 + yield self.create_inference_config(), (1, 3), (tol_fp32, tol_fp32) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), tol_half + yield self.create_inference_config(), (1, 3), (tol_half, tol_half) def add_skip_trt_case(self): pass @@ -319,7 +316,20 @@ def generate_dynamic_shape(): yield self.create_inference_config(), (1, 3), 1e-3 def add_skip_trt_case(self): - pass + def teller1(program_config, predictor_config): + inputs = program_config.inputs + if ( + len(inputs['input1_data'].shape) == 1 + and len(inputs['input2_data'].shape) == 1 + ): + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_IMPLEMENTED, + "If both tensors are one-dimensional, the dot product result is obtained(Out.rank = 0)", + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index 1e788ac4b941d9..dd64efa70311bd 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -18,7 +18,7 @@ import numpy as np from program_config import ProgramConfig, TensorConfig -from trt_layer_auto_scan_test import SkipReasons, TrtLayerAutoScanTest +from trt_layer_auto_scan_test import TrtLayerAutoScanTest import paddle.inference as paddle_infer @@ -29,18 +29,18 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(batch, dim1): - return np.random.random((batch, dim1, 768)).astype(np.float32) + return np.full((batch, dim1, 768), 1).astype(np.float32) def generate_input2(shape): - return np.random.random(shape).astype(np.float32) + return np.full(shape, 1).astype(np.float32) def generate_weight1(): - return np.random.random((768, 768)).astype(np.float32) + return np.full((768, 768), 0.1).astype(np.float32) def generate_weight2(): - return np.random.random(768).astype(np.float32) + return np.full((768), 0.1).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: @@ -371,80 +371,33 @@ def clear_dynamic_shape(): program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - # for static_shape - clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - self.trt_param.workspace_size = 2013265920 - yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), (1e-3, 1e-3) - # for dynamic_shape generate_dynamic_shape(attrs) 
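        # (The third yielded element is now a pair of tolerances rather than
        #  a single scalar, matching the tolerance tuples used by the other
        #  auto-scan TRT tests touched in this patch.)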
self.trt_param.precision = paddle_infer.PrecisionType.Float32 self.trt_param.workspace_size = 2013265920 yield self.create_inference_config(), (1, 3), (1e-5, 1e-4) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) - - def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Half: - return True - return False - - self.add_skip_case( - teller1, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in fp16 mode.", - ) - - def teller2(program_config, predictor_config): - if ( - self.trt_param.precision == paddle_infer.PrecisionType.Float32 - and len(self.dynamic_shape.min_input_shape) != 0 - and self.batch > 2 - ): - return True - return False - - self.add_skip_case( - teller2, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2.", - ) - - def teller3(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Int8: - return True - return False - - self.add_skip_case( - teller3, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in int8 mode.", - ) + yield self.create_inference_config(), (1, 3), (1e-3, 1e-2) def test(self): - self.add_skip_trt_case() self.run_test() class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest): def sample_program_configs(self): def generate_input1(batch, dim1): - return np.random.random((batch, dim1, 768)).astype(np.float32) + return np.full((batch, dim1, 768), 1).astype(np.float32) def generate_input2(shape): - return np.random.random(shape).astype(np.float32) + return np.full(shape, 1).astype(np.float32) def generate_weight1(): - return np.random.random((768, 768)).astype(np.float32) + return np.full((768, 768), 0.1).astype(np.float32) def generate_weight2(): - return np.random.random(768).astype(np.float32) + return np.full((768), 0.1).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [4]: self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: @@ -776,15 +729,15 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(batch, length): - return np.zeros((batch, length, 768), dtype=np.float32) + return np.full((batch, length, 768), 0.1).astype(np.float32) def generate_weight1(): - return np.random.rand(768, 2304).astype(np.float32) + return np.full((768, 2304), 0.1).astype(np.float32) def generate_weight2(): - return np.random.rand(2304).astype(np.float32) + return np.full((2304), 0.1).astype(np.float32) - for batch in [2, 4]: + for batch in [4]: self.batch = batch for length in [197]: self.length = length @@ -989,17 +942,6 @@ def generate_dynamic_shape(attrs): "input_data1": [1, 197, 768], } - def generate_static_shape(attrs): - self.dynamic_shape.min_input_shape = { - "input_data1": [1, 197, 768], - } - self.dynamic_shape.max_input_shape = { - "input_data1": [16, 197, 768], - } - self.dynamic_shape.opt_input_shape = { - "input_data1": [1, 197, 768], - } - def clear_dynamic_shape(): self.dynamic_shape.max_input_shape = {} self.dynamic_shape.min_input_shape = {} @@ -1026,7 +968,7 @@ def generate_trt_nodes_num(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num(), ( 1e-3, - 1e-3, + 2e-2, ) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield 
self.create_inference_config(), generate_trt_nodes_num(), ( @@ -1034,35 +976,7 @@ def generate_trt_nodes_num(): 1e-5, ) - # for static_shape - clear_dynamic_shape() - generate_static_shape(attrs) - self.trt_param.workspace_size = 2013265920 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(), ( - 1e-3, - 1e-3, - ) - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num(), ( - 1e-5, - 1e-5, - ) - - def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Half: - return True - return False - - self.add_skip_case( - teller1, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in fp16 mode.", - ) - def test(self): - self.add_skip_trt_case() self.run_test() @@ -1072,19 +986,19 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(batch, dim1): - return np.random.random((batch, dim1, 768)).astype(np.float32) + return np.full((batch, dim1, 768), 1).astype(np.float32) def generate_input2(shape): - return np.random.random(shape).astype(np.float32) + return np.full(shape, 1).astype(np.float32) def generate_weight1(): - return np.random.random((768, 768)).astype(np.float32) + return np.full((768, 768), 0.1).astype(np.float32) def generate_weight2(): - return np.random.random(768).astype(np.float32) + return np.full((768), 0.1).astype(np.float32) def generate_weight3(): - return np.random.random((768, 768)).astype(np.float32) + return np.full((768, 768), 0.1).astype(np.float32) for batch in [2]: self.batch = batch @@ -1423,48 +1337,9 @@ def clear_dynamic_shape(): self.trt_param.workspace_size = 2013265920 yield self.create_inference_config(), (1, 3), (1e-5, 1e-4) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) - - def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Half: - return True - return False - - self.add_skip_case( - teller1, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in fp16 mode.", - ) - - def teller2(program_config, predictor_config): - if ( - self.trt_param.precision == paddle_infer.PrecisionType.Float32 - and len(self.dynamic_shape.min_input_shape) != 0 - and self.batch > 2 - ): - return True - return False - - self.add_skip_case( - teller2, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2.", - ) - - def teller3(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Int8: - return True - return False - - self.add_skip_case( - teller3, - SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in int8 mode.", - ) + yield self.create_inference_config(), (1, 3), (1e-3, 1e-2) def test(self): - self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py index 1a932746035387..b89ffae51bbe55 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py @@ -50,7 +50,7 @@ def test_check_output(self): if 
core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest): @@ -78,7 +78,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest): @@ -106,7 +106,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims2Test(InferencePassTest): @@ -140,7 +140,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest): @@ -174,7 +174,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest): @@ -208,7 +208,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest): @@ -244,7 +244,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest): @@ -280,7 +280,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest): @@ -316,7 +316,7 @@ def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu.append(True) for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py deleted file mode 100644 index 6d62c6b8ab343c..00000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_flatten2_matmul_fuse_pass.py +++ /dev/null @@ -1,148 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import hypothesis.strategies as st -from auto_scan_test import IgnoreReasons, PassAutoScanTest -from program_config import OpConfig, ProgramConfig, TensorConfig - -import paddle.inference as paddle_infer - - -class TestFlatten2MatmulFusePass(PassAutoScanTest): - r""" - x_var - | - flatten2 - \ - flatten2_out_var y_var - \ / - matmul bias_var - \ / - elementwise_add - """ - - def sample_predictor_configs(self, program_config): - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - max_batch_size=10, - workspace_size=102400, - min_subgraph_size=0, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False, - ) - yield config, ['mul', 'elementwise_add'], (1e-4, 1e-1) - - def add_ignore_pass_case(self): - # Here we put some skip rules to avoid known bugs - def teller1(program_config, predictor_config): - y_shape = list(program_config.weights["matmul_y"].shape) - bias_shape = program_config.weights["bias"].shape - axis = program_config.ops[2].attrs["axis"] - # bias should be [mul_y_shape[-1]] - if axis == 0 or bias_shape[0] != y_shape[1] or len(bias_shape) != 1: - return True - return False - - self.add_ignore_check_case( - teller1, - IgnoreReasons.PASS_ACCURACY_ERROR, - "The pass error on TRT while shape of bias is not [out_size].", - ) - - def sample_program_config(self, draw): - # 1. Generate shape and attr of flatten2 - x_shape = draw( - st.lists( - st.integers(min_value=1, max_value=10), min_size=4, max_size=4 - ) - ) - # [a, b, c, d] => [a, b*c*d] - flatten_axis = 1 - flatten_shape = [x_shape[0], x_shape[1] * x_shape[2] * x_shape[3]] - - # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul - alpha = 1.0 - transpose_X = False - transpose_Y = False - - # 3. Generate legal shape of input:Y of matmul - y_shape = draw( - st.lists( - st.integers(min_value=1, max_value=8), min_size=2, max_size=2 - ) - ) - y_shape[0] = flatten_shape[1] - - # 4. 
Generate legal attr:axis of elementwise_add - axis = draw(st.integers(min_value=-1, max_value=1)) - if axis == 0: - axis = -1 - bias_shape = [ - y_shape[1], - ] - - flatten2_op = OpConfig( - "flatten2", - inputs={ - "X": ["flatten2_x"], - }, - axis=flatten_axis, - outputs={"Out": ["flatten2_out"], "XShape": ["xshape"]}, - ) - matmul_op = OpConfig( - "matmul", - inputs={"X": ["flatten2_out"], "Y": ["matmul_y"]}, - outputs={"Out": ["matmul_out"]}, - alpha=alpha, - transpose_X=transpose_X, - transpose_Y=transpose_Y, - ) - - add_op = OpConfig( - "elementwise_add", - inputs={"X": ["matmul_out"], "Y": ["bias"]}, - outputs={"Out": ["add_out"]}, - axis=axis, - ) - - ops = [flatten2_op, matmul_op, add_op] - - program_config = ProgramConfig( - ops=ops, - weights={ - "matmul_y": TensorConfig(shape=y_shape), - "bias": TensorConfig(shape=bias_shape), - }, - inputs={ - "flatten2_x": TensorConfig(shape=x_shape), - }, - outputs=ops[-1].outputs["Out"], - ) - - return program_config - - def test(self): - self.run_and_statis( - quant=False, - max_examples=25, - passes=["trt_flatten2_matmul_fuse_pass"], - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py index e1d30929b9d0ae..7c37837c40646b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul_quant_dequant.py @@ -79,6 +79,14 @@ def network(): self.trt_parameters = TensorRTMatMulQuantDequantDims3Test.TensorRTParam( 1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False ) + self.dynamic_shape_params = ( + TensorRTMatMulQuantDequantDims3Test.DynamicShapeParam( + {'data': [1, 28, 28]}, + {'data': [4, 28, 28]}, + {'data': [3, 28, 28]}, + False, + ) + ) self.activation_quantize_type = 'moving_average_abs_max' self.weight_quantize_type = 'channel_wise_abs_max' @@ -137,7 +145,7 @@ def network(): self.label = paddle.static.data( name='label', shape=[1, 1], dtype='int64' ) - reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14]) + reshape_out = paddle.reshape(self.data, shape=[0, 4, 14, 14]) matmul_out = paddle.matmul( x=reshape_out, y=reshape_out, @@ -183,6 +191,14 @@ def network(): self.trt_parameters = TensorRTMatMulQuantDequantDims4Test.TensorRTParam( 1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False ) + self.dynamic_shape_params = ( + TensorRTMatMulQuantDequantDims4Test.DynamicShapeParam( + {'data': [1, 28, 28]}, + {'data': [4, 28, 28]}, + {'data': [3, 28, 28]}, + False, + ) + ) self.activation_quantize_type = 'moving_average_abs_max' self.weight_quantize_type = 'channel_wise_abs_max' From d9c3abe674357ef6ea78fbd68c72c228d59610c4 Mon Sep 17 00:00:00 2001 From: Yuanle Liu Date: Thu, 13 Apr 2023 23:42:07 +0800 Subject: [PATCH 149/156] remove need cpp14 support (#52867) --- paddle/fluid/inference/api/demo_ci/CMakeLists.txt | 2 +- paddle/phi/common/complex.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 11f214bc45d535..fc23caee656380 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -83,7 +83,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index c52d941f2e238c..a4e003dd544ad6 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -105,16 +105,16 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { template HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { + const typename std::enable_if::value, + complex>::type& val) { real = val.real; imag = val.imag; } template HOSTDEVICE explicit complex( - const std::enable_if_t::value, complex>& - val) { + const typename std::enable_if::value, + complex>::type& val) { real = val.real; imag = val.imag; } From 3fed97f47a3141bab9f323bb76a8eee2f374e457 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:25:49 +0800 Subject: [PATCH 150/156] add ci reviewer for inference size (#52159) --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a6477a62edeb6e..72de9f5e07b29a 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1133,12 +1133,12 @@ function check_approvals_of_unittest() { EOF if [ $(awk "BEGIN{print 20<$AllDiffSize}") -eq 1 ] ; then approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 39303645 328693` + APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 39303645 7845005 26377421` echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then echo "==========================================================================================" echo "This PR make the release inference library size growth exceeds 20 M." 
- echo "Then you must have one RD (Shixiaowei02 (Recommend) or Superjomn) approval for this PR\n" + echo "Then you must have one RD (jiweibo (Recommend), qingqing01 or Shixiaowei02) approval for this PR\n" echo "==========================================================================================" exit 6 fi From aac8da90bdf0129332fd9b2f3eba5c4ff579fdd1 Mon Sep 17 00:00:00 2001 From: risemeup1 <62429225+risemeup1@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:26:42 +0800 Subject: [PATCH 151/156] Fix test full name usage (#52790) * test * fix test error * fix test error * fix test error --- test/dygraph_to_static/test_full_name_usage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/dygraph_to_static/test_full_name_usage.py b/test/dygraph_to_static/test_full_name_usage.py index 483d9a14f6f455..e95b2616977d7a 100644 --- a/test/dygraph_to_static/test_full_name_usage.py +++ b/test/dygraph_to_static/test_full_name_usage.py @@ -72,9 +72,9 @@ def test_run_success(self): np.testing.assert_allclose( decorated_call_decorated(x).numpy(), answer, rtol=1e-05 ) - with self.assertRaises(NotImplementedError): + with self.assertRaises((NotImplementedError, TypeError)): DoubleDecorated().double_decorated_func1(x) - with self.assertRaises(NotImplementedError): + with self.assertRaises((NotImplementedError, TypeError)): DoubleDecorated().double_decorated_func2(x) From e93e8a3f1fbac5e9f68e9d6419df735f10d36817 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:29:31 +0800 Subject: [PATCH 152/156] update (#52878) --- .../operators/amp/get_float_status_op.cc | 5 +- .../operators/collective/global_gather_op.cc | 15 +- .../collective/global_gather_op.cu.cc | 17 +- .../operators/collective/global_gather_op.h | 2 +- .../operators/collective/global_scatter_op.cc | 15 +- .../collective/global_scatter_op.cu.cc | 17 +- .../operators/collective/global_scatter_op.h | 2 +- .../detection/generate_mask_labels_op.cc | 10 +- .../detection/generate_proposal_labels_op.cc | 11 +- .../gaussian_random_batch_size_like_op.cc | 13 +- .../gaussian_random_batch_size_like_op.cu | 17 +- .../fluid/operators/graph_khop_sampler_op.cc | 10 +- .../fluid/operators/graph_khop_sampler_op.cu | 11 +- .../fluid/operators/graph_khop_sampler_op.h | 2 +- paddle/fluid/operators/group_norm_op.cc | 2 - paddle/fluid/operators/group_norm_op.cu | 834 ------------------ paddle/fluid/operators/group_norm_op.h | 387 -------- paddle/fluid/operators/l1_norm_op.cc | 15 +- paddle/fluid/operators/l1_norm_op.h | 4 +- 19 files changed, 101 insertions(+), 1288 deletions(-) delete mode 100644 paddle/fluid/operators/group_norm_op.cu delete mode 100644 paddle/fluid/operators/group_norm_op.h diff --git a/paddle/fluid/operators/amp/get_float_status_op.cc b/paddle/fluid/operators/amp/get_float_status_op.cc index d5a924b8d842c3..8700d82976f01c 100644 --- a/paddle/fluid/operators/amp/get_float_status_op.cc +++ b/paddle/fluid/operators/amp/get_float_status_op.cc @@ -53,7 +53,7 @@ class GetFloatStatusMaker : public framework::OpProtoAndCheckerMaker { } }; -template +template class GetFloatStatusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -75,4 +75,5 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(get_float_status, ops::GetFloatStatusKernel); +PD_REGISTER_STRUCT_KERNEL( + get_float_status, CPU, ALL_LAYOUT, ops::GetFloatStatusKernel, 
float) {} diff --git a/paddle/fluid/operators/collective/global_gather_op.cc b/paddle/fluid/operators/collective/global_gather_op.cc index f3380b4498331f..370701dbb8e40c 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cc @@ -111,9 +111,12 @@ REGISTER_OPERATOR(global_gather, ops::GlobalGatherOpGradMaker, ops::GlobalGatherOpGradMaker) -REGISTER_OP_CPU_KERNEL(global_gather, - ops::GlobalGatherOpCPUKernel, - ops::GlobalGatherOpCPUKernel, - ops::GlobalGatherOpCPUKernel, - ops::GlobalGatherOpCPUKernel, - ops::GlobalGatherOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(global_gather, + CPU, + ALL_LAYOUT, + ops::GlobalGatherOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index 83e1a4d4ca778c..07df7717a30bc6 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -261,7 +261,7 @@ struct GlobalGatherProcessGroupFunctor { } }; -template +template class GlobalGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -283,9 +283,12 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(global_gather, - ops::GlobalGatherOpCUDAKernel, - ops::GlobalGatherOpCUDAKernel, - ops::GlobalGatherOpCUDAKernel, - ops::GlobalGatherOpCUDAKernel, - ops::GlobalGatherOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(global_gather, + GPU, + ALL_LAYOUT, + ops::GlobalGatherOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h index 463c5fb594401e..0d3b4ed92e9b23 100644 --- a/paddle/fluid/operators/collective/global_gather_op.h +++ b/paddle/fluid/operators/collective/global_gather_op.h @@ -25,7 +25,7 @@ limitations under the License. 
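The registrations above show the pattern this patch applies across operators: the kernel class gains a trailing DeviceContext template parameter, and a single PD_REGISTER_STRUCT_KERNEL call per backend replaces the per-type REGISTER_OP_*_KERNEL lists. A minimal sketch of the pattern (my_op and MyKernel are placeholder names, not operators touched by this patch):

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T, typename DeviceContext>
class MyKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // kernel body elided
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

// The old style kept one REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL
// entry per instantiated type; the struct-kernel macro takes the backend
// and layout as arguments, lists the types once, and the trailing {} is a
// hook for optional kernel configuration.
PD_REGISTER_STRUCT_KERNEL(
    my_op, CPU, ALL_LAYOUT, ops::MyKernel, float, double) {}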
*/ namespace paddle { namespace operators { -template +template class GlobalGatherOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/collective/global_scatter_op.cc b/paddle/fluid/operators/collective/global_scatter_op.cc index d4469c5eadbbd8..e29ca9ab371e0c 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cc @@ -115,9 +115,12 @@ REGISTER_OPERATOR(global_scatter, ops::GlobalScatterOpGradMaker, ops::GlobalScatterOpGradMaker) -REGISTER_OP_CPU_KERNEL(global_scatter, - ops::GlobalScatterOpCPUKernel, - ops::GlobalScatterOpCPUKernel, - ops::GlobalScatterOpCPUKernel, - ops::GlobalScatterOpCPUKernel, - ops::GlobalScatterOpCPUKernel); +PD_REGISTER_STRUCT_KERNEL(global_scatter, + CPU, + ALL_LAYOUT, + ops::GlobalScatterOpCPUKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 017398413b372b..3136ac21ab764b 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -259,7 +259,7 @@ struct GlobalScatterProcessGroupFunctor { } }; -template +template class GlobalScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -281,9 +281,12 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(global_scatter, - ops::GlobalScatterOpCUDAKernel, - ops::GlobalScatterOpCUDAKernel, - ops::GlobalScatterOpCUDAKernel, - ops::GlobalScatterOpCUDAKernel, - ops::GlobalScatterOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(global_scatter, + GPU, + ALL_LAYOUT, + ops::GlobalScatterOpCUDAKernel, + float, + double, + int, + int64_t, + plat::float16) {} diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h index 9a29808e7db101..3cb2a3c7fc41b2 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.h +++ b/paddle/fluid/operators/collective/global_scatter_op.h @@ -25,7 +25,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class GlobalScatterOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 6acc0431762cf7..e5cb72eed931ef 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -328,7 +328,7 @@ std::vector SampleMaskForOneImage( return res; } -template +template class GenerateMaskLabelsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -533,5 +533,9 @@ REGISTER_OPERATOR( ops::GenerateMaskLabelsOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(generate_mask_labels, - ops::GenerateMaskLabelsKernel); + +PD_REGISTER_STRUCT_KERNEL(generate_mask_labels, + CPU, + ALL_LAYOUT, + ops::GenerateMaskLabelsKernel, + float) {} diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index dcffa170b6a3a4..c5274c63ec799f 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -510,7 +510,7 @@ std::vector SampleRoisForOneImage( return res; } -template +template class GenerateProposalLabelsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -811,9 +811,12 @@ REGISTER_OPERATOR( ops::GenerateProposalLabelsOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(generate_proposal_labels, - ops::GenerateProposalLabelsKernel, - ops::GenerateProposalLabelsKernel); +PD_REGISTER_STRUCT_KERNEL(generate_proposal_labels, + CPU, + ALL_LAYOUT, + ops::GenerateProposalLabelsKernel, + float, + double) {} REGISTER_OP_VERSION(generate_proposal_labels) .AddCheckpoint( diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index c7c3af2b4d70a4..c792532e58f792 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -99,7 +99,10 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::BatchSizeLikeNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - gaussian_random_batch_size_like, - paddle::operators::CPUGaussianRandomBatchSizeLikeKernel, - paddle::operators::CPUGaussianRandomBatchSizeLikeKernel); +namespace ops = paddle::operators; +PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, + CPU, + ALL_LAYOUT, + ops::CPUGaussianRandomBatchSizeLikeKernel, + float, + double) {} diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu index b91a93b7cb1cfb..7546c3c350db09 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cu @@ -47,7 +47,7 @@ struct GaussianGenerator { } }; -template +template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -78,9 +78,12 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random_batch_size_like, - paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< - paddle::platform::float16>, - paddle::operators::GPUGaussianRandomBatchSizeLikeKernel, - paddle::operators::GPUGaussianRandomBatchSizeLikeKernel); +namespace ops = paddle::operators; +namespace plat = paddle::platform; +PD_REGISTER_STRUCT_KERNEL(gaussian_random_batch_size_like, + GPU, + ALL_LAYOUT, + ops::GPUGaussianRandomBatchSizeLikeKernel, + float, + double, + plat::float16) {} diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cc b/paddle/fluid/operators/graph_khop_sampler_op.cc index 1cb5ac3c3071ca..2d22e32fd0cd0d 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cc +++ b/paddle/fluid/operators/graph_khop_sampler_op.cc @@ -136,6 +136,10 @@ using CPU = phi::CPUContext; REGISTER_OPERATOR(graph_khop_sampler, ops::GraphKhopSamplerOP, ops::GraphKhopSamplerOpMaker); -REGISTER_OP_CPU_KERNEL(graph_khop_sampler, - ops::GraphKhopSamplerOpKernel, - ops::GraphKhopSamplerOpKernel); + +PD_REGISTER_STRUCT_KERNEL(graph_khop_sampler, + CPU, + ALL_LAYOUT, + ops::GraphKhopSamplerOpKernel, + int32_t, + int64_t) {} diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index c9e4dac74a85a9..e533960c8a648f 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -412,7 +412,7 @@ void ReindexFunc(const framework::ExecutionContext& ctx, thrust::raw_pointer_cast(values.data())); } -template +template class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -668,6 +668,9 @@ class GraphKhopSamplerOpCUDAKernel : public framework::OpKernel { using CUDA = phi::GPUContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(graph_khop_sampler, - ops::GraphKhopSamplerOpCUDAKernel, - ops::GraphKhopSamplerOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL(graph_khop_sampler, + GPU, + ALL_LAYOUT, + ops::GraphKhopSamplerOpCUDAKernel, + int32_t, + int64_t) {} diff --git 
a/paddle/fluid/operators/graph_khop_sampler_op.h b/paddle/fluid/operators/graph_khop_sampler_op.h index a22b7a6ee20d8f..0ab9485ee680cc 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.h +++ b/paddle/fluid/operators/graph_khop_sampler_op.h @@ -191,7 +191,7 @@ void SampleNeighbors(const T* src, } } -template +template class GraphKhopSamplerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 90e15ef273456d..cead83df15235a 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/group_norm_op.h" - #include #include #include diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu deleted file mode 100644 index 9cb4e54ac0054b..00000000000000 --- a/paddle/fluid/operators/group_norm_op.cu +++ /dev/null @@ -1,834 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/fluid/operators/group_norm_op.h" -#include "paddle/phi/backends/gpu/gpu_device_function.h" -#include "paddle/phi/backends/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; -enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; -#define ALIGN_BYTES 16 - -#define CHECK_CASE(i, flags, kernel_name, ...) \ - if (i == flags) { \ - kernel_name<<>>(__VA_ARGS__); \ - } - -// 0 for no scale, no bias -// 1 for has scale, no bias -// 2 for no scale, has bias -// 3 for has scale, has bias -#define UNROLL_ALL_CASES(flags, kernel_name, ...) 
\ - CHECK_CASE(0, flags, kernel_name, __VA_ARGS__) \ - CHECK_CASE(1, flags, kernel_name, __VA_ARGS__) \ - CHECK_CASE(2, flags, kernel_name, __VA_ARGS__) \ - CHECK_CASE(3, flags, kernel_name, __VA_ARGS__) - -template -__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { - typedef cub::WarpReduce WarpReduce; - typename WarpReduce::TempStorage temp_storage; - value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); -} - -template -__global__ void GroupNormForwardGetMeanAndVar(const T* x, - int N, - int C, - int W, - int imsize, - int groups, - int group_size, - T* mean, - T* var) { - int gid = blockIdx.y; - int cid = blockIdx.x; - int bid = blockIdx.z; - int H = imsize / W; - int number = min(group_size, static_cast(C - gid * group_size)); - int ccid = gid * group_size + cid; - if (ccid >= C) return; - T x_mean = 0, x_var = 0; - for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val; - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - - x_mean += val; - x_var += val * val; - } - x_mean /= number * imsize; - x_var /= number * imsize; - CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean); - CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); -} - -template -__device__ __forceinline__ void ThreadReduce(phi::Array arrs, - int size, - const int offset, - AccT* out_mean, - AccT* out_var) { - const T* x = arrs[0]; - const T* y; - if (Num == 2) { - y = arrs[1]; - } - using VecT = kps::details::VectorType; - int tid = threadIdx.x; - if (offset > 0) { - x -= offset; - if (Num == 2) { - y -= offset; - } - size += offset; - if (tid >= offset) { - if (Num == 1) { - *out_mean += x[tid]; - *out_var += x[tid] * x[tid]; - } else if (Num == 2) { - *out_mean += y[tid]; - *out_var += y[tid] * x[tid]; - } - } - size -= blockDim.x; - x += blockDim.x; - if (Num == 2) { - y += blockDim.x; - } - } - int remain = size % (VecSize * blockDim.x); - - T ins_x[VecSize]; - T ins_y[VecSize]; - VecT* ins_vec_x = reinterpret_cast(&ins_x); - VecT* ins_vec_y = reinterpret_cast(&ins_y); - - // vector part - for (; VecSize * tid < (size - remain); tid += blockDim.x) { - *ins_vec_x = reinterpret_cast(x)[tid]; - if (Num == 2) { - *ins_vec_y = reinterpret_cast(y)[tid]; - } - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - if (Num == 1) { - *out_mean += ins_x[i]; - *out_var += ins_x[i] * ins_x[i]; - } else if (Num == 2) { - *out_mean += ins_y[i]; - *out_var += ins_y[i] * ins_x[i]; - } - } - } - - // scalar part - tid = size - remain + threadIdx.x; - for (; tid < size; tid += blockDim.x) { - if (Num == 1) { - *out_mean += x[tid]; - *out_var += x[tid] * x[tid]; - } else if (Num == 2) { - *out_mean += y[tid]; - *out_var += y[tid] * x[tid]; - } - } -} - -template -__device__ __forceinline__ void ReduceMeanAndVar( - T* mean, T* var, T x_mean, T x_var, int size) { - const int nc = blockIdx.x; - x_mean = kps::details::BlockXReduce>( - x_mean, kps::AddFunctor()); - x_var = kps::details::BlockXReduce>( - x_var, kps::AddFunctor()); - __syncthreads(); - if (threadIdx.x == 0) { - mean[nc] = static_cast(x_mean / size); - var[nc] = static_cast(x_var / size); - } -} - -template -__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { - int i = blockIdx.x; - T x_mean = 0, x_var = 0; - for (int j = threadIdx.x; j < size; j += blockDim.x) { - T val; - val = x[i * size + j]; - x_mean += val; - x_var += val * val; - } - ReduceMeanAndVar(mean, var, x_mean, x_var, size); -} - -template 
-__global__ void VectorizedGetMeanAndVarNCHW(const T* x, - T* mean, - T* var, - int size) { - int i = blockIdx.x; - AccT x_mean = static_cast(0); - AccT x_var = static_cast(0); - x += i * size; - const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); - phi::Array ins; - ins[0] = x; - ThreadReduce(ins, size, input_offset, &x_mean, &x_var); - ReduceMeanAndVar(mean, var, x_mean, x_var, size); -} - -template -__global__ void GroupNormForward(const T* x, - const T* mean, - const T* var, - const T* scale, - const T* bias, - int N, - int C, - int W, - int imsize, - int groups, - int group_size, - T epsilon, - T* y, - T* real_var, - const DataLayout data_layout) { - int gid = blockIdx.y; - int cid = blockIdx.x; - int bid = blockIdx.z; - int H = imsize / W; - int ccid = gid * group_size + cid; - if (ccid >= C) return; - auto ng = bid * groups + gid; - T x_mean = mean[ng]; - T x_var = var[ng]; - x_var = x_var - x_mean * x_mean; - T var_inv = rsqrt(x_var + epsilon); - if (cid == 0 && threadIdx.x == 0) { - real_var[ng] = x_var; - } - for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val; - int hid, wid; - int index = (bid * C + ccid) * imsize + imid; - if (data_layout == DataLayout::kNCHW) { - val = x[index]; - } else { - hid = imid / W; - wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } - val = (val - x_mean) * var_inv; - if (flags & kHasScale) { - val *= scale[ccid]; - } - if (flags & kHasBias) { - val += bias[ccid]; - } - if (data_layout == DataLayout::kNCHW) { - y[index] = val; - } else { - y[(bid * H + hid) * W * C + wid * C + ccid] = val; - } - } -} - -template -class GroupNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - const auto x_dims = x->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int group_size = C / groups; - - const int W = - (data_layout == DataLayout::kNCHW ? 
x_dims[x_dims.size() - 1] - : x_dims[x_dims.size() - 2]); - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - phi::DenseTensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - auto* x_data = x->data(); - auto* y_data = y->data(); - auto* mean_data = mean->data(); - auto* var_data = var->data(); - auto* temp_var_data = temp_var.data(); - - const T* scale_data = nullptr; - if (scale) scale_data = scale->data(); - const T* bias_data = nullptr; - if (bias) bias_data = bias->data(); - - int imsize = 1; - if (data_layout == DataLayout::kNCHW) { - for (int i = 2; i < x_dims.size(); ++i) { - imsize *= x_dims[i]; - } - } else { - for (int i = 1; i < x_dims.size() - 1; ++i) { - imsize *= x_dims[i]; - } - } - -#ifdef __HIPCC__ - int block_size = std::max(std::min(256, imsize), 64); -#else - int block_size = std::min(1024, imsize); -#endif - - dim3 grid(group_size, groups, x_dims[0]); - dim3 threads(block_size, 1, 1); - if (data_layout == DataLayout::kNCHW) { - using AccT = typename phi::dtype::MPTypeTrait::Type; - constexpr int vec_size = sizeof(float4) / sizeof(T); - int size = group_size * imsize; - const int max_num_threads = 1024; - int max_block_size = std::min(size / vec_size, max_num_threads); - int block_size_nchw = 1; - while (block_size_nchw < max_block_size) { - block_size_nchw *= 2; - } - block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); - dim3 grids(x_dims[0] * groups); - dim3 blocks(block_size_nchw); - if (size < vec_size * block_size_nchw) { - ScalarGetMeanAndVarNCHW<<>>( - x_data, mean_data, temp_var_data, size); - } else { - VectorizedGetMeanAndVarNCHW - <<>>( - x_data, mean_data, temp_var_data, size); - } - } else { - set_zero(dev_ctx, mean, static_cast(0)); - set_zero(dev_ctx, &temp_var, static_cast(0)); - GroupNormForwardGetMeanAndVar - <<>>(x_data, - x_dims[0], - C, - W, - imsize, - groups, - group_size, - mean_data, - temp_var_data); - } - int flags = - (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, - GroupNormForward, - x_data, - mean_data, - temp_var_data, - scale_data, - bias_data, - x_dims[0], - C, - W, - imsize, - groups, - group_size, - epsilon, - y_data, - var_data, - data_layout); - } -}; - -template -__global__ void GroupNormBackwardGetMeanAndVar(const T* x, - const T* scale, - const T* bias, - const T* d_y, - int N, - int C, - int W, - int imsize, - int groups, - int group_size, - T epsilon, - T* d_mean, - T* d_var, - T* d_scale, - T* d_bias) { - int gid = blockIdx.y; - int cid = blockIdx.x; - int bid = blockIdx.z; - int H = imsize / W; - int number = min(group_size, static_cast(C - gid * group_size)); - int ccid = gid * group_size + cid; - if (ccid >= C) return; - T x_scale = (flags & kHasScale) ? scale[ccid] : 1; - T x_bias = (flags & kHasBias) ? 
bias[ccid] : 0; - T x_scale_inv = 0; - if (x_scale != 0) x_scale_inv = 1.0 / x_scale; - T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; - - for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val, dval; - - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - - d_var_data += val * dval; - d_mean_data += dval * x_scale; - - val = val * x_scale_inv; - d_bias_data += dval; - d_scale_data += val * dval; - } - CudaAtomicAddWithWarp(&(d_mean[bid * groups + gid]), d_mean_data); - CudaAtomicAddWithWarp(&(d_var[bid * groups + gid]), d_var_data); - - if (flags & kHasScale) { -#if CUDA_VERSION >= 11070 - phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); -#else - CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); -#endif - } - if (flags & kHasBias) { -#if CUDA_VERSION >= 11070 - phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); -#else - CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); -#endif - } -} - -template -__global__ void GroupNormBackward(const T* x, - const T* d_y, - const T* scale, - const T* bias, - const T* var, - const T* d_mean, - const T* d_var, - int N, - int C, - int W, - int imsize, - int groups, - int group_size, - T epsilon, - T* d_x) { - int gid = blockIdx.y; - int cid = blockIdx.x; - int bid = blockIdx.z; - int H = imsize / W; - int number = min(group_size, static_cast(C - gid * group_size)); - int ccid = gid * group_size + cid; - if (ccid >= C) return; - T x_var = var[bid * groups + gid]; - T d_x_mean = d_mean[bid * groups + gid]; - T d_x_var = d_var[bid * groups + gid]; - - T x_var_inv = 1.0 / sqrt(x_var + epsilon); - T number_inv = 1.0 / (number * imsize); - - T x_scale = (flags & kHasScale) ? scale[ccid] : 1; - T x_bias = (flags & kHasBias) ? bias[ccid] : 0; - T x_scale_inv = 0; - if (x_scale != 0) x_scale_inv = 1.0 / x_scale; - - for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - int hid = imid / W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } -} - -template -__global__ void ScalarGetDsDbCUDAKernel( - int imsize, const T* x, const T* dy, T* ds, T* db) { - const int nc = blockIdx.x; - T ds_sum = 0; - T db_sum = 0; - for (int i = threadIdx.x; i < imsize; i += blockDim.x) { - const int index = nc * imsize + i; - ds_sum += dy[index] * x[index]; - db_sum += dy[index]; - } - ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); -} - -template -__global__ void GetScaleBiasGradientCUDAKernel(int N, - int C, - int group, - T epsilon, - const T* mean, - const T* var, - const T* ds, - const T* db, - T* d_scale, - T* d_bias) { - const int c = blockIdx.x * blockDim.x + threadIdx.x; - if (c < C) { - const int G = group; - const int D = C / G; - T sum1 = 0; - T sum2 = 0; - for (int n = 0; n < N; ++n) { - const int nc = n * C + c; - const int ng = n * G + c / D; - sum1 += (d_scale == nullptr) - ? T(0) - : ((ds[nc] - db[nc] * static_cast(mean[ng])) * - static_cast(rsqrt(var[ng] + epsilon))); - sum2 += (d_bias == nullptr) ? 
T(0) : db[nc]; - } - if (d_scale != nullptr) { - d_scale[c] = sum1; - } - if (d_bias != nullptr) { - d_bias[c] = sum2; - } - } -} - -template -__global__ void GetBackwardParamsCUDAKernel(int imsize, - int groups, - int group_size, - T epsilon, - const T* mean, - const T* var, - const T* scale, - const T* ds, - const T* db, - T* p1, - T* p2, - T* p3) { - const int n = blockIdx.x; - const int g = blockIdx.y; - const int ng = n * groups + g; - T sum1 = 0; - T sum2 = 0; - T var_inv = rsqrt(var[ng] + epsilon); - for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { - const int64_t index = ng * group_size + i; - const int64_t c = g * group_size + i; - const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); - sum1 += ds[index] * scale_v; - sum2 += db[index] * scale_v; - const T scale_c = scale == nullptr ? T(0) : static_cast(scale[c]); - p1[index] = scale_c * var_inv; - } - - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); - sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); - - if (threadIdx.x == 0) { - const T s = T(1) / static_cast(group_size * imsize); - const T x = (sum2 * static_cast(mean[ng]) - sum1) * - static_cast(var_inv) * static_cast(var_inv) * - static_cast(var_inv) * s; - p2[ng] = x; - p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; - } -} - -template -__global__ void GetXGradientCUDAKernel(int imsize, - int C, - int group_size, - int groups, - T* p1, - T* p2, - T* p3, - const T* x, - const T* dy, - T* dx) { - int cid = blockIdx.x; - int gid = blockIdx.y; - int bid = blockIdx.z; - int ccid = bid * C + gid * group_size + cid; - int ng = bid * groups + gid; - int nc = gid * group_size + cid; - for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - int index = (bid * C + nc) * imsize + imid; - dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; - } -} - -template -class GroupNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* mean = ctx.Input("Mean"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto groups = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? x_dims[1] - : x_dims[x_dims.size() - 1]); - const int group_size = C / groups; - const int W = - (data_layout == DataLayout::kNCHW ? 
x_dims[x_dims.size() - 1] - : x_dims[x_dims.size() - 2]); - - d_x->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - - phi::DenseTensor ds, db; - ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); - db.mutable_data({x_dims[0], C}, ctx.GetPlace()); - T* ds_data = ds.data(); - T* db_data = db.data(); - - auto* y_data = y->data(); - auto* x_data = x->data(); - T* d_x_data = nullptr; - if (d_x) d_x_data = d_x->data(); - auto* dy_data = d_y->data(); - auto* var_data = var->data(); - auto* mean_data = mean->data(); - T* d_scale_data = nullptr; - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - d_scale_data = d_scale->data(); - } - T* d_bias_data = nullptr; - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - d_bias_data = d_bias->data(); - } - - const T* scale_data = nullptr; - if (scale) scale_data = scale->data(); - const T* bias_data = nullptr; - if (bias) bias_data = bias->data(); - - int imsize = 1; - if (data_layout == DataLayout::kNCHW) { - for (int i = 2; i < x_dims.size(); ++i) { - imsize *= x_dims[i]; - } - } else { - for (int i = 1; i < x_dims.size() - 1; ++i) { - imsize *= x_dims[i]; - } - } - -#ifdef __HIPCC__ - int block_size = std::max(std::min(256, imsize), 64); - const int block_dims = 256; -#else - int block_size = std::min(1024, imsize); - const int block_dims = 1024; -#endif - dim3 grid(group_size, groups, x_dims[0]); - dim3 threads(block_size, 1, 1); - int flags = - (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - if (data_layout == DataLayout::kNCHW) { - const int max_num_threads = 1024; - int max_block_size = std::min(imsize, max_num_threads); - int block_size_nchw = 1; - while (block_size_nchw < max_block_size) { - block_size_nchw *= 2; - } - block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); - dim3 blocks(block_size_nchw); - ScalarGetDsDbCUDAKernel - <<>>( - imsize, x_data, dy_data, ds_data, db_data); - - if (d_scale || d_bias) { - const int block = 256; - GetScaleBiasGradientCUDAKernel - <<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( - x_dims[0], - C, - groups, - epsilon, - mean_data, - var_data, - ds_data, - db_data, - d_scale_data, - d_bias_data); - } - - if (d_x_data != nullptr) { - // p1 * dy + p2 * x + p3, - // p1, p2, p3 represent the reverse calculation of temporary variables - // p1 = scale * var_inv - // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) - // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); - phi::DenseTensor p1, p2, p3; - p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); - p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); - p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); - T* p1_data = p1.data(); - T* p2_data = p2.data(); - T* p3_data = p3.data(); - - GetBackwardParamsCUDAKernel - <<>>( - imsize, - groups, - group_size, - epsilon, - mean_data, - var_data, - scale_data, - ds_data, - db_data, - p1_data, - p2_data, - p3_data); - GetXGradientCUDAKernel - <<>>(imsize, - C, - group_size, - groups, - p1_data, - p2_data, - p3_data, - x_data, - dy_data, - d_x_data); - } - } else { - if (d_scale) { - set_zero(dev_ctx, d_scale, static_cast(0)); - } - if (d_bias) { - set_zero(dev_ctx, d_bias, static_cast(0)); - } - - phi::DenseTensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - phi::DenseTensor temp_mean; - temp_mean.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, 
&temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); - - int flags = (scale_data != nullptr) * kHasScale + - (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, - GroupNormBackwardGetMeanAndVar, - y_data, - scale_data, - bias_data, - dy_data, - x_dims[0], - C, - W, - imsize, - groups, - group_size, - epsilon, - temp_mean_data, - temp_var_data, - d_scale_data, - d_bias_data); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, - GroupNormBackward, - y_data, - dy_data, - scale_data, - bias_data, - var_data, - temp_mean_data, - temp_var_data, - x_dims[0], - C, - W, - imsize, - groups, - group_size, - epsilon, - d_x_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(group_norm, - ops::GroupNormKernel, - ops::GroupNormKernel); -REGISTER_OP_CUDA_KERNEL(group_norm_grad, - ops::GroupNormGradKernel, - ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h deleted file mode 100644 index 95cdeefc783f42..00000000000000 --- a/paddle/fluid/operators/group_norm_op.h +++ /dev/null @@ -1,387 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; - -template -class GroupNormKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* x = ctx.Input("X"); - - auto* y = ctx.Output("Y"); - auto* mean = ctx.Output("Mean"); - auto* var = ctx.Output("Variance"); - const auto groups = ctx.Attr("groups"); - - const auto x_dims = x->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int group_size = C / groups; - - y->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto* mean_data = mean->data(); - auto* var_data = var->data(); - - const T* scale_data = nullptr; - if (scale) scale_data = scale->data(); - const T* bias_data = nullptr; - if (bias) bias_data = bias->data(); - - int imsize = 1; - if (data_layout == DataLayout::kNCHW) { - for (int i = 2; i < x_dims.size(); ++i) { - imsize *= x_dims[i]; - } - } else { - for (int i = 1; i < x_dims.size() - 1; ++i) { - imsize *= x_dims[i]; - } - } - auto* iter_x_data = x_data; - auto* iter_y_data = y_data; - for (int bid = 0; bid < x_dims[0]; bid++) { - for (int gid = 0; gid < groups; gid++) { - const int64_t M = 8; - std::array x_mean_arr; - std::array x_var_arr; - std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); - std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); - T x_mean = 0, x_var = 0; - int number = - std::min(group_size, static_cast(C - gid * group_size)); - auto* tmp_x = iter_x_data; - auto* x_src_data = iter_x_data; - auto* tmp_y = iter_y_data; - auto* y_src_data = iter_y_data; - - if (data_layout == DataLayout::kNCHW) { - for (int cid = 0; cid < number; cid++) { - int imid; - for (imid = 0; imid < imsize - (imsize % M); - imid += M, iter_x_data += M) { - // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used - // in template class/function, before we complete high - // performance cpu vector extension, temporarily unrolling - // loop to get high precision and performance - x_mean_arr[0] += iter_x_data[0]; - x_var_arr[0] += iter_x_data[0] * iter_x_data[0]; - x_mean_arr[1] += iter_x_data[1]; - x_var_arr[1] += iter_x_data[1] * iter_x_data[1]; - x_mean_arr[2] += iter_x_data[2]; - x_var_arr[2] += iter_x_data[2] * iter_x_data[2]; - x_mean_arr[3] += iter_x_data[3]; - x_var_arr[3] += iter_x_data[3] * iter_x_data[3]; - x_mean_arr[4] += iter_x_data[4]; - x_var_arr[4] += iter_x_data[4] * iter_x_data[4]; - x_mean_arr[5] += iter_x_data[5]; - x_var_arr[5] += iter_x_data[5] * iter_x_data[5]; - x_mean_arr[6] += iter_x_data[6]; - x_var_arr[6] += iter_x_data[6] * iter_x_data[6]; - x_mean_arr[7] += iter_x_data[7]; - x_var_arr[7] += iter_x_data[7] * iter_x_data[7]; - } - x_mean = - std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); - x_var = - std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); - std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); - std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); - for (; imid < imsize; imid++, iter_x_data++) { - x_mean += iter_x_data[0]; - x_var += iter_x_data[0] * iter_x_data[0]; - } - } - } else { - for (int cid = 0; cid < number; cid++) { - iter_x_data = tmp_x + cid; - int imid; - for (imid = 0; imid < imsize - (imsize % M); - imid += M, iter_x_data += M * C) { - // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used - // in template class/function, before we complete high - // performance cpu vector extension, temporarily unrolling - // loop to get high precision and performance - x_mean_arr[0] += iter_x_data[0 * C]; - x_var_arr[0] += iter_x_data[0 * C] * iter_x_data[0 * C]; - x_mean_arr[1] += iter_x_data[1 * C]; - x_var_arr[1] += iter_x_data[1 * C] * iter_x_data[1 * C]; - x_mean_arr[2] += iter_x_data[2 * C]; - x_var_arr[2] += iter_x_data[2 * C] * iter_x_data[2 * C]; - x_mean_arr[3] += iter_x_data[3 * C]; - x_var_arr[3] += iter_x_data[3 * C] * iter_x_data[3 * C]; - 
x_mean_arr[4] += iter_x_data[4 * C]; - x_var_arr[4] += iter_x_data[4 * C] * iter_x_data[4 * C]; - x_mean_arr[5] += iter_x_data[5 * C]; - x_var_arr[5] += iter_x_data[5 * C] * iter_x_data[5 * C]; - x_mean_arr[6] += iter_x_data[6 * C]; - x_var_arr[6] += iter_x_data[6 * C] * iter_x_data[6 * C]; - x_mean_arr[7] += iter_x_data[7 * C]; - x_var_arr[7] += iter_x_data[7 * C] * iter_x_data[7 * C]; - } - x_mean = - std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); - x_var = - std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); - std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); - std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); - for (; imid < imsize; imid++, iter_x_data += C) { - x_mean += iter_x_data[0]; - x_var += iter_x_data[0] * iter_x_data[0]; - } - } - iter_x_data = tmp_x + group_size; - } - - x_mean /= number * imsize; - x_var /= number * imsize; - x_var = std::max(x_var - x_mean * x_mean, T(0)); - T var_inv = T(1) / std::sqrt(x_var + epsilon); - mean_data[bid * groups + gid] = x_mean; - var_data[bid * groups + gid] = x_var; - - if (data_layout == DataLayout::kNCHW) { - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; imid++, tmp_x++, iter_y_data++) { - T val = (tmp_x[0] - x_mean) * var_inv; - if (scale_data) val *= scale_data[gid * group_size + cid]; - if (bias_data) val += bias_data[gid * group_size + cid]; - iter_y_data[0] = val; - } - } - } else { - for (int cid = 0; cid < number; cid++) { - tmp_x = x_src_data + cid; - iter_y_data = y_src_data + cid; - for (int imid = 0; imid < imsize; - imid++, tmp_x += C, iter_y_data += C) { - T val = (tmp_x[0] - x_mean) * var_inv; - if (scale_data) val *= scale_data[gid * group_size + cid]; - if (bias_data) val += bias_data[gid * group_size + cid]; - iter_y_data[0] = val; - } - } - iter_y_data = tmp_y + group_size; - } - } - if (data_layout == DataLayout::kNHWC) { - iter_x_data = x_data + (bid + 1) * C * imsize; - iter_y_data = y_data + (bid + 1) * C * imsize; - } - } - } -}; - -template -class GroupNormGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = phi::StringToDataLayout(data_layout_str); - const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); - auto* var = ctx.Input("Variance"); - auto* scale = ctx.Input("Scale"); - auto* bias = ctx.Input("Bias"); - auto* d_y = ctx.Input(framework::GradVarName("Y")); - const auto groups = ctx.Attr("groups"); - - // init output - auto* d_x = ctx.Output(framework::GradVarName("X")); - auto* d_scale = - ctx.Output(framework::GradVarName("Scale")); - auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - - const auto& x_dims = x->dims(); - const int C = - (data_layout == DataLayout::kNCHW ? 
x_dims[1] - : x_dims[x_dims.size() - 1]); - const int group_size = C / groups; - - d_x->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - - auto* x_data = x->data(); - auto* d_x_data = d_x->data(); - auto* y_data = d_y->data(); - auto* var_data = var->data(); - T* d_scale_data = nullptr; - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_scale, static_cast(0)); - d_scale_data = d_scale->data(); - } - T* d_bias_data = nullptr; - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); - d_bias_data = d_bias->data(); - } - - const T* scale_data = nullptr; - if (scale) scale_data = scale->data(); - const T* bias_data = nullptr; - if (bias) bias_data = bias->data(); - - int imsize = 1; - if (data_layout == DataLayout::kNCHW) { - for (int i = 2; i < x_dims.size(); ++i) { - imsize *= x_dims[i]; - } - } else { - for (int i = 1; i < x_dims.size() - 1; ++i) { - imsize *= x_dims[i]; - } - } - auto* iter_x_data = x_data; - auto* iter_d_x_data = d_x_data; - auto* iter_y_data = y_data; - for (int bid = 0; bid < x_dims[0]; bid++) { - for (int gid = 0; gid < groups; gid++) { - T x_var = var_data[bid * groups + gid]; - T var_inv = 1.0 / sqrt(x_var + epsilon); - int number = - std::min(group_size, static_cast(C - gid * group_size)); - T number_inv = 1.0 / (number * imsize); - auto* tmp_x = iter_x_data; - auto* tmp_y = iter_y_data; - auto* tmp_d_x = iter_d_x_data; - auto* x_src_data = iter_x_data; - auto* y_src_data = iter_y_data; - auto* iter_x_data_backup = iter_x_data; - auto* iter_y_data_backup = iter_y_data; - auto* iter_d_x_data_backup = iter_d_x_data; - T dp_scale = 0, dp_bias = 0; - - if (data_layout == DataLayout::kNCHW) { - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_x_data++, iter_y_data++) { - T val = iter_x_data[0]; - if (bias_data) val -= bias_data[gid * group_size + cid]; - T dval = iter_y_data[0]; - dp_scale += val * dval; - if (scale_data) - dp_bias += dval * scale_data[gid * group_size + cid]; - - if (scale_data && scale_data[gid * group_size + cid] != 0) - val /= scale_data[gid * group_size + cid]; - if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; - if (d_scale_data) - d_scale_data[gid * group_size + cid] += val * dval; - } - } - - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_d_x_data++, tmp_x++, tmp_y++) { - T v_y = tmp_x[0]; - T dly = tmp_y[0]; - T dss = dp_scale; - T dbs = dp_bias; - T v_scale = 1., v_bias = 0.; - if (scale_data) v_scale = scale_data[gid * group_size + cid]; - if (bias_data) v_bias = bias_data[gid * group_size + cid]; - v_y -= v_bias; - if (v_scale != 0) v_y /= v_scale; - iter_d_x_data[0] = - (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * - var_inv; - } - } - } else { - for (int cid = 0; cid < number; cid++) { - iter_x_data = x_src_data + cid; - iter_y_data = y_src_data + cid; - for (int imid = 0; imid < imsize; - imid++, iter_x_data += C, iter_y_data += C) { - T val = iter_x_data[0]; - if (bias_data) val -= bias_data[gid * group_size + cid]; - T dval = iter_y_data[0]; - dp_scale += val * dval; - if (scale_data) - dp_bias += dval * scale_data[gid * group_size + cid]; - - if (scale_data && scale_data[gid * group_size + cid] != 0) - val /= scale_data[gid * group_size + cid]; - if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; - if (d_scale_data) - d_scale_data[gid * group_size + cid] += val * dval; - } 
- } - - for (int cid = 0; cid < number; cid++) { - tmp_x = x_src_data + cid; - tmp_y = y_src_data + cid; - iter_d_x_data = tmp_d_x + cid; - for (int imid = 0; imid < imsize; - imid++, iter_d_x_data += C, tmp_x += C, tmp_y += C) { - T v_y = tmp_x[0]; - T dly = tmp_y[0]; - T dss = dp_scale; - T dbs = dp_bias; - T v_scale = 1.0, v_bias = 0.; - if (scale_data) v_scale = scale_data[gid * group_size + cid]; - if (bias_data) v_bias = bias_data[gid * group_size + cid]; - v_y -= v_bias; - if (v_scale != 0) v_y /= v_scale; - iter_d_x_data[0] = - (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * - var_inv; - } - } - iter_x_data = iter_x_data_backup + group_size; - iter_y_data = iter_y_data_backup + group_size; - iter_d_x_data = iter_d_x_data_backup + group_size; - } - } - if (data_layout == DataLayout::kNHWC) { - iter_x_data = x_data + (bid + 1) * C * imsize; - iter_d_x_data = d_x_data + (bid + 1) * C * imsize; - iter_y_data = y_data + (bid + 1) * C * imsize; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 112a84b00e3293..08a0b894c9fea9 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -91,10 +91,13 @@ REGISTER_OPERATOR(l1_norm, ops::L1NormGradMaker, ops::L1NormGradMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, ops::L1NormKernel); -REGISTER_OP_CPU_KERNEL(l1_norm_grad, - ops::L1NormGradKernel); -REGISTER_OP_CUDA_KERNEL(l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL(l1_norm_grad, - ops::L1NormGradKernel); +PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} +#endif diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h index 36465c14bf00aa..c268a6c51fbc55 100644 --- a/paddle/fluid/operators/l1_norm_op.h +++ b/paddle/fluid/operators/l1_norm_op.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -39,7 +39,7 @@ class L1NormKernel : public framework::OpKernel { }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { From b1bb74849998b0887f1ce0eec9896c114715256a Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:30:18 +0800 Subject: [PATCH 153/156] update (#52879) --- paddle/fluid/operators/gru_op.cc | 14 ++++++-------- paddle/fluid/operators/gru_op.cu.cc | 13 ++++++------- paddle/fluid/operators/gru_op.h | 2 +- paddle/fluid/operators/gru_unit_op.cc | 10 ++++------ paddle/fluid/operators/gru_unit_op.cu | 11 +++++------ paddle/fluid/operators/gru_unit_op.h | 4 ++-- paddle/fluid/operators/lrn_op.cc | 6 ++++-- paddle/fluid/operators/lrn_op.cu | 5 +++-- paddle/fluid/operators/lrn_op.h | 4 ++-- paddle/fluid/operators/lstm_op.cc | 11 +++++------ paddle/fluid/operators/lstm_op.cu.cc | 10 ++++------ paddle/fluid/operators/lstm_op.h | 4 ++-- 
paddle/fluid/operators/lstm_unit_op.cc | 12 +++++------- paddle/fluid/operators/lstm_unit_op.cu | 18 ++++++++++-------- paddle/fluid/operators/lstm_unit_op.h | 4 ++-- paddle/fluid/operators/lstmp_op.cc | 10 ++++------ paddle/fluid/operators/lstmp_op.cu | 10 ++++------ paddle/fluid/operators/lstmp_op.h | 4 ++-- 18 files changed, 71 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index 2d58438dbf35ea..921076a4a1406e 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -313,11 +313,10 @@ class GRUGradOp : public framework::OperatorWithKernel { } }; -template +template class GRUCPUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { - using DeviceContext = phi::CPUContext; using LodTensorPtr = phi::DenseTensor*; bool is_test = context.Attr("is_test"); @@ -585,9 +584,8 @@ REGISTER_OPERATOR(gru, REGISTER_OPERATOR(gru_grad, ops::GRUGradOp, ops::GRUGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gru, - ops::GRUCPUKernel, - ops::GRUCPUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); + +PD_REGISTER_STRUCT_KERNEL( + gru, CPU, ALL_LAYOUT, ops::GRUCPUKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + gru_grad, CPU, ALL_LAYOUT, ops::GRUGradKernel, float, double) {} diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index 0d3686bb495ef9..f89400fca5373a 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -133,9 +133,8 @@ class GRUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(gru, - ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CUDA_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); + +PD_REGISTER_STRUCT_KERNEL(gru, GPU, ALL_LAYOUT, ops::GRUKernel, float, double) { +} +PD_REGISTER_STRUCT_KERNEL( + gru_grad, GPU, ALL_LAYOUT, ops::GRUGradKernel, float, double) {} diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index 760a33a161cabb..f2fc7663d972a6 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -36,7 +36,7 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index_lod, dst, indexed_src); } -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { diff --git a/paddle/fluid/operators/gru_unit_op.cc b/paddle/fluid/operators/gru_unit_op.cc index 7bd104472fe558..3c8b2947e5fce8 100644 --- a/paddle/fluid/operators/gru_unit_op.cc +++ b/paddle/fluid/operators/gru_unit_op.cc @@ -323,9 +323,7 @@ REGISTER_OPERATOR(gru_unit_grad, ops::GRUUnitGradOp, ops::GRUUnitGradOpNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CPU_KERNEL(gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +PD_REGISTER_STRUCT_KERNEL( + gru_unit, CPU, ALL_LAYOUT, ops::GRUUnitKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + gru_unit_grad, CPU, ALL_LAYOUT, ops::GRUUnitGradKernel, float, double) {} diff --git a/paddle/fluid/operators/gru_unit_op.cu 
b/paddle/fluid/operators/gru_unit_op.cu index adaaf1d09cd764..192594a09e86f5 100644 --- a/paddle/fluid/operators/gru_unit_op.cu +++ b/paddle/fluid/operators/gru_unit_op.cu @@ -14,9 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_CUDA_KERNEL(gru_unit_grad, - ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + +PD_REGISTER_STRUCT_KERNEL( + gru_unit, GPU, ALL_LAYOUT, ops::GRUUnitKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + gru_unit_grad, GPU, ALL_LAYOUT, ops::GRUUnitGradKernel, float, double) {} diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index c22a82c5ae8fd6..9309ca0417f62d 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -25,7 +25,7 @@ namespace operators { enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -153,7 +153,7 @@ class GRUUnitKernel : public framework::OpKernel { } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 96d5a115991b01..d16b2abf4716a5 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -400,5 +400,7 @@ REGISTER_OPERATOR(lrn, ops::LRNGradOpMaker); REGISTER_OPERATOR(lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, ops::LRNGradKernel); + +PD_REGISTER_STRUCT_KERNEL(lrn, CPU, ALL_LAYOUT, ops::LRNKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + lrn_grad, CPU, ALL_LAYOUT, ops::LRNGradKernel, float) {} diff --git a/paddle/fluid/operators/lrn_op.cu b/paddle/fluid/operators/lrn_op.cu index 22b4f0c4618374..20f56cb8f9b947 100644 --- a/paddle/fluid/operators/lrn_op.cu +++ b/paddle/fluid/operators/lrn_op.cu @@ -274,5 +274,6 @@ template struct LRNGradFunctor; } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CUDA_KERNEL(lrn_grad, ops::LRNGradKernel); +PD_REGISTER_STRUCT_KERNEL(lrn, GPU, ALL_LAYOUT, ops::LRNKernel, float) {} +PD_REGISTER_STRUCT_KERNEL( + lrn_grad, GPU, ALL_LAYOUT, ops::LRNGradKernel, float) {} diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index b772aa82e9d7ea..15ebb4df74f47d 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -43,7 +43,7 @@ struct LRNFunctor { const DataLayout data_layout = DataLayout::kAnyLayout); }; -template +template class LRNKernel : public framework::OpKernel { public: // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta) @@ -136,7 +136,7 @@ struct LRNGradFunctor { * The upper and lower is the same as forward. The logic of the sum * is also the same as forward. 
*/ -template +template class LRNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 7250cf65e488ed..d7734e57ee4921 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -358,9 +358,8 @@ REGISTER_OPERATOR(lstm, ops::LSTMGradOpMaker, ops::LSTMGradOpMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); + +PD_REGISTER_STRUCT_KERNEL( + lstm, CPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + lstm_grad, CPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.cu.cc b/paddle/fluid/operators/lstm_op.cu.cc index 13a0ded14b4ea1..b06521088a95a2 100644 --- a/paddle/fluid/operators/lstm_op.cu.cc +++ b/paddle/fluid/operators/lstm_op.cu.cc @@ -15,9 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(lstm, - ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CUDA_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +PD_REGISTER_STRUCT_KERNEL( + lstm, GPU, ALL_LAYOUT, ops::LSTMKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + lstm_grad, GPU, ALL_LAYOUT, ops::LSTMGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index cba587815657a0..0e068c47647e3d 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -35,7 +35,7 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index_lod, dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -197,7 +197,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc index dab7164fad1370..bbe5504b98e39b 100644 --- a/paddle/fluid/operators/lstm_unit_op.cc +++ b/paddle/fluid/operators/lstm_unit_op.cc @@ -142,10 +142,8 @@ REGISTER_OPERATOR(lstm_unit, ops::LstmUnitGradOpMaker, ops::LstmUnitGradOpMaker); REGISTER_OPERATOR(lstm_unit_grad, ops::LstmUnitGradOp); -REGISTER_OP_CPU_KERNEL(lstm_unit, - ops::LstmUnitKernel, - ops::LstmUnitKernel); -REGISTER_OP_CPU_KERNEL( - lstm_unit_grad, - ops::LstmUnitGradKernel, - ops::LstmUnitGradKernel); + +PD_REGISTER_STRUCT_KERNEL( + lstm_unit, CPU, ALL_LAYOUT, ops::LstmUnitKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + lstm_unit_grad, CPU, ALL_LAYOUT, ops::LstmUnitGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index ffc6e42587f1ce..b1c9d035a8cb5d 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -98,7 +98,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, } } -template +template class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -131,7 +131,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { } }; -template 
+template class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -183,9 +183,11 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(lstm_unit, - ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, - ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +PD_REGISTER_STRUCT_KERNEL( + lstm_unit, GPU, ALL_LAYOUT, ops::LstmUnitOpCUDAKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL(lstm_unit_grad, + GPU, + ALL_LAYOUT, + ops::LstmUnitGradOpCUDAKernel, + float, + double) {} diff --git a/paddle/fluid/operators/lstm_unit_op.h b/paddle/fluid/operators/lstm_unit_op.h index abb2eb1620dbe4..0621741b885fb7 100644 --- a/paddle/fluid/operators/lstm_unit_op.h +++ b/paddle/fluid/operators/lstm_unit_op.h @@ -33,7 +33,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc index 63cf07e35b7cb9..44e9a698beee71 100644 --- a/paddle/fluid/operators/lstmp_op.cc +++ b/paddle/fluid/operators/lstmp_op.cc @@ -405,9 +405,7 @@ REGISTER_OPERATOR(lstmp, ops::LSTMPGradMaker, ops::LSTMPGradMaker); REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); -REGISTER_OP_CPU_KERNEL(lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CPU_KERNEL(lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +PD_REGISTER_STRUCT_KERNEL( + lstmp, CPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + lstmp_grad, CPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstmp_op.cu b/paddle/fluid/operators/lstmp_op.cu index 8614eaf5d49595..5559d09f1b9ba9 100644 --- a/paddle/fluid/operators/lstmp_op.cu +++ b/paddle/fluid/operators/lstmp_op.cu @@ -15,9 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lstmp_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(lstmp, - ops::LSTMPKernel, - ops::LSTMPKernel); -REGISTER_OP_CUDA_KERNEL(lstmp_grad, - ops::LSTMPGradKernel, - ops::LSTMPGradKernel); +PD_REGISTER_STRUCT_KERNEL( + lstmp, GPU, ALL_LAYOUT, ops::LSTMPKernel, float, double) {} +PD_REGISTER_STRUCT_KERNEL( + lstmp_grad, GPU, ALL_LAYOUT, ops::LSTMPGradKernel, float, double) {} diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 90e3072dbf97eb..fd9032c730af84 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -78,7 +78,7 @@ inline void ReorderInitState(const DeviceContext& ctx, row_shuffle(ctx, src, index, dst, indexed_src); } -template +template class LSTMPKernel : public framework::OpKernel { public: template @@ -279,7 +279,7 @@ class LSTMPKernel : public framework::OpKernel { } }; -template +template class LSTMPGradKernel : public framework::OpKernel { public: template From 54e4360afe81f127eef8ed776246f2a8806f6a90 Mon Sep 17 00:00:00 2001 From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com> Date: Fri, 14 Apr 2023 10:32:20 +0800 Subject: [PATCH 154/156] delete unused param from swish_grad and relu6_grad (#52805) --- paddle/phi/api/yaml/legacy_backward.yaml | 4 +-- paddle/phi/api/yaml/sparse_backward.yaml | 2 +- paddle/phi/kernels/activation_grad_kernel.h | 4 +-- .../phi/kernels/cpu/activation_grad_kernel.cc | 6 ++--- paddle/phi/kernels/funcs/activation_functor.h | 24 ++++++------------ .../phi/kernels/gpu/activation_grad_kernel.cu | 8 ++---- .../kernels/onednn/activation_grad_kernel.cc | 15 ++++++++--- .../sparse/impl/unary_grad_kernel_impl.h | 2 +- paddle/phi/kernels/sparse/unary_grad_kernel.h | 2 +- .../phi/kernels/xpu/activation_grad_kernel.cc | 19 +++----------- paddle/phi/ops/compat/activation_sig.cc | 10 ++++++-- test/mkldnn/test_activation_mkldnn_op.py | 25 ------------------- 12 files changed, 42 insertions(+), 79 deletions(-) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index d810ad8bd9f032..c0168544a3b55f 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -801,7 +801,7 @@ - backward_op : relu6_grad forward : relu6 (Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad, float threshold = 6) + args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta @@ -1010,7 +1010,7 @@ - backward_op : swish_grad forward : swish (Tensor x) -> Tensor(out) - args : (Tensor x, Tensor out_grad, float bete=1.0) + args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) infer_meta : func : GeneralUnaryGradInferMeta diff --git a/paddle/phi/api/yaml/sparse_backward.yaml b/paddle/phi/api/yaml/sparse_backward.yaml index 949a6c4c19b12c..c541129f7ffbbb 100644 --- a/paddle/phi/api/yaml/sparse_backward.yaml +++ b/paddle/phi/api/yaml/sparse_backward.yaml @@ -252,7 +252,7 @@ - backward_op : relu6_grad forward : relu6(Tensor x) -> Tensor(out) - args : (Tensor out, Tensor out_grad, float threshold = 6) + args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index b322ed5e02a290..ca75a6e0b24a48 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -285,6 +285,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log); 
From 54e4360afe81f127eef8ed776246f2a8806f6a90 Mon Sep 17 00:00:00 2001
From: zhangyuqin1998 <75946871+zhangyuqin1998@users.noreply.github.com>
Date: Fri, 14 Apr 2023 10:32:20 +0800
Subject: [PATCH 154/156] delete unused param from swish_grad and relu6_grad
 (#52805)

---
 paddle/phi/api/yaml/legacy_backward.yaml      |  4 +--
 paddle/phi/api/yaml/sparse_backward.yaml      |  2 +-
 paddle/phi/kernels/activation_grad_kernel.h   |  4 +--
 .../phi/kernels/cpu/activation_grad_kernel.cc |  6 ++---
 paddle/phi/kernels/funcs/activation_functor.h | 24 ++++++------------
 .../phi/kernels/gpu/activation_grad_kernel.cu |  8 ++----
 .../kernels/onednn/activation_grad_kernel.cc  | 15 ++++++++---
 .../sparse/impl/unary_grad_kernel_impl.h      |  2 +-
 paddle/phi/kernels/sparse/unary_grad_kernel.h |  2 +-
 .../phi/kernels/xpu/activation_grad_kernel.cc | 19 +++-----------
 paddle/phi/ops/compat/activation_sig.cc       | 10 ++++++--
 test/mkldnn/test_activation_mkldnn_op.py      | 25 -------------------
 12 files changed, 42 insertions(+), 79 deletions(-)

diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml
index d810ad8bd9f032..c0168544a3b55f 100755
--- a/paddle/phi/api/yaml/legacy_backward.yaml
+++ b/paddle/phi/api/yaml/legacy_backward.yaml
@@ -801,7 +801,7 @@
 
 - backward_op : relu6_grad
   forward : relu6 (Tensor x) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold = 6)
+  args : (Tensor out, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
@@ -1010,7 +1010,7 @@
 
 - backward_op : swish_grad
   forward : swish (Tensor x) -> Tensor(out)
-  args : (Tensor x, Tensor out_grad, float bete=1.0)
+  args : (Tensor x, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : GeneralUnaryGradInferMeta
diff --git a/paddle/phi/api/yaml/sparse_backward.yaml b/paddle/phi/api/yaml/sparse_backward.yaml
index 949a6c4c19b12c..c541129f7ffbbb 100644
--- a/paddle/phi/api/yaml/sparse_backward.yaml
+++ b/paddle/phi/api/yaml/sparse_backward.yaml
@@ -252,7 +252,7 @@
 
 - backward_op : relu6_grad
   forward : relu6(Tensor x) -> Tensor(out)
-  args : (Tensor out, Tensor out_grad, float threshold = 6)
+  args : (Tensor out, Tensor out_grad)
   output : Tensor(x_grad)
   infer_meta :
     func : UnchangedInferMeta
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
index b322ed5e02a290..ca75a6e0b24a48 100644
--- a/paddle/phi/kernels/activation_grad_kernel.h
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -285,6 +285,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Swish);
 
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1);
@@ -294,6 +295,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6);
 
 DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round);
 DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor);
@@ -303,11 +305,9 @@ DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, alpha);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, lambda);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink, threshold);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, beta);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Logit, eps);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, threshold);
 DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, alpha);
-DECLARE_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6, threshold);
 
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, t_min, t_max);
 DECLARE_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b);
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index e15ae5bb89e90b..9273f8393b5b35 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -136,12 +136,14 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, Expm1GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, ReciprocalGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, RsqrtGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, Relu6GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, SoftsignGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, SwishGradFunctor);
 
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
@@ -157,16 +159,12 @@ DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(ThresholdedRelu,
                                                ThresholdedReluGradFunctor,
                                                threshold);
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 Relu6GradFunctor,
-                                                 threshold);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
                                                SoftShrinkGradFunctor,
                                                lambda);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                                HardShrinkGradFunctor,
                                                threshold);
-DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishGradFunctor, beta);
 DEFINE_CPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                MishGradFunctor,
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 78a1f8cb24f852..6a3554318e5e66 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -1505,16 +1505,14 @@ struct Relu6Functor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct Relu6GradFunctor : public BaseActivationFunctor<T> {
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   template <typename Device, typename X, typename Out, typename dOut, typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    float threshold = 6;
     dx.device(d) =
         dout * ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
                    .template cast<T>();
@@ -2188,10 +2186,7 @@ struct SwishFunctor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct SwishGradFunctor : public BaseActivationFunctor<T> {
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   template <typename Device, typename X, typename Out, typename dOut, typename dX>
   void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const {
+    float beta = 1.0;
     auto temp1 = static_cast<T>(1) /
                  (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
     auto out = x * temp1;
@@ -3285,14 +3281,12 @@ struct CudaRelu6Functor : public BaseActivationFunctor<T> {
 
 template <typename T>
 struct CudaRelu6GradFunctor : public BaseActivationFunctor<T> {
   T zero = static_cast<T>(0.0f);
-  float threshold;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   // dx = (out > 0 && out < t) ? dout : 0
   __device__ __forceinline__ T operator()(const T dout, const T out) const {
+    float threshold = 6;
     T t = static_cast<T>(threshold);
     return (out > zero && out < t) ? dout : zero;
   }
@@ -3781,15 +3775,13 @@ template <typename T>
 struct CudaSwishGradFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
   MPType one = static_cast<MPType>(1.0f);
-  float beta;
-  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2)
   __device__ __forceinline__ T operator()(const T arg_dout,
                                           const T arg_x) const {
+    float beta = 1.0;
     MPType dout = static_cast<MPType>(arg_dout);
     MPType x = static_cast<MPType>(arg_x);
     MPType b = static_cast<MPType>(beta);
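Every functor change above follows the same pattern: the attribute leaves GetAttrs() and reappears as a local with a fixed value, threshold = 6 for relu6 and beta = 1.0 for swish, while the gradient formula itself is untouched. A self-contained scalar sketch of the resulting backward rules (plain functions standing in for Paddle's Eigen/CUDA functors; the sigmoid form of the swish derivative below is algebraically equal to the commented expression in CudaSwishGradFunctor):

#include <cassert>
#include <cmath>
#include <cstdio>

// relu6 backward with the now-hardcoded threshold of 6, mirroring
// Relu6GradFunctor after this patch: dx = dout * (0 < out && out < 6).
float Relu6Grad(float dout, float out) {
  const float threshold = 6.0f;  // fixed; no longer a kernel attribute
  return (out > 0.0f && out < threshold) ? dout : 0.0f;
}

// swish backward with the now-hardcoded beta of 1:
// dx = dout * (s + beta * x * s * (1 - s)) where s = sigmoid(beta * x).
float SwishGrad(float dout, float x) {
  const float beta = 1.0f;  // fixed; no longer a kernel attribute
  float s = 1.0f / (1.0f + std::exp(-beta * x));
  return dout * (s + beta * x * s * (1.0f - s));
}

int main() {
  assert(Relu6Grad(1.0f, 3.0f) == 1.0f);  // inside (0, 6): gradient passes
  assert(Relu6Grad(1.0f, 6.0f) == 0.0f);  // at/above the threshold: clipped
  std::printf("swish'(0) = %f (expected 0.5)\n", SwishGrad(1.0f, 0.0f));
  return 0;
}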
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 04a414fd5848e2..c0fb7342a80b65 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -198,12 +198,14 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Expm1, CudaExpm1GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Reciprocal, CudaReciprocalGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, CudaSqrtGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Rsqrt, CudaRsqrtGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, CudaRelu6GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Softsign, CudaSoftsignGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -217,9 +219,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(HardShrink,
                                                CudaHardShrinkGradFunctor,
                                                threshold);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                               CudaSwishGradFunctor,
-                                               beta);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                CudaMishGradFunctor,
@@ -227,9 +226,6 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu,
                                                CudaCELUGradFunctor,
                                                alpha);
-DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 CudaRelu6GradFunctor,
-                                                 threshold);
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA,
                                                  CudaLogitGradFunctor,
                                                  eps);
diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc
index 6355908c250bde..9b8626254c7b5d 100644
--- a/paddle/phi/kernels/onednn/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc
@@ -204,9 +204,16 @@ DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
 DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                   MishOneDNNGradFunctor,
                                                   threshold);
-DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                                  SwishOneDNNGradFunctor,
-                                                  beta);
+
+template <typename T, typename Context>
+void SwishGradKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& dout,
+                     DenseTensor* dx) {
+  SwishOneDNNGradFunctor<T> functor;
+  float beta = 1.0;
+  functor(dev_ctx, x, dout, beta, 0, dx);
+}
 
 template <typename T, typename Context>
 void EluGradKernel(const Context& dev_ctx,
@@ -247,9 +254,9 @@ template <typename T, typename Context>
 void Relu6GradKernel(const Context& dev_ctx,
                      const DenseTensor& out,
                      const DenseTensor& dout,
-                     float threshold,
                      DenseTensor* dx) {
   Relu6OneDNNGradUseOutFunctor<T> functor;
+  float threshold = 6;
   functor(dev_ctx, out, dout, 0, threshold, dx);
 }
diff --git a/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h
index 0709e6d946ffa7..6a71a924526732 100644
--- a/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h
+++ b/paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h
@@ -94,9 +94,9 @@ DEFINE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Abs)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+DEFINE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
 DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
-DEFINE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)
 
 template <typename T, typename Context>
 void CastCooGradKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h
index 88bf0e9002501a..7440533057022e 100644
--- a/paddle/phi/kernels/sparse/unary_grad_kernel.h
+++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h
@@ -62,9 +62,9 @@ DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Log1p)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Abs)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL(Expm1)
+DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu6)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Pow, factor)
 DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(LeakyRelu, alpha)
-DECLARE_SPARSE_UNARY_GRAD_KERNEL_WITH_ONE_ATTR(Relu6, threshold)
 
 template <typename T, typename Context>
 void CastCooGradKernel(const Context& dev_ctx,
diff --git a/paddle/phi/kernels/xpu/activation_grad_kernel.cc b/paddle/phi/kernels/xpu/activation_grad_kernel.cc
index df9674a16d6250..02243215f95886 100644
--- a/paddle/phi/kernels/xpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/xpu/activation_grad_kernel.cc
@@ -351,10 +351,7 @@ struct XPUReluGradFunctor : public funcs::BaseActivationFunctor<T> {
 template <typename T>
 struct XPURelu6GradFunctor : public funcs::BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
-  float threshold;
-  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"threshold", &threshold}};
-  }
+  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   template <typename Context>
   void operator()(const Context& dev_ctx,
                   const DenseTensor* x,
@@ -481,10 +478,7 @@ void PowGradKernel(const Context& dev_ctx,
 template <typename T>
 struct XPUSwishGradFunctor : public funcs::BaseActivationFunctor<T> {
   using XPUType = typename XPUTypeTrait<T>::Type;
-  float beta;
-  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() {
-    return {{"beta", &beta}};
-  }
+  typename funcs::BaseActivationFunctor<T>::AttrPair GetAttrs() { return {{}}; }
 
   template <typename Context>
   void operator()(const Context& dev_ctx,
@@ -571,14 +565,13 @@ DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, XPUSigmoidGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, XPUSqrtGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, XPUTanhGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, XPUReluGradFunctor);
+DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu6, XPURelu6GradFunctor);
 
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, XPUSiluGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, XPULogGradFunctor);
 DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Square, XPUSquareGradFunctor);
+DEFINE_XPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, XPUSwishGradFunctor);
 
-DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish,
-                                               XPUSwishGradFunctor,
-                                               beta);
 DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish,
                                                XPUMishGradFunctor,
                                                threshold);
@@ -586,10 +579,6 @@ DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                XPULeakyReluGradFunctor,
                                                alpha);
 
-DEFINE_XPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(Relu6,
-                                                 XPURelu6GradFunctor,
-                                                 threshold);
-
 DEFINE_XPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus,
                                                XPUSoftPlusGradFunctor,
                                                beta,
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index 804d0d63aa2891..e754c79ed1b29b 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -41,8 +41,14 @@ namespace phi {
 
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardTanh, "hardtanh", "t_min" comma "t_max");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Mish, "mish", "threshold");
-DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Swish, "swish", "beta");         // NOLINT
-DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu6, "relu6", "threshold");  // NOLINT
+
+KernelSignature SwishGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("swish_grad", {"X", "Out@GRAD"}, {}, {"X@GRAD"});
+}
+
+KernelSignature Relu6GradOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"});
+}
 
 KernelSignature HardSwishGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
diff --git a/test/mkldnn/test_activation_mkldnn_op.py b/test/mkldnn/test_activation_mkldnn_op.py
index 77c28e28869732..09ebe1ae21e8de 100644
--- a/test/mkldnn/test_activation_mkldnn_op.py
+++ b/test/mkldnn/test_activation_mkldnn_op.py
@@ -16,7 +16,6 @@
 import numpy as np
 from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd
-from scipy.special import expit
 
 import paddle
 import paddle.nn.functional as F
@@ -416,30 +415,6 @@ def init_dtype(self):
         self.dtype = np.float32
 
 
-class TestMKLDNNSwishDim4(TestSwish):
-    def setUp(self):
-        super().setUp()
-
-        x = np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype(self.dtype)
-        beta = 2.3
-        out = x * expit(beta * x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-        self.attrs = {"use_mkldnn": True, "beta": beta}
-
-    def init_dtype(self):
-        self.dtype = np.float32
-
-    def test_check_output(self):
-        self.check_output(check_dygraph=False)
-
-    def test_check_grad(self):
-        if self.dtype == np.float16:
-            return
-        self.check_grad(['X'], 'Out', check_dygraph=False)
-
-
 def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0):
     x_dtype = x.dtype
     if x_dtype == 'float16':
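The activation_sig.cc change above makes the same point at the framework boundary: the fluid-to-phi argument mappings for swish_grad and relu6_grad now carry an empty attribute list. A sketch of what such a mapping holds (the Signature struct here is an illustrative stand-in for phi::KernelSignature, not the real class):

#include <iostream>
#include <string>
#include <vector>

// A kernel name plus the op inputs, attributes, and outputs it maps to.
struct Signature {
  std::string name;
  std::vector<std::string> inputs, attrs, outputs;
};

// Mirrors KernelSignature("relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}):
// no attributes are forwarded to the kernel anymore.
Signature Relu6GradMapping() {
  return {"relu6_grad", {"Out", "Out@GRAD"}, {}, {"X@GRAD"}};
}

int main() {
  Signature sig = Relu6GradMapping();
  std::cout << sig.name << " maps " << sig.attrs.size() << " attrs\n";  // 0
  return 0;
}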
From 2f4997135c3703a8a9476af20e79c6b443517eea Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Fri, 14 Apr 2023 10:33:08 +0800
Subject: [PATCH 155/156] update (#52880)

---
 paddle/fluid/operators/hash_op.cc             |  4 ++-
 paddle/fluid/operators/hash_op.h              |  2 +-
 paddle/fluid/operators/hinge_loss_op.cc       | 21 ++++++++-------
 paddle/fluid/operators/hinge_loss_op.h        |  4 +--
 paddle/fluid/operators/im2sequence_op.cc      | 21 ++++++++-------
 paddle/fluid/operators/im2sequence_op.h       |  4 +--
 paddle/fluid/operators/inplace_abn_op.cc      | 18 +++++++------
 paddle/fluid/operators/inplace_abn_op.cu      | 26 ++++++++++---------
 .../fluid/operators/limit_by_capacity_op.cc   | 11 +++++---
 .../fluid/operators/limit_by_capacity_op.cu   | 11 ++++----
 paddle/fluid/operators/limit_by_capacity_op.h |  2 +-
 paddle/fluid/operators/linear_chain_crf_op.cc | 20 +++++++++-----
 paddle/fluid/operators/linear_chain_crf_op.h  |  4 +--
 paddle/fluid/operators/margin_rank_loss_op.cc | 12 ++++++---
 paddle/fluid/operators/margin_rank_loss_op.cu | 11 +++++---
 paddle/fluid/operators/margin_rank_loss_op.h  |  4 +--
 .../fluid/operators/modified_huber_loss_op.cc | 12 ++++++---
 .../fluid/operators/modified_huber_loss_op.cu | 15 +++++++----
 .../fluid/operators/modified_huber_loss_op.h  |  4 +--
 19 files changed, 122 insertions(+), 84 deletions(-)

diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc
index f111a379e16fe9..e5fc57c6567b4f 100644
--- a/paddle/fluid/operators/hash_op.cc
+++ b/paddle/fluid/operators/hash_op.cc
@@ -79,4 +79,6 @@ class HashOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker);
-REGISTER_OP_CPU_KERNEL(hash, ops::HashKernel<int>, ops::HashKernel<int64_t>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    hash, CPU, ALL_LAYOUT, ops::HashKernel, int, int64_t) {}
diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h
index 47b478025b4a9b..700f7c1d70138a 100644
--- a/paddle/fluid/operators/hash_op.h
+++ b/paddle/fluid/operators/hash_op.h
@@ -38,7 +38,7 @@ inline void HashOutputSize(const framework::DDim& in_dims,
     out_dims.emplace_back(1);
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class HashKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc
index 6741af7638809e..dea3ce3fe695b8 100644
--- a/paddle/fluid/operators/hinge_loss_op.cc
+++ b/paddle/fluid/operators/hinge_loss_op.cc
@@ -150,12 +150,15 @@ REGISTER_OPERATOR(hinge_loss,
                   ops::HingeLossGradOpMaker<paddle::framework::OpDesc>,
                   ops::HingeLossGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(hinge_loss_grad, ops::HingeLossGradOp);
-REGISTER_OP_CPU_KERNEL(hinge_loss,
-                       ops::HingeLossKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(hinge_loss_grad,
-                       ops::HingeLossGradKernel<phi::CPUContext, float>);
-
-REGISTER_OP_CUDA_KERNEL(hinge_loss,
-                        ops::HingeLossKernel<phi::GPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(hinge_loss_grad,
-                        ops::HingeLossGradKernel<phi::GPUContext, float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    hinge_loss, CPU, ALL_LAYOUT, ops::HingeLossKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(
+    hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    hinge_loss_grad, GPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {}
+#endif
diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h
index 8f06154c79060e..968b79ea91be49 100644
--- a/paddle/fluid/operators/hinge_loss_op.h
+++ b/paddle/fluid/operators/hinge_loss_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class HingeLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -38,7 +38,7 @@ class HingeLossKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class HingeLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index b58f9a55756ad5..8c123bb8a32f22 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -195,12 +195,15 @@ REGISTER_OPERATOR(im2sequence,
                   ops::Im2SequenceGradMaker<paddle::framework::OpDesc>,
                   ops::Im2SequenceGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp);
-REGISTER_OP_CPU_KERNEL(im2sequence,
-                       ops::Im2SequenceKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(im2sequence_grad,
-                       ops::Im2SequenceGradKernel<phi::CPUContext, float>);
-
-REGISTER_OP_CUDA_KERNEL(im2sequence,
-                        ops::Im2SequenceKernel<phi::GPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(im2sequence_grad,
-                        ops::Im2SequenceGradKernel<phi::GPUContext, float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    im2sequence, CPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(
+    im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    im2sequence_grad, GPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {}
+#endif
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index 523639faddcbea..18e6d429f1b161 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -33,7 +33,7 @@ inline int Im2SeqOutputSize(
   return output_size;
 }
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class Im2SequenceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -153,7 +153,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class Im2SequenceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc
index 5acc9f1bd13c23..5deffde2b562c1 100644
--- a/paddle/fluid/operators/inplace_abn_op.cc
+++ b/paddle/fluid/operators/inplace_abn_op.cc
@@ -210,7 +210,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class InplaceABNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -270,7 +270,7 @@ class InplaceABNKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class InplaceABNGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -373,9 +373,11 @@ REGISTER_OPERATOR(inplace_abn,
                   InplaceAbnOpInplaceInferer)
 REGISTER_OPERATOR(inplace_abn_grad, ops::InplaceABNGradOp)
 
-REGISTER_OP_CPU_KERNEL(inplace_abn,
-                       ops::InplaceABNKernel<phi::CPUContext, float>,
-                       ops::InplaceABNKernel<phi::CPUContext, double>);
-REGISTER_OP_CPU_KERNEL(inplace_abn_grad,
-                       ops::InplaceABNGradKernel<phi::CPUContext, float>,
-                       ops::InplaceABNGradKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    inplace_abn, CPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {}
+PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::InplaceABNGradKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu
index a7d5a514c58553..38db9e847bf334 100644
--- a/paddle/fluid/operators/inplace_abn_op.cu
+++ b/paddle/fluid/operators/inplace_abn_op.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class InplaceABNKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -109,7 +109,7 @@ class InplaceABNKernel : public framework::OpKernel<T> {
 
 // Deriving the Gradient for the Backward Pass of Batch Normalization
 // https://kevinzakka.github.io/2016/09/14/batch_normalization/
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class InplaceABNGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -221,15 +221,17 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 #ifdef PADDLE_WITH_HIP
 // MIOPEN do not support double
-REGISTER_OP_CUDA_KERNEL(inplace_abn,
-                        ops::InplaceABNKernel<phi::GPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(inplace_abn_grad,
-                        ops::InplaceABNGradKernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(
+    inplace_abn_grad, GPU, ALL_LAYOUT, ops::InplaceABNGradKernel, float) {}
 #else
-REGISTER_OP_CUDA_KERNEL(inplace_abn,
-                        ops::InplaceABNKernel<phi::GPUContext, float>,
-                        ops::InplaceABNKernel<phi::GPUContext, double>);
-REGISTER_OP_CUDA_KERNEL(inplace_abn_grad,
-                        ops::InplaceABNGradKernel<phi::GPUContext, float>,
-                        ops::InplaceABNGradKernel<phi::GPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    inplace_abn, GPU, ALL_LAYOUT, ops::InplaceABNKernel, float, double) {}
+PD_REGISTER_STRUCT_KERNEL(inplace_abn_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::InplaceABNGradKernel,
+                          float,
+                          double) {}
 #endif
diff --git a/paddle/fluid/operators/limit_by_capacity_op.cc b/paddle/fluid/operators/limit_by_capacity_op.cc
index ffae23c7025379..e4ce30d41ae63a 100644
--- a/paddle/fluid/operators/limit_by_capacity_op.cc
+++ b/paddle/fluid/operators/limit_by_capacity_op.cc
@@ -77,10 +77,13 @@ class LimitByCapacityOpMaker : public framework::OpProtoAndCheckerMaker {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CPU_KERNEL(limit_by_capacity,
-                       ops::LimitByCapacityOpCPUKernel<int>,
-                       ops::LimitByCapacityOpCPUKernel<int64_t>);
-
 REGISTER_OP_WITHOUT_GRADIENT(limit_by_capacity,
                              ops::LimitByCapacityOp,
                              ops::LimitByCapacityOpMaker);
+
+PD_REGISTER_STRUCT_KERNEL(limit_by_capacity,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LimitByCapacityOpCPUKernel,
+                          int,
+                          int64_t) {}
diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu
index d14e4c75425c9b..4ddc921144843d 100644
--- a/paddle/fluid/operators/limit_by_capacity_op.cu
+++ b/paddle/fluid/operators/limit_by_capacity_op.cu
@@ -47,7 +47,7 @@ __global__ void limit_by_capacity_impl(
   }
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -78,7 +78,8 @@ class LimitByCapacityOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(limit_by_capacity,
-                        ops::LimitByCapacityOpCUDAKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(limit_by_capacity,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::LimitByCapacityOpCUDAKernel,
+                          int64_t) {}
diff --git a/paddle/fluid/operators/limit_by_capacity_op.h b/paddle/fluid/operators/limit_by_capacity_op.h
index c76d298f429821..c08183b5f1a676 100644
--- a/paddle/fluid/operators/limit_by_capacity_op.h
+++ b/paddle/fluid/operators/limit_by_capacity_op.h
@@ -24,7 +24,7 @@
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class LimitByCapacityOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index ebaa61416f0cd6..46ff4c2e94a94b 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -395,10 +395,16 @@ REGISTER_OPERATOR(linear_chain_crf,
 REGISTER_OPERATOR(linear_chain_crf_grad,
                   ops::LinearChainCRFGradOp,
                   ops::LinearChainCRFGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(linear_chain_crf,
-                       ops::LinearChainCRFOpKernel<phi::CPUContext, float>,
-                       ops::LinearChainCRFOpKernel<phi::CPUContext, double>);
-REGISTER_OP_CPU_KERNEL(
-    linear_chain_crf_grad,
-    ops::LinearChainCRFGradOpKernel<phi::CPUContext, float>,
-    ops::LinearChainCRFGradOpKernel<phi::CPUContext, double>);
+
+PD_REGISTER_STRUCT_KERNEL(linear_chain_crf,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LinearChainCRFOpKernel,
+                          float,
+                          double) {}
+PD_REGISTER_STRUCT_KERNEL(linear_chain_crf_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LinearChainCRFGradOpKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index bf68c7298e72a7..49387240625c18 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -48,7 +48,7 @@ struct ScalarMul {
 
 using framework::LoD;
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class LinearChainCRFOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -245,7 +245,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc
index 47ed77cbfb4e9b..2aaf8f74af359e 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cc
+++ b/paddle/fluid/operators/margin_rank_loss_op.cc
@@ -181,7 +181,11 @@ REGISTER_OPERATOR(margin_rank_loss,
                   ops::MarginRankLossGradMaker<paddle::framework::OpDesc>,
                   ops::MarginRankLossGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp);
-REGISTER_OP_CPU_KERNEL(margin_rank_loss,
-                       ops::MarginRankLossKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(margin_rank_loss_grad,
-                       ops::MarginRankLossGradKernel<phi::CPUContext, float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    margin_rank_loss, CPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MarginRankLossGradKernel,
+                          float) {}
diff --git a/paddle/fluid/operators/margin_rank_loss_op.cu b/paddle/fluid/operators/margin_rank_loss_op.cu
index f672381ed7a413..8c6c2ee055f9c2 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.cu
+++ b/paddle/fluid/operators/margin_rank_loss_op.cu
@@ -16,7 +16,10 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_CUDA_KERNEL(margin_rank_loss,
-                        ops::MarginRankLossKernel<phi::GPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(margin_rank_loss_grad,
-                        ops::MarginRankLossGradKernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    margin_rank_loss, GPU, ALL_LAYOUT, ops::MarginRankLossKernel, float) {}
+PD_REGISTER_STRUCT_KERNEL(margin_rank_loss_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MarginRankLossGradKernel,
+                          float) {}
diff --git a/paddle/fluid/operators/margin_rank_loss_op.h b/paddle/fluid/operators/margin_rank_loss_op.h
index d04af331fd18df..49cbb1168f1b50 100644
--- a/paddle/fluid/operators/margin_rank_loss_op.h
+++ b/paddle/fluid/operators/margin_rank_loss_op.h
@@ -34,7 +34,7 @@ struct Heaviside {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MarginRankLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
@@ -62,7 +62,7 @@ class MarginRankLossKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MarginRankLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index e0e64bb0c26809..b44c795b6e5368 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -176,7 +176,11 @@ REGISTER_OPERATOR(
     ops::ModifiedHuberLossGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(modified_huber_loss_grad, ops::ModifiedHuberLossGradOp);
 
-REGISTER_OP_CPU_KERNEL(modified_huber_loss,
-                       ops::ModifiedHuberLossKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
-                       ops::ModifiedHuberLossGradCPUKernel<float>);
+PD_REGISTER_STRUCT_KERNEL(
+    modified_huber_loss, CPU, ALL_LAYOUT, ops::ModifiedHuberLossKernel, float) {
+}
+PD_REGISTER_STRUCT_KERNEL(modified_huber_loss_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::ModifiedHuberLossGradCPUKernel,
+                          float) {}
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
index bd4451ebda46df..f811b0ad9d6d64 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -39,7 +39,7 @@ struct ModifiedHuberLossBackward {
   }
 };
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -74,7 +74,12 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(modified_huber_loss,
-                        ops::ModifiedHuberLossKernel<phi::GPUContext, float>);
-REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
-                        ops::ModifiedHuberLossGradGPUKernel<float>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    modified_huber_loss, GPU, ALL_LAYOUT, ops::ModifiedHuberLossKernel, float) {
+}
+PD_REGISTER_STRUCT_KERNEL(modified_huber_loss_grad,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::ModifiedHuberLossGradGPUKernel,
+                          float) {}
diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
index 62600ed7c6970b..571482ce475886 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
@@ -52,7 +52,7 @@ struct ModifiedHuberLossForward {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class ModifiedHuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -79,7 +79,7 @@ class ModifiedHuberLossKernel : public framework::OpKernel<T> {
 };
 
 // CPU backward kernel
-template <typename T>
+template <typename T, typename DeviceContext>
 class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
 public:
   void Compute(const framework::ExecutionContext& context) const override {

From ce6978c6aef67c03aaf2a3fcbd66254f37510f1a Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Fri, 14 Apr 2023 10:33:30 +0800
Subject: [PATCH 156/156] update (#52875)

---
 paddle/phi/tools/get_kernel_signatures.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/phi/tools/get_kernel_signatures.py b/paddle/phi/tools/get_kernel_signatures.py
index 9b165ef4541cae..cb03c47710a22e 100644
--- a/paddle/phi/tools/get_kernel_signatures.py
+++ b/paddle/phi/tools/get_kernel_signatures.py
@@ -83,7 +83,7 @@ def search(cls, search_path):
         )
         return pd.merge(
             kernel_func_df, func_signature_df, on='kernel_func', how='left'
-        )[['kernel_name', 'kernel_signature']]
+        )[['kernel_name', 'kernel_signature']].sort_values(by='kernel_name')
 
     def filter_result(self):
         for kernel_name in self.filter["kernel_name"]:
@@ -207,14 +207,14 @@ def get_kernel_signatures():
     # Because phi/kernels has some independent subdirs, whose kernel names
     # (in different namespaces) may conflict with main directory or other
     # subdirs, so we need to search them separately.
-    indenpendent_subdir = [
+    independent_subdir = [
         'fusion',
         'legacy',
         'selected_rows',
         'sparse',
         'strings',
     ]
-    for subdir in indenpendent_subdir:
+    for subdir in independent_subdir:
         sub_path = osp.join(base_path, subdir)
         sub_df = KernelSignatureSearcher.search(sub_path)
         kernel_signature_df = pd.concat(