From 4c6544a953e756256bc0ee430980fd18d61078d5 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 7 Dec 2022 09:07:09 +0000 Subject: [PATCH 1/5] rewrite convert_to_mixed_precision --- .../fluid/framework/ir/float_to_half_pass.cc | 68 +- .../fluid/framework/ir/float_to_half_pass.h | 18 +- .../inference/analysis/ir_pass_manager.cc | 31 +- .../ir_passes/tensorrt_subgraph_pass.cc | 16 +- .../passes/convert_to_mixed_precision.cc | 805 ++---------------- .../passes/convert_to_mixed_precision.h | 49 +- 6 files changed, 183 insertions(+), 804 deletions(-) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/float_to_half_pass.cc index ec94728fb3c641..9f2098683a95a9 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/float_to_half_pass.cc @@ -66,6 +66,23 @@ bool GpuKernelSupportPrecision( return support; } +inline bool VarNodeHasDtype(Node* var_node) { + auto type = var_node->Var()->GetType(); + return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || + (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || + (type == VarType::VOCAB); +} + +inline bool IsFloatType(VarType::Type type) { + return (type == VarType::FP64) || (type == VarType::FP32); +} + +inline bool IsHalfType(VarType::Type type) { + return (type == VarType::FP16) || (type == VarType::BF16); +} + +}; // namespace + void DoInsertCastOp(Graph* graph, Node* var_node, Node* op_node, @@ -118,23 +135,19 @@ void DoInsertCastOp(Graph* graph, IR_NODE_UNLINK(var_node, op_node); } -inline bool VarNodeHasDtype(Node* var_node) { - auto type = var_node->Var()->GetType(); - return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || - (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || - (type == VarType::VOCAB); -} - -inline bool IsFloatType(VarType::Type type) { - return (type == VarType::FP64) || (type == VarType::FP32); -} - -inline bool IsHalfType(VarType::Type type) { - return (type == VarType::FP16) || (type == VarType::BF16); +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list) { + bool support = false; + if (black_list.count(op_type) == 0) { + if (backend == phi::Backend::GPU) { + support = GpuKernelSupportPrecision(op_type, precision); + } + } + return support; } -}; // namespace - // The set of ops that support fp16 calculation and are considered // numerically-dangerous, slower and whose effects may also be observed in // downstream ops. 
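For reference, a minimal sketch of how the two free helpers introduced in the hunk above (OpSupportPrecision and DoInsertCastOp) could be called from another graph pass. Everything here is illustrative and not part of the patch: the function name, the blacklist contents, and the variables graph / op_node / var_node / block_desc are assumed to come from the caller, and the container template arguments (e.g. std::unordered_set<std::string>) are my assumption of the full signatures.

// Hypothetical caller: check GPU fp16 support for an op, then wire in a cast.
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "paddle/fluid/framework/ir/float_to_half_pass.h"

namespace paddle {
namespace framework {
namespace ir {

void MaybeCastInputToHalf(Graph* graph,
                          Node* op_node,
                          Node* var_node,
                          BlockDesc* block_desc) {
  // Ops listed here are forced to stay in fp32 (placeholder contents).
  const std::unordered_set<std::string> black_list{"softmax", "layer_norm"};

  // Per the hunk above, OpSupportPrecision returns true only when the backend
  // is GPU, the op is not blacklisted, and a GPU fp16 kernel is registered.
  if (!OpSupportPrecision(op_node->Op()->Type(),
                          phi::Backend::GPU,
                          phi::DataType::FLOAT16,
                          black_list)) {
    return;
  }

  // The producer of var_node still emits fp32, so insert cast(fp32 -> fp16)
  // between var_node and op_node; the cache map lets a variable feeding
  // several low-precision ops reuse a single cast node.
  int suffix = 0;
  std::unordered_map<Node*, Node*> cache;
  DoInsertCastOp(graph,
                 var_node,
                 op_node,
                 proto::VarType::FP32,
                 proto::VarType::FP16,
                 block_desc,
                 &suffix,
                 &cache);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle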
@@ -172,10 +185,17 @@ void FloatToHalfPass::SetDefaultBlacklist() const { void FloatToHalfPass::Init(Graph* graph) const { keep_io_types_ = true; + if (Has("keep_io_types")) { + keep_io_types_ = Get("keep_io_types"); + } half_precision_ = static_cast(Get("mixed_precision_mode")); black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); + VLOG(4) << "black_list has "; + for (const auto& name : black_list_) { + VLOG(4) << " - " << name; + } auto graph_size = graph->SubGraphsSize(); VLOG(4) << "graph size: " << graph_size; @@ -235,18 +255,6 @@ void FloatToHalfPass::ApplyImpl(Graph* graph) const { VLOG(4) << "RestoreOpOriginType done"; } -bool FloatToHalfPass::OpSupportPrecision(const std::string& op_type, - phi::DataType precision, - phi::Backend backend) const { - bool support = false; - if (black_list_.count(op_type) == 0) { - if (backend == phi::Backend::GPU) { - support = GpuKernelSupportPrecision(op_type, precision); - } - } - return support; -} - void FloatToHalfPass::SetOpUniqueType() const { int suffix = 0; for (const auto& nodes : all_op_nodes_) { @@ -328,8 +336,10 @@ void FloatToHalfPass::GetOpPrecision() const { GetOpOriginalType(op_type) == "fetch") { support_half = !keep_io_types_; } else { - support_half = - OpSupportPrecision(GetOpOriginalType(op_type), half_precision_); + support_half = OpSupportPrecision(GetOpOriginalType(op_type), + phi::Backend::GPU, + half_precision_, + black_list_); } if (op_node->Op()->HasAttr("dtype")) { diff --git a/paddle/fluid/framework/ir/float_to_half_pass.h b/paddle/fluid/framework/ir/float_to_half_pass.h index a274dc9a53c61a..89351a363403b1 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.h +++ b/paddle/fluid/framework/ir/float_to_half_pass.h @@ -46,10 +46,6 @@ class FloatToHalfPass : public FusePassBase { void SetDefaultBlacklist() const; - bool OpSupportPrecision(const std::string& op_type, - phi::DataType precision, - phi::Backend backend = phi::Backend::GPU) const; - void SetOpUniqueType() const; void RestoreOpOriginType() const; @@ -93,6 +89,20 @@ class FloatToHalfPass : public FusePassBase { mutable std::unordered_set vars_convert_to_half_; }; +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list); + +void DoInsertCastOp(Graph* graph, + Node* var_node, + Node* op_node, + proto::VarType::Type from_type, + proto::VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* cache); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cbcc48a7f68e85..2068b8abb8375b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -44,8 +44,10 @@ IRPassManager::IRPassManager(Argument *argument) { void IRPassManager::CreatePasses(Argument *argument, const std::vector &passes) { + // For graph_viz_pass std::string pre_pass; int pass_num = 0; + for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen())); @@ -86,15 +88,6 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_tuned_dynamic_shape(); pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); - // mixed precision related - pass->Set("model_precision", new int(argument->model_precision())); - pass->Set( - 
"mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); - pass->Set("mixed_precision_mode", - new int(argument->mixed_precision_mode())); - if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); std::string dot_file_path; @@ -209,10 +202,17 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector(argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core())); + // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will // not run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + + // Mixed precision related. + pass->Set("model_precision", new int(argument->model_precision())); + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); } else if (pass_name == "dlnne_subgraph_pass") { auto precision_mode = argument->dlnne_precision_mode(); pass->Set("min_subgraph_size", @@ -235,8 +235,7 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } else if (pass_name == "memory_optimize_pass") { pass->Set("root_predictor_id", new int(argument->root_predictor_id())); - } - if (pass_name == "lite_subgraph_pass") { + } else if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", @@ -284,8 +283,7 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("nnadapter_model_cache_token", new std::vector( argument->nnadapter_model_cache_token())); - } - if (pass_name == "fc_fuse_pass") { + } else if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; for (const std::string &pass_n : passes) { @@ -295,6 +293,13 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); + } else if (pass_name == "float_to_half_pass") { + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); + pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); + pass->Set("mixed_precision_mode", + new int(argument->mixed_precision_mode())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 44249796ec4055..f67891feccc5ce 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -83,14 +83,14 @@ void OutputProcess(framework::ir::Graph *graph, backend, precision, blacklist)) { - AddCastOp(graph, - var_node, - next_op, - framework::proto::VarType::FP32, - to_type, - &suffix, - block_desc, - &var_to_cast_op_map); + InsertCastOp(graph, + var_node, + next_op, + framework::proto::VarType::FP32, + to_type, + block_desc, + &suffix, + &var_to_cast_op_map); var_node->Var()->SetDataType(framework::proto::VarType::FP32); } } diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index afc1d8a882ca6e..e3740ff4e96e56 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ 
b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,662 +14,71 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/float_to_half_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/node.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/inference/io.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/common/place.h" +#include "paddle/phi/common/backend.h" namespace paddle { namespace inference { namespace analysis { -namespace { -using VarType = framework::proto::VarType; - -bool PhiKernelSupportPrecision( - const std::string& op_type, +ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, phi::Backend backend, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto kernels = phi::KernelFactory::Instance().kernels(); - if (kernels.find(op_type) == kernels.end()) { - return false; - } - phi::KernelKey kernel_key(backend, layout, data_type); - return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key); -} - -bool GpuKernelSupportPrecision( - const std::string& op_type, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool res = PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPU, data_type, layout); - res |= PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPUDNN, data_type, layout); - - if (!res) { - auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); - auto it = all_kernels.find(op_type); - if (it != all_kernels.end()) { - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_) && - kern_pair.first.data_type_ == VarType::FP16) { - res = true; - break; - } - } - } - } - return res; -} - -class ConvertToMixedPrecisionPass { - using BlockID = size_t; - - public: - explicit ConvertToMixedPrecisionPass( - const std::string& model_file, - const std::string& params_file, - const std::string& mixed_model_file, - const std::string& mixed_params_file, - phi::DataType mixed_precision, - phi::Backend backend, - bool keep_io_types, - const std::unordered_set& black_list) - : model_file_(model_file), - params_file_(params_file), - mixed_model_file_(mixed_model_file), - mixed_params_file_(mixed_params_file), - mixed_precision_(mixed_precision), - backend_(backend), - keep_io_types_(keep_io_types), - black_list_(black_list), - place_(paddle::CPUPlace()), - executor_(place_) { - VLOG(4) << "black_list has "; - for (auto& name : black_list_) { - VLOG(4) << " - " << name; - } - } - - void Run(); - - private: - void LoadAndPrepare(); - inline bool VarNodeHasDtype(framework::ir::Node* node); - void 
ConvertAllFp64ToFp32(framework::ir::Graph* graph); - void FixCastAttr(framework::ir::Graph* graph); - void SaveMixedModel(); - void ConvertTensorDtype(BlockID block_idx); - void ProcessInputNode(bool support_precision, - framework::ir::Node* in_node, - framework::ir::Node* op_node, - int* suffix, - framework::BlockDesc* block_desc, - VarType::Type to_type, - BlockID block_idx); - - void ProcessOutputNode(BlockID block_idx, - framework::ir::Node* var_node, - VarType::Type to_type); - inline bool IsFloatVarType(VarType::Type type); - - bool OutShouldNotConvert(framework::ir::Node* var_node); - // Just process special cases for weights conversion. - bool WeightsShouldNotConvert(framework::ir::Node* var_node); - - // Return Node* which first appers in block. - framework::ir::Node* GetRealVarNode(framework::ir::Node* node); - - // Fallback to fp32 dtype when encounter circle (Not a DAG graph). - void ProcessCircleCases(); - - private: - std::string model_file_; - std::string params_file_; - std::string mixed_model_file_; - std::string mixed_params_file_; - phi::DataType mixed_precision_; - phi::Backend backend_; - bool keep_io_types_; - std::unordered_set black_list_; - paddle::CPUPlace place_; - framework::Executor executor_; - framework::Scope scope_; - - std::unordered_map name2node_; - std::unordered_map cast_map_; - int suffix_{0}; - - std::set var_names_in_circles_; - - std::unique_ptr program_desc_{nullptr}; - std::unique_ptr main_graph_{nullptr}; - std::vector graphes_; -}; - -framework::ir::Node* ConvertToMixedPrecisionPass::GetRealVarNode( - framework::ir::Node* var_node) { - CHECK_EQ(var_node->IsVar(), true); - if (name2node_.count(var_node->Name())) return name2node_[var_node->Name()]; - return var_node; -} - -inline bool ConvertToMixedPrecisionPass::VarNodeHasDtype( - framework::ir::Node* var_node) { - CHECK_EQ(var_node->IsVar(), true); - auto type = var_node->Var()->GetType(); - return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || - (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || - (type == VarType::VOCAB); -} - -void ConvertToMixedPrecisionPass::ProcessInputNode( - bool support_precision, - framework::ir::Node* in_node, - framework::ir::Node* op_node, - int* suffix, - framework::BlockDesc* block_desc, - VarType::Type to_type, - BlockID block_idx) { - if (!in_node->IsVar()) return; - auto* real_node = GetRealVarNode(in_node); - if (!VarNodeHasDtype(real_node)) return; - auto* graph = graphes_[block_idx]; - auto* in_var = real_node->Var(); - auto in_var_type = in_var->GetDataType(); - auto prev_type = in_var_type; - - if (support_precision) { - if (in_var->Persistable() && in_var_type == VarType::FP32) { - if (WeightsShouldNotConvert(in_node)) return; - in_var->SetDataType(to_type); - in_var_type = to_type; - VLOG(3) << " in_node name " << in_var->Name() << " from " << prev_type - << " to " << to_type; - } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - &cast_map_); - VLOG(3) << " in_node name " << in_var->Name() << "(" << prev_type - << ") to " << cast_map_[in_node]->Name() << "(" << to_type << ")"; - } - } else { - if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - &cast_map_); - VLOG(3) << " in_node name " << in_var->Name() << "(" << prev_type - << ") to " << 
cast_map_[in_node]->Name() << "(" << to_type << ")"; - } - } -} - -void ConvertToMixedPrecisionPass::ProcessOutputNode( - BlockID block_idx, framework::ir::Node* var_node, VarType::Type to_type) { - if (!var_node->IsVar()) return; - auto* real_node = GetRealVarNode(var_node); - if (!VarNodeHasDtype(real_node)) return; - auto* out_var = real_node->Var(); - auto prev_type = out_var->GetDataType(); - if (out_var->GetDataType() == VarType::FP32) { - if (OutShouldNotConvert(var_node)) return; - out_var->SetDataType(to_type); - } - VLOG(3) << " out_node name " << var_node->Name() << " from dtype " - << prev_type << " to " << out_var->GetDataType(); -} - -// Just process special cases. -bool ConvertToMixedPrecisionPass::OutShouldNotConvert( - framework::ir::Node* var_node) { - auto op_node = var_node->inputs[0]; - auto* op_desc = op_node->Op(); - - // batch_norm's input and output (variance and mean) are the same. - if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Output("MeanOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("VarianceOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedMean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedVariance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } + bool keep_io_types, + const std::unordered_set& black_list) + : model_file_(model_file), + params_file_(params_file), + mixed_model_file_(mixed_model_file), + mixed_params_file_(mixed_params_file), + mixed_precision_(mixed_precision), + backend_(backend), + keep_io_types_(keep_io_types), + black_list_(black_list) { + if (mixed_precision_ != phi::DataType::FLOAT16 && + mixed_precision_ != phi::DataType::BFLOAT16) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported dtype %d, we now only " + "support fp16 and bf16.", + static_cast(mixed_precision_))); } - - return false; -} - -bool ConvertToMixedPrecisionPass::WeightsShouldNotConvert( - framework::ir::Node* var_node) { - auto op_nodes = var_node->outputs; - for (auto* op_node : op_nodes) { - auto* op_desc = op_node->Op(); - // batch_norm op's bias, mean, scale and variance just be float32, so we can - // not convert the dtype. 
- if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Input("Bias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Mean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Scale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Variance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } else if (op_desc->Type() == "fused_multi_transformer") { - auto vecs = op_desc->Input("LnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("LnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } + if (backend_ != phi::Backend::GPU) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported place %d, we now only " + "support gpu.", + static_cast(backend_))); } - - return false; } -inline bool ConvertToMixedPrecisionPass::IsFloatVarType(VarType::Type type) { - return (type == VarType::FP16) || (type == VarType::FP32) || - (type == VarType::BF16); -} +void ConvertToMixedPrecisionPass::LoadModel() { + framework::Executor exe{platform::CPUPlace{}}; -void ConvertToMixedPrecisionPass::LoadAndPrepare() { - program_desc_ = - inference::Load(&executor_, &scope_, model_file_, params_file_); + auto program_desc = inference::Load(&exe, &scope_, model_file_, params_file_); main_graph_ = std::unique_ptr( - new framework::ir::Graph(*program_desc_)); - - for (size_t i = 0; i < main_graph_->SubGraphsSize(); ++i) { - auto* graph = main_graph_->GetSubGraph(i); - graphes_.push_back(graph); - - for (auto* node : graph->Nodes()) { - if (!node->IsVar()) continue; - if (!name2node_.count(node->Name())) { - name2node_[node->Name()] = node; - } - } - } - - ProcessCircleCases(); -} - -// Find var names which in circles. -void ConvertToMixedPrecisionPass::ProcessCircleCases() { - std::vector vars_in_circles; - for (size_t idx = 0; idx < program_desc_->Size(); ++idx) { - for (auto* op : program_desc_->Block(idx).AllOps()) { - // TODO(inference): batch_norm has circle, but we need to fuse it in conv - // op. 
- if (op->Type() == "batch_norm") continue; - const auto& in_names = op->InputArgumentNames(); - const auto& out_names = op->OutputArgumentNames(); - std::set in_names_set(in_names.begin(), in_names.end()); - std::set out_names_set(out_names.begin(), out_names.end()); - std::set_intersection(in_names_set.begin(), - in_names_set.end(), - out_names_set.begin(), - out_names_set.end(), - std::back_inserter(vars_in_circles)); - } - } - - for (auto& name : vars_in_circles) { - var_names_in_circles_.insert(name); - } - for (auto& name : var_names_in_circles_) { - LOG(INFO) << name - << " in circles, so we will skip process those vars and ops."; - } -} - -inline void ProcessConstantOpAttr(framework::ir::Node* op_node, - VarType::Type from_type, - VarType::Type to_type) { - if (!op_node->IsOp()) return; - auto op_type = op_node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") return; - - if (op_type == "fill_constant") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "assign_value") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "eye") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "fill_any_like") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "cast") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("in_dtype", static_cast(to_type)); - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("out_dtype", static_cast(to_type)); - } -} - -void ConvertToMixedPrecisionPass::ConvertAllFp64ToFp32( - framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - ProcessConstantOpAttr(op_node, VarType::FP64, VarType::FP32); - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - auto* in_var = in_node->Var(); - if (!in_var->Persistable() && in_var->GetDataType() == VarType::FP64) { - in_var->SetDataType(VarType::FP32); - } - } - } + new framework::ir::Graph(*program_desc)); + main_graph_->SetNotOwned(framework::ir::kParamScopeAttr, &scope_); } void ConvertToMixedPrecisionPass::Run() { - LoadAndPrepare(); + LoadModel(); - for (size_t i = 0; i < graphes_.size(); ++i) { - auto* graph = graphes_[i]; - VLOG(2) << " -------- handle subgraph " << i << ", has " - << graph->Nodes().size() << " nodes --------"; + framework::ir::FloatToHalfPass pass; + pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); + pass.Set("mixed_black_list", + new std::unordered_set{black_list_}); + pass.Set("enable_gpu_half", new bool{true}); + pass.Set("keep_io_types", new bool{keep_io_types_}); - ConvertAllFp64ToFp32(graph); - ConvertTensorDtype(i); - FixCastAttr(graph); - - CHECK_EQ(framework::ir::VarDescIsConsistency(*graph), true); - } + main_graph_.reset(pass.Apply(main_graph_.release())); SaveMixedModel(); } -void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { - auto* graph = graphes_[block_idx]; - 
VarType::Type to_type; - if (mixed_precision_ == phi::DataType::FLOAT16) { - to_type = VarType::FP16; - } else if (mixed_precision_ == phi::DataType::BFLOAT16) { - to_type = VarType::BF16; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "mixed_precision currently not supported dtype %d, we now only " - "support fp16 and bf16.", - static_cast(mixed_precision_))); - } - - auto op_nodes = framework::ir::TopologySortOperations(*graph); - auto* block_desc = op_nodes[0]->Op()->Block(); - int num_low_precision = 0; - std::vector output_nodes; - - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - VLOG(3) << "-------------------- op_type " << op_type << ", phi_type " - << phi::TransToPhiKernelName(op_type); - // 1. set input dtype. - if (op_type == "feed") { - auto feed_var = op_node->outputs[0]->Var(); - if (!keep_io_types_ && feed_var->GetDataType() == VarType::FP32) { - feed_var->SetDataType(to_type); - } - } else if (op_type == "fetch") { - auto* fetch_var = op_node->inputs[0]; - output_nodes.push_back(fetch_var); - continue; - } else if (op_type == "cast") { - continue; - } - - // We can not add cast operator before ops who have sub_block, as in - // sub_block we may get a var which may be transformer by cast op. - else if (op_node->Op()->HasAttr("sub_block")) { // NOLINT - continue; - } - - // 2. if op support fp16/bf16 and not in blacklist. - // - cast weight to fp16/bf16. - // - add cast op if the input dtype is not fp16/bf16. - // - set output dtype. - else if (black_list_.count(op_type) == 0) { // NOLINT - bool support_precision = - OpSupportPrecision(op_type, backend_, mixed_precision_, black_list_); - - // If op's output in circle, we should not convert to fp16. - for (auto* out_node : op_node->outputs) { - if (var_names_in_circles_.count(out_node->Name())) { - support_precision = false; - VLOG(2) << " op's output " << out_node->Name() - << " is in circle, we can not support this case, just skip."; - break; - } - } - - // If the op has no input or output of float type, we will not choose the - // low precision kernel. 
- if (support_precision) { - bool has_float_in_out{false}; - for (auto* in_node : op_node->inputs) { - if (!in_node->IsVar()) continue; - if (in_node->Var()->GetType() != VarType::LOD_TENSOR) { - support_precision = false; - VLOG(2) << " op has tensor array input[" << in_node->Name() - << "], just skip."; - break; - } - auto* real_node = GetRealVarNode(in_node); - if (real_node->Var()->GetDataType() == VarType::FP16 || - real_node->Var()->GetDataType() == VarType::FP32 || - real_node->Var()->GetDataType() == VarType::FP64 || - real_node->Var()->GetDataType() == VarType::BF16) { - has_float_in_out = true; - break; - } - } - for (auto* out_node : op_node->outputs) { - if (!out_node->IsVar()) continue; - auto* real_node = GetRealVarNode(out_node); - if (real_node->Var()->GetDataType() == VarType::FP16 || - real_node->Var()->GetDataType() == VarType::FP32 || - real_node->Var()->GetDataType() == VarType::FP64 || - real_node->Var()->GetDataType() == VarType::BF16) { - has_float_in_out = true; - break; - } - } - - if (!has_float_in_out) { - support_precision = false; - VLOG(2) << " op doesn't has float input and output, just skip."; - } - } - - VLOG(2) << "op type: " << op_type - << " support low precision: " << support_precision; - - if (support_precision) { - ProcessConstantOpAttr(op_node, VarType::FP32, to_type); - VLOG(2) << " process input nodes:"; - ++num_low_precision; - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - ProcessInputNode( - true, in_node, op_node, &suffix_, block_desc, to_type, block_idx); - } - - VLOG(2) << " process output nodes:"; - auto outputs = op_node->outputs; - for (auto* out_node : outputs) { - ProcessOutputNode(block_idx, out_node, to_type); - } - } else { - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - ProcessInputNode(false, - in_node, - op_node, - &suffix_, - block_desc, - VarType::FP32, - block_idx); - } - } - } - - // 3. check op not support fp16/bf16 or in blacklist. - // - add cast op if the input dtype is not fp32. - else { // NOLINT - VLOG(3) << "not to run fp16 op_type: " << op_type << ", node input size " - << op_node->inputs.size(); - auto in_nodes = op_node->inputs; - for (auto* in_node : in_nodes) { - auto* in_var = in_node->Var(); - if (in_var->GetDataType() == to_type) { - AddCastOp(graph, - in_node, - op_node, - to_type, - VarType::FP32, - &suffix_, - block_desc, - &cast_map_); - VLOG(3) << "-- " << in_node->Name() << "(" << to_type << ") to " - << cast_map_[in_node]->Name() << "(" << VarType::FP32 << ")"; - } - } - } - } - - // 4. if output_op's dtype is not compatible to output dtype, then just - // insert cast. - for (auto* node : output_nodes) { - framework::ir::Node* fetch_op{nullptr}; - for (auto* op_node : node->outputs) { - if (op_node->IsOp() && op_node->Op()->Type() == "fetch") { - fetch_op = op_node; - } - } - CHECK_NOTNULL(fetch_op); - auto* var = node->Var(); - if (keep_io_types_ && var->GetDataType() == to_type) { - // fp16/bf16 -> fp32. - AddCastOp(graph, - node, - fetch_op, - to_type, - VarType::FP32, - &suffix_, - block_desc, - &cast_map_); - } else if (!keep_io_types_ && var->GetDataType() == VarType::FP32) { - // fp32 -> fp16/bf16 - AddCastOp(graph, - node, - fetch_op, - VarType::FP32, - to_type, - &suffix_, - block_desc, - &cast_map_); - } - } - - if (num_low_precision) - LOG(INFO) << "--- detected " << num_low_precision - << " low precision ops in " << block_idx << " subgraph"; -} - -// We modify op's input output precision, and we need to fix cast op in_dtype -// and out_dtype attribute. 
-// TODO(inference): we need a cast elimination pass. -void ConvertToMixedPrecisionPass::FixCastAttr(framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - if (op_type != "cast") continue; - auto input = op_node->inputs[0]; - auto output = op_node->outputs[0]; - op_node->Op()->SetAttr("in_dtype", - static_cast(input->Var()->GetDataType())); - op_node->Op()->SetAttr("out_dtype", - static_cast(output->Var()->GetDataType())); - } -} - void ConvertToMixedPrecisionPass::SaveMixedModel() { framework::ProgramDesc mixed_program_desc; framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); @@ -677,51 +86,6 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { auto parameters = scope_.LocalVarNames(); std::sort(parameters.begin(), parameters.end()); - std::unordered_set weights_should_be_fp32; - for (auto* node : main_graph_->Nodes()) { - if (!node->IsVar()) continue; - if (VarNodeHasDtype(node)) { - if (node->Var()->Persistable() && - node->Var()->GetDataType() == VarType::FP32) { - VLOG(2) << "weights keep to fp32: " << node->Name() << ", ptr " - << reinterpret_cast(node->Var()); - weights_should_be_fp32.insert(node->Name()); - } - } - } - -#define CONVERT_TENSOR_DTYPE(DTYPE, dtype) \ - mixed_tensor.set_type(DTYPE); \ - auto* mixed_data = mixed_tensor.mutable_data(platform::CPUPlace()); \ - for (int64_t i = 0; i < origin_tensor->numel(); i++) { \ - mixed_data[i] = static_cast(origin_data[i]); \ - } \ - origin_tensor->clear(); \ - paddle::framework::TensorCopySync( \ - mixed_tensor, platform::CPUPlace(), origin_tensor) - - for (const auto& param_name : parameters) { - if (weights_should_be_fp32.count(param_name)) continue; - auto* var = scope_.FindLocalVar(param_name); - if (var->IsType()) { - auto* origin_tensor = var->GetMutable(); - if (origin_tensor->dtype() != phi::DataType::FLOAT32) continue; - phi::DenseTensor mixed_tensor; - mixed_tensor.Resize(origin_tensor->dims()); - auto* origin_data = - origin_tensor->mutable_data(platform::CPUPlace()); - if (mixed_precision_ == phi::DataType::FLOAT16) { - CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::FLOAT16, - phi::dtype::float16); - } else if (mixed_precision_ == phi::DataType::BFLOAT16) { - CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::BFLOAT16, - phi::dtype::bfloat16); - } - } - } - -#undef CONVERT_TENSOR_DTYPE - auto SerializeParams = [&]() -> std::string { std::ostringstream os; phi::CPUContext ctx; @@ -746,73 +110,32 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { mixed_program_desc.Proto()->SerializeAsString()); StrToBinary(mixed_params_file_, SerializeParams()); } -} // namespace - -void AddCastOp( - framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, - VarType::Type from_type, - VarType::Type to_type, - int* suffix, - framework::BlockDesc* block_desc, - std::unordered_map* map) { - auto update_cast_desc = [&](framework::OpDesc& desc, - const std::string& x_name, - const std::string& out_name, - const int in_dtype, - const int out_dtype) { - desc.SetType("cast"); - desc.SetInput("X", {x_name}); - desc.SetOutput("Out", {out_name}); - desc.SetAttr("in_dtype", in_dtype); - desc.SetAttr("out_dtype", out_dtype); - desc.SetAttr("use_mkldnn", false); - desc.SetAttr("with_quant_attr", false); - desc.Flush(); - }; - - if (map->count(node) == 0) { - // insert cast op before node. 
- std::string cast_input_name = node->Var()->Name(); - std::string cast_output_name = - node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++); - CHECK_NOTNULL(block_desc); - framework::OpDesc cast_op_desc(block_desc); - update_cast_desc(cast_op_desc, - cast_input_name, - cast_output_name, - static_cast(from_type), - static_cast(to_type)); - auto* cast_op_node = graph->CreateOpNode(&cast_op_desc); - auto* cast_output_vardesc = block_desc->Var(cast_output_name); - cast_output_vardesc->SetPersistable(false); - cast_output_vardesc->SetDataType(to_type); - cast_output_vardesc->SetShape(node->Var()->GetShape()); - auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc); - IR_NODE_LINK_TO(cast_op_node, cast_output_node); - (*map)[node] = cast_output_node; - } - next_op->Op()->Rename(node->Name(), map->at(node)->Name()); - IR_NODE_LINK_TO(node, map->at(node)->inputs[0]); - IR_NODE_UNLINK(node, next_op); - IR_NODE_LINK_TO(map->at(node), next_op); -} bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool support_precision = false; - if (blacklist.count(op_type) == 0) { - if (backend == phi::Backend::GPU) - support_precision = GpuKernelSupportPrecision(op_type, precision); - else - support_precision = - PhiKernelSupportPrecision(phi_op_type, backend, precision); - } - return support_precision; + const std::unordered_set& black_list) { + return framework::ir::OpSupportPrecision( + op_type, backend, precision, black_list); +} + +void InsertCastOp( + framework::ir::Graph* graph, + framework::ir::Node* var_node, + framework::ir::Node* op_node, + framework::proto::VarType::Type from_type, + framework::proto::VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* visited) { + framework::ir::DoInsertCastOp(graph, + var_node, + op_node, + from_type, + to_type, + block_desc, + suffix, + visited); } void ConvertToMixedPrecision( diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index 583512408c5869..c3ae63aeeca261 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -15,14 +15,13 @@ #pragma once #include -#include #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -30,20 +29,52 @@ namespace paddle { namespace inference { namespace analysis { +class ConvertToMixedPrecisionPass { + public: + explicit ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, + phi::Backend backend, + bool keep_io_types, + const std::unordered_set& black_list); + + void Run(); + + private: + void LoadModel(); + void SaveMixedModel(); + + private: + std::string model_file_; + std::string params_file_; + std::string mixed_model_file_; + std::string mixed_params_file_; + phi::DataType 
mixed_precision_; + phi::Backend backend_; + bool keep_io_types_; + std::unordered_set black_list_; + + framework::Scope scope_; + std::unique_ptr main_graph_{nullptr}; +}; + bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist); + const std::unordered_set& black_list); -void AddCastOp( +void InsertCastOp( framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, + framework::ir::Node* var_node, + framework::ir::Node* op_node, framework::proto::VarType::Type from_type, framework::proto::VarType::Type to_type, - int* suffix, framework::BlockDesc* block_desc, - std::unordered_map* map); + int* suffix, + std::unordered_map* visited); void ConvertToMixedPrecision(const std::string& model_file, const std::string& params_file, From 7e062dd03c0b1dfff2c86fbc19f577ee0abe52bf Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 7 Dec 2022 18:04:31 +0000 Subject: [PATCH 2/5] fix keep_io_types --- paddle/fluid/framework/ir/float_to_half_pass.cc | 8 +++++++- .../analysis/passes/convert_to_mixed_precision.cc | 3 ++- .../analysis/passes/convert_to_mixed_precision.h | 1 - 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/float_to_half_pass.cc index 9f2098683a95a9..5a20b913ce2069 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/float_to_half_pass.cc @@ -565,7 +565,11 @@ bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, void FloatToHalfPass::SetVarPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type())) { + if (op_run_half_.count(op_node->Op()->Type()) == 0) { + continue; + } + + if (GetOpOriginalType(op_node->Op()->Type()) != "feed") { for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); @@ -582,7 +586,9 @@ void FloatToHalfPass::SetVarPrecision() const { vars_convert_to_half_.insert(in_var_name); } } + } + if (GetOpOriginalType(op_node->Op()->Type()) != "fetch") { for (auto* out_var_node : op_node->outputs) { CHECK_EQ(out_var_node->IsVar(), true); diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index e3740ff4e96e56..906c745762dfda 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/float_to_half_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/inference/io.h" @@ -74,7 +75,7 @@ void ConvertToMixedPrecisionPass::Run() { pass.Set("enable_gpu_half", new bool{true}); pass.Set("keep_io_types", new bool{keep_io_types_}); - main_graph_.reset(pass.Apply(main_graph_.release())); + pass.Apply(main_graph_.get()); SaveMixedModel(); } diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index c3ae63aeeca261..3a1e5fbb30a21d 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -18,7 +18,6 @@ #include #include "paddle/fluid/framework/block_desc.h" 
-#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" From a9ff7a88a2b3306c9c4d27fcf27c8767a7c8b8d4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 9 Dec 2022 12:04:43 +0000 Subject: [PATCH 3/5] update --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- ...f_pass.cc => auto_mixed_precision_pass.cc} | 260 +++++++++--------- ...alf_pass.h => auto_mixed_precision_pass.h} | 20 +- .../ir/conv2d_fusion_layout_transfer_pass.cc | 2 +- .../ir/conv_elementwise_add_act_fuse_pass.cc | 2 +- paddle/fluid/inference/analysis/argument.h | 2 +- .../inference/analysis/ir_pass_manager.cc | 7 +- .../inference/analysis/passes/CMakeLists.txt | 2 +- .../passes/convert_to_mixed_precision.cc | 6 +- paddle/fluid/inference/api/analysis_config.cc | 8 +- .../fluid/inference/api/analysis_predictor.cc | 8 +- .../inference/api/paddle_analysis_config.h | 2 +- .../inference/api/paddle_pass_builder.cc | 2 +- 13 files changed, 159 insertions(+), 164 deletions(-) rename paddle/fluid/framework/ir/{float_to_half_pass.cc => auto_mixed_precision_pass.cc} (75%) rename paddle/fluid/framework/ir/{float_to_half_pass.h => auto_mixed_precision_pass.h} (84%) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f65db53893038c..c3685ea1d3669b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -103,7 +103,7 @@ pass_library(delete_c_identity_op_pass inference) pass_library(preln_residual_bias_fuse_pass inference) pass_library(delete_fill_constant_op_pass inference) pass_library(constant_folding_pass inference) -pass_library(float_to_half_pass inference) +pass_library(auto_mixed_precision_pass inference) pass_library(conv2d_fusion_layout_transfer_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc similarity index 75% rename from paddle/fluid/framework/ir/float_to_half_pass.cc rename to paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index a4b35267b5fe1a..48b1de077bf200 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/float_to_half_pass.h" +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" @@ -29,7 +29,7 @@ namespace ir { namespace { -using VarType = FloatToHalfPass::VarType; +using VarType = AutoMixedPrecisionPass::VarType; bool PhiKernelSupportPrecision( const std::string& op_type, @@ -148,6 +148,9 @@ bool OpSupportPrecision(const std::string& op_type, if (black_list.count(op_type) == 0) { if (backend == phi::Backend::GPU) { support = GpuKernelSupportPrecision(op_type, precision); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Now, only support backend of GPU.")); } } return support; @@ -156,7 +159,7 @@ bool OpSupportPrecision(const std::string& op_type, // The set of ops that support fp16 calculation and are considered // numerically-dangerous, slower and whose effects may also be observed in // downstream ops. 
-void FloatToHalfPass::SetDefaultBlacklist() const { +void AutoMixedPrecisionPass::SetDefaultBlacklist() const { black_list_.insert({ // numerically-dangerous "acos", @@ -188,13 +191,16 @@ void FloatToHalfPass::SetDefaultBlacklist() const { }); } -void FloatToHalfPass::Init(Graph* graph) const { - keep_io_types_ = true; - if (Has("keep_io_types")) { - keep_io_types_ = Get("keep_io_types"); +void AutoMixedPrecisionPass::Init(Graph* graph) const { + bool enable_gpu_mixed = Get("enable_gpu_mixed"); + if (enable_gpu_mixed) { + backend_ = phi::Backend::GPU; } - half_precision_ = - static_cast(Get("mixed_precision_mode")); + + skip_pass_ = !enable_gpu_mixed; + + low_precision_ = static_cast(Get("low_precision_mode")); + black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); VLOG(4) << "black_list has "; @@ -202,6 +208,11 @@ void FloatToHalfPass::Init(Graph* graph) const { VLOG(4) << " - " << name; } + keep_io_types_ = true; + if (Has("keep_io_types")) { + keep_io_types_ = Get("keep_io_types"); + } + auto graph_size = graph->SubGraphsSize(); VLOG(4) << "graph size: " << graph_size; subgraphes_.resize(graph_size); @@ -224,24 +235,27 @@ void FloatToHalfPass::Init(Graph* graph) const { } } -void FloatToHalfPass::ApplyImpl(Graph* graph) const { - auto enable_gpu_half = Get("enable_gpu_half"); - if (!enable_gpu_half) return; - - PADDLE_ENFORCE_NOT_NULL( - graph, - platform::errors::PreconditionNotMet( - "During the float to half pass, the graph should not be nullptr.")); - PADDLE_ENFORCE_EQ( - graph->IsMainGraph(), - true, - platform::errors::PreconditionNotMet( - "During the float to half pass, the graph should be main graph.")); +void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the graph " + "should not be nullptr.")); + PADDLE_ENFORCE_EQ(graph->IsMainGraph(), + true, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the graph " + "should be main graph.")); - FusePassBase::Init("float_to_half", graph); + FusePassBase::Init("auto_mixed_precision", graph); Init(graph); VLOG(4) << "Init done"; + + if (skip_pass_) { + VLOG(3) << "Skip apply auto_low_precision_pass."; + return; + } + SetOpUniqueType(); VLOG(4) << "SetOpUniqueType done"; GetOpPrecision(); @@ -260,7 +274,7 @@ void FloatToHalfPass::ApplyImpl(Graph* graph) const { VLOG(4) << "RestoreOpOriginType done"; } -void FloatToHalfPass::SetOpUniqueType() const { +void AutoMixedPrecisionPass::SetOpUniqueType() const { int suffix = 0; for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { @@ -277,7 +291,7 @@ void FloatToHalfPass::SetOpUniqueType() const { } } -void FloatToHalfPass::RestoreOpOriginType() const { +void AutoMixedPrecisionPass::RestoreOpOriginType() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = op_node->Op()->Type(); @@ -289,7 +303,7 @@ void FloatToHalfPass::RestoreOpOriginType() const { } } -inline std::string FloatToHalfPass::GetOpOriginalType( +inline std::string AutoMixedPrecisionPass::GetOpOriginalType( const std::string& op_type) const { if (op_original_type_.count(op_type)) { return op_original_type_.at(op_type); @@ -297,22 +311,21 @@ inline std::string FloatToHalfPass::GetOpOriginalType( return op_type; } -void FloatToHalfPass::ProcessOpWithDtypeAttr() const { +void AutoMixedPrecisionPass::ProcessOpWithDtypeAttr() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = 
op_node->Op()->Type(); - if (op_run_half_.count(op_type) == 0) continue; + if (op_run_low_precision_.count(op_type) == 0) continue; if (op_node->Op()->HasAttr("dtype")) { auto dtype = op_node->Op()->GetAttrIfExists("dtype"); if (IsFloatType(static_cast(dtype))) { op_node->Op()->SetAttr( "dtype", - static_cast( - framework::TransToProtoVarType(half_precision_))); + static_cast(framework::TransToProtoVarType(low_precision_))); op_node->Op()->Flush(); VLOG(4) << "process op with dtype attr: " << op_type << " ( " << dtype - << " --->" << static_cast(half_precision_) << " )"; + << " --->" << static_cast(low_precision_) << " )"; } } if (op_node->Op()->HasAttr("out_dtype")) { @@ -320,11 +333,10 @@ void FloatToHalfPass::ProcessOpWithDtypeAttr() const { if (IsFloatType(static_cast(out_dtype))) { op_node->Op()->SetAttr( "out_dtype", - static_cast( - framework::TransToProtoVarType(half_precision_))); + static_cast(framework::TransToProtoVarType(low_precision_))); op_node->Op()->Flush(); VLOG(4) << "process op with out_dtype attr: " << op_type << " ( " - << out_dtype << " --->" << static_cast(half_precision_) + << out_dtype << " --->" << static_cast(low_precision_) << " )"; } } @@ -332,39 +344,39 @@ void FloatToHalfPass::ProcessOpWithDtypeAttr() const { } } -void FloatToHalfPass::GetOpPrecision() const { +void AutoMixedPrecisionPass::GetOpPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = op_node->Op()->Type(); - bool support_half = true; + bool support_low_precision = true; if (GetOpOriginalType(op_type) == "feed" || GetOpOriginalType(op_type) == "fetch") { - support_half = !keep_io_types_; + support_low_precision = !keep_io_types_; } else { - support_half = OpSupportPrecision(GetOpOriginalType(op_type), - phi::Backend::GPU, - half_precision_, - black_list_); + support_low_precision = OpSupportPrecision( + GetOpOriginalType(op_type), backend_, low_precision_, black_list_); } if (op_node->Op()->HasAttr("dtype")) { auto dtype = op_node->Op()->GetAttrIfExists("dtype"); - support_half = - support_half && IsFloatType(static_cast(dtype)); + support_low_precision = support_low_precision && + IsFloatType(static_cast(dtype)); } else if (op_node->Op()->HasAttr("out_dtype")) { auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); - support_half = - support_half && IsFloatType(static_cast(out_dtype)); + support_low_precision = + support_low_precision && + IsFloatType(static_cast(out_dtype)); } else { // if op's input var and output var is not dense tensor, the op should - // not run half. + // not run at low precision. 
for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; if (real_in_var_node->Var()->Persistable()) continue; - support_half = support_half && (real_in_var_node->Var()->GetType() == - VarType::LOD_TENSOR); + support_low_precision = + support_low_precision && + (real_in_var_node->Var()->GetType() == VarType::LOD_TENSOR); } for (auto* out_var_node : op_node->outputs) { @@ -372,23 +384,25 @@ void FloatToHalfPass::GetOpPrecision() const { auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; if (real_out_var_node->Var()->Persistable()) continue; - support_half = support_half && (real_out_var_node->Var()->GetType() == - VarType::LOD_TENSOR); + support_low_precision = + support_low_precision && + (real_out_var_node->Var()->GetType() == VarType::LOD_TENSOR); } } - if (support_half) { - op_run_half_.insert(op_type); - VLOG(4) << "support precision: " << op_type << " run at half"; + if (support_low_precision) { + op_run_low_precision_.insert(op_type); + VLOG(4) << "support precision: " << op_type << " run at low precision"; } else { - VLOG(4) << "support precision: " << op_type << " not run at half"; + VLOG(4) << "support precision: " << op_type + << " not run at low precision"; } } } } -void FloatToHalfPass::UpdateOpPrecision() const { - std::unordered_set vars_should_not_half; +void AutoMixedPrecisionPass::UpdateOpPrecision() const { + std::unordered_set vars_should_not_low_precision; // var -> the var's all input op std::unordered_map> var_input_ops; @@ -411,30 +425,16 @@ void FloatToHalfPass::UpdateOpPrecision() const { << " is output of " << op_type; } - // the select_input op's input var should not convert to half. when - // op's output var is select_input op's input var, the op should not run - // half. + // the select_input op's input var should not convert to low precision. + // when op's output var is select_input op's input var, the op should + // not run at low precision. if (GetOpOriginalType(op_node->Op()->Type()) == "select_input") { for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); if (in_var_node->Var()->Persistable()) continue; if (!VarNodeHasDtype(in_var_node)) continue; - vars_should_not_half.insert(in_var_node->Var()->Name()); - } - } - - // when op_1 only support cpu kernel. if op_2's intput var is op_1's - // output var, then op_2 should not run half. 
- if (GetOpOriginalType(op_type) != "feed" && - !GpuKernelSupportPrecision(GetOpOriginalType(op_type), - phi::DataType::FLOAT32)) { - for (auto* out_var_node : op_node->outputs) { - CHECK_EQ(out_var_node->IsVar(), true); - if (out_var_node->Var()->Persistable()) continue; - if (!VarNodeHasDtype(out_var_node)) continue; - - vars_should_not_half.insert(out_var_node->Var()->Name()); + vars_should_not_low_precision.insert(in_var_node->Var()->Name()); } } } @@ -447,25 +447,7 @@ void FloatToHalfPass::UpdateOpPrecision() const { precision_updated = false; for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type()) == 0) continue; - - for (auto* in_var_node : op_node->inputs) { - CHECK_EQ(in_var_node->IsVar(), true); - if (!VarNodeHasDtype(in_var_node)) continue; - - auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; - if (real_in_var_node->Var()->Persistable()) continue; - - if (vars_should_not_half.count(real_in_var_node->Var()->Name())) { - op_run_half_.erase(op_node->Op()->Type()); - precision_updated = true; - VLOG(4) << op_node->Op()->Type() - << " should not support half precision."; - break; - } - } - - if (op_run_half_.count(op_node->Op()->Type()) == 0) continue; + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) continue; for (auto* out_var_node : op_node->outputs) { CHECK_EQ(out_var_node->IsVar(), true); @@ -474,24 +456,25 @@ void FloatToHalfPass::UpdateOpPrecision() const { auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; if (real_out_var_node->Var()->Persistable()) continue; - bool not_run_half = false; + bool not_run_low_precision = false; const auto& input_op_nodes = var_input_ops[real_out_var_node->Var()->Name()]; - if (vars_should_not_half.count(real_out_var_node->Var()->Name())) { - not_run_half = true; + if (vars_should_not_low_precision.count( + real_out_var_node->Var()->Name())) { + not_run_low_precision = true; } else { for (auto* node : input_op_nodes) { - if (op_run_half_.count(node->Op()->Type()) == 0) { - not_run_half = true; + if (op_run_low_precision_.count(node->Op()->Type()) == 0) { + not_run_low_precision = true; break; } } } - if (not_run_half) { - op_run_half_.erase(op_node->Op()->Type()); + if (not_run_low_precision) { + op_run_low_precision_.erase(op_node->Op()->Type()); precision_updated = true; VLOG(4) << op_node->Op()->Type() - << " should not support half precision."; + << " should not run at low precision."; break; } } @@ -501,8 +484,8 @@ void FloatToHalfPass::UpdateOpPrecision() const { } // special ops, its weights should not be low precision. -bool FloatToHalfPass::InputVarsNotConvert(Node* op_node, - const std::string& var_name) const { +bool AutoMixedPrecisionPass::InputVarsNotConvert( + Node* op_node, const std::string& var_name) const { auto* op_desc = op_node->Op(); if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { auto vecs = op_desc->Input("Bias"); @@ -542,8 +525,8 @@ bool FloatToHalfPass::InputVarsNotConvert(Node* op_node, return false; } -bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, - const std::string& var_name) const { +bool AutoMixedPrecisionPass::OutputVarsNotConvert( + Node* op_node, const std::string& var_name) const { auto* op_desc = op_node->Op(); // batch_norm's input and output (variance and mean) are the same. 
if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { @@ -567,10 +550,10 @@ bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, return false; } -void FloatToHalfPass::SetVarPrecision() const { +void AutoMixedPrecisionPass::SetVarPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type()) == 0) { + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) { continue; } @@ -587,8 +570,8 @@ void FloatToHalfPass::SetVarPrecision() const { if (real_in_var_node->Var()->Persistable()) { real_in_var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); - vars_convert_to_half_.insert(in_var_name); + framework::TransToProtoVarType(low_precision_)); + vars_convert_to_low_precision_.insert(in_var_name); } } } @@ -605,9 +588,9 @@ void FloatToHalfPass::SetVarPrecision() const { if (OutputVarsNotConvert(op_node, out_var_name)) continue; real_out_var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); + framework::TransToProtoVarType(low_precision_)); if (real_out_var_node->Var()->Persistable()) { - vars_convert_to_half_.insert(out_var_name); + vars_convert_to_low_precision_.insert(out_var_name); } } } @@ -622,24 +605,24 @@ void FloatToHalfPass::SetVarPrecision() const { if (!VarNodeHasDtype(var_node)) continue; auto var_name = var_node->Var()->Name(); - if (vars_convert_to_half_.count(var_name)) { + if (vars_convert_to_low_precision_.count(var_name)) { var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); + framework::TransToProtoVarType(low_precision_)); } } } } -void FloatToHalfPass::ConvertWeightsData() const { +void AutoMixedPrecisionPass::ConvertWeightsData() const { auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, - platform::errors::PreconditionNotMet( - "During the float to half pass, the scope should not be null.")); + PADDLE_ENFORCE_NOT_NULL(scope, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the scope " + "should not be null.")); auto var_names = scope->LocalVarNames(); for (const auto& var_name : var_names) { - if (vars_convert_to_half_.count(var_name)) { + if (vars_convert_to_low_precision_.count(var_name)) { VLOG(4) << var_name << "'s data type was convert to half"; auto* var = scope->FindLocalVar(var_name); @@ -647,25 +630,29 @@ void FloatToHalfPass::ConvertWeightsData() const { auto* origin_tensor = var->GetMutable(); - phi::DenseTensor half_tensor; - half_tensor.Resize(origin_tensor->dims()); - half_tensor.set_type(half_precision_); + phi::DenseTensor low_precision_tensor; + low_precision_tensor.Resize(origin_tensor->dims()); + low_precision_tensor.set_type(low_precision_); - if (half_precision_ == phi::DataType::FLOAT16) { - auto* half_data = - half_tensor.mutable_data(phi::CPUPlace{}); + if (low_precision_ == phi::DataType::FLOAT16) { + auto* low_precision_data = + low_precision_tensor.mutable_data( + phi::CPUPlace{}); for (int64_t i = 0; i < origin_tensor->numel(); i++) { if (origin_tensor->dtype() == phi::DataType::FLOAT64) { auto* origin_data = origin_tensor->data(); - half_data[i] = static_cast(origin_data[i]); + low_precision_data[i] = + static_cast(origin_data[i]); } else if (origin_tensor->dtype() == phi::DataType::FLOAT32) { auto* origin_data = origin_tensor->data(); - half_data[i] = static_cast(origin_data[i]); + low_precision_data[i] = + static_cast(origin_data[i]); } } - } else if (half_precision_ == phi::DataType::BFLOAT16) { + } else if (low_precision_ == 
phi::DataType::BFLOAT16) { auto* half_data = - half_tensor.mutable_data(phi::CPUPlace{}); + low_precision_tensor.mutable_data( + phi::CPUPlace{}); for (int64_t i = 0; i < origin_tensor->numel(); i++) { if (origin_tensor->dtype() == phi::DataType::FLOAT64) { auto* origin_data = origin_tensor->data(); @@ -678,12 +665,12 @@ void FloatToHalfPass::ConvertWeightsData() const { } origin_tensor->clear(); paddle::framework::TensorCopySync( - half_tensor, phi::CPUPlace{}, origin_tensor); + low_precision_tensor, phi::CPUPlace{}, origin_tensor); } } } -void FloatToHalfPass::InsertCastOp() const { +void AutoMixedPrecisionPass::InsertCastOp() const { int suffix = 0; std::unordered_map cache; @@ -697,7 +684,7 @@ void FloatToHalfPass::InsertCastOp() const { if (op_node->Op()->HasAttr("sub_block")) continue; VLOG(4) << "process op: " << op_type - << " run half: " << op_run_half_.count(op_type); + << " run low precision: " << op_run_low_precision_.count(op_type); auto inputs = op_node->inputs; for (auto* in_var_node : inputs) { @@ -712,17 +699,17 @@ void FloatToHalfPass::InsertCastOp() const { VLOG(4) << "process var: " << real_in_var_node->Var()->Name() << " with type " << in_var_type; - if (IsFloatType(in_var_type) && op_run_half_.count(op_type)) { + if (IsFloatType(in_var_type) && op_run_low_precision_.count(op_type)) { DoInsertCastOp(subgraphes_[i], in_var_node, op_node, in_var_type, - framework::TransToProtoVarType(half_precision_), + framework::TransToProtoVarType(low_precision_), block_desc, &suffix, &cache); } else if (IsHalfType(in_var_type) && - op_run_half_.count(op_type) == 0) { + op_run_low_precision_.count(op_type) == 0) { DoInsertCastOp(subgraphes_[i], in_var_node, op_node, @@ -754,4 +741,5 @@ void FloatToHalfPass::InsertCastOp() const { } // namespace framework } // namespace paddle -REGISTER_PASS(float_to_half_pass, paddle::framework::ir::FloatToHalfPass); +REGISTER_PASS(auto_low_precision_pass, + paddle::framework::ir::AutoMixedPrecisionPass); diff --git a/paddle/fluid/framework/ir/float_to_half_pass.h b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h similarity index 84% rename from paddle/fluid/framework/ir/float_to_half_pass.h rename to paddle/fluid/framework/ir/auto_mixed_precision_pass.h index c15755896c32b1..578d47282b76d4 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.h +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h @@ -27,13 +27,13 @@ namespace paddle { namespace framework { namespace ir { -class FloatToHalfPass : public FusePassBase { +class AutoMixedPrecisionPass : public FusePassBase { public: using VarType = framework::proto::VarType; public: - FloatToHalfPass() = default; - ~FloatToHalfPass() = default; + AutoMixedPrecisionPass() = default; + ~AutoMixedPrecisionPass() = default; protected: void ApplyImpl(Graph* graph) const override; @@ -66,9 +66,13 @@ class FloatToHalfPass : public FusePassBase { void ConvertWeightsData() const; private: - mutable bool keep_io_types_; + mutable bool skip_pass_{false}; + + mutable bool keep_io_types_{false}; // float16 or bfloat16 now - mutable phi::DataType half_precision_; + mutable phi::DataType low_precision_{phi::DataType::FLOAT16}; + + mutable phi::Backend backend_{phi::Backend::GPU}; mutable std::unordered_set black_list_; @@ -80,10 +84,10 @@ class FloatToHalfPass : public FusePassBase { mutable std::vector> all_op_nodes_; // op's unique type -> the op's origin type mutable std::unordered_map op_original_type_; - // op's unique type -> whether the op run at half precision - mutable std::unordered_set 
op_run_half_; + // op's unique type -> whether the op run at low precision + mutable std::unordered_set op_run_low_precision_; - mutable std::unordered_set vars_convert_to_half_; + mutable std::unordered_set vars_convert_to_low_precision_; }; bool OpSupportPrecision(const std::string& op_type, diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc index dbba001d521015..efed7dd6e637bc 100644 --- a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc @@ -142,7 +142,7 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const { bool is_fp16_precision = static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || - Get("enable_gpu_half"); + Get("enable_gpu_mixed"); bool cutlass_enable = false; #ifdef PADDLE_WITH_CUTLASS diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index 063eb90d90af17..2f527ff1e707bb 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -165,7 +165,7 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { bool is_fp16_precision = static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || - Get("enable_gpu_half"); + Get("enable_gpu_mixed"); constexpr int CUTLASS_NHWC_ALIGNMENT = 8; if (is_fp16_precision) { #ifdef PADDLE_WITH_CUTLASS diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a72c1fe7622136..fea343f69e7e5e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -365,7 +365,7 @@ struct Argument { DECL_ARGUMENT_FIELD(mixed_black_list, MixedBlackList, std::unordered_set); - DECL_ARGUMENT_FIELD(enable_gpu_half, EnableGPUHalf, bool); + DECL_ARGUMENT_FIELD(enable_gpu_mixed, EnableGPUMixed, bool); DECL_ARGUMENT_FIELD(mixed_precision_mode, MixedPrecisionMode, int); // cinn compiler related diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index da393502ea1625..b4c24cf9b8bd77 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -296,13 +296,16 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); - } else if (pass_name == "float_to_half_pass") { + } else if (pass_name == "auto_mixed_precision_pass") { pass->Set( "mixed_black_list", new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); pass->Set("mixed_precision_mode", new int(argument->mixed_precision_mode())); + } else if (pass_name == "conv_elementwise_add_act_fuse_pass" || + pass_name == "conv2d_fusion_layout_transfer_pass") { + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index fa074f962eb3d4..96121601cb6fdb 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,7 +13,7 
@@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass ir_graph_build_pass) + DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 906c745762dfda..f1939fc8b328b8 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/ir/float_to_half_pass.h" +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/inference/io.h" #include "paddle/phi/common/backend.h" @@ -68,11 +68,11 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); - framework::ir::FloatToHalfPass pass; + framework::ir::AutoMixedPrecisionPass pass; pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); pass.Set("mixed_black_list", new std::unordered_set{black_list_}); - pass.Set("enable_gpu_half", new bool{true}); + pass.Set("enable_gpu_mixed", new bool{true}); pass.Set("keep_io_types", new bool{keep_io_types_}); pass.Apply(main_graph_.get()); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index c7554de9df73b2..b4d39e687203e3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -99,7 +99,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, // default } else if (precision_mode == Precision::kHalf || precision_mode == Precision::kBf16) { - enable_gpu_half_ = true; + enable_gpu_mixed_ = true; } else { LOG(ERROR) << "The Paddle-GPU inference currently only supports " @@ -396,7 +396,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // Mixed precision related. CP_MEMBER(mixed_black_list_); - CP_MEMBER(enable_gpu_half_); + CP_MEMBER(enable_gpu_mixed_); CP_MEMBER(mixed_precision_mode_); CP_MEMBER(enable_memory_optim_); @@ -1022,7 +1022,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; - ss << enable_gpu_half_; + ss << enable_gpu_mixed_; ss << use_external_stream_; ss << exec_stream_; ss << use_fc_padding_; @@ -1239,7 +1239,7 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"use_gpu", use_gpu_ ? 
"true" : "false"}); if (use_gpu_) { os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)}); - os.InsertRow({"enable_gpu_half_", std::to_string(enable_gpu_half_)}); + os.InsertRow({"enable_gpu_mixed_", std::to_string(enable_gpu_mixed_)}); os.InsertRow({"memory_pool_init_size", std::to_string(memory_pool_init_size_mb_) + "MB"}); os.InsertRow( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index af4d83f55a6ee2..48cef9e95d75c8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1268,10 +1268,10 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.ir_optim()) { argument_.SetEnableIrOptim(false); - if (config_.enable_gpu_half_) { + if (config_.enable_gpu_mixed_) { argument_.SetEnableIrOptim(true); pass_builder->ClearPasses(); - pass_builder->AppendPass("float_to_half_pass"); + pass_builder->AppendPass("auto_mixed_precision_pass"); LOG(INFO) << "This model run in Paddle-GPU mixed precision mode with no ir " "optimization."; @@ -1282,7 +1282,7 @@ void AnalysisPredictor::PrepareArgument() { if (config_.ir_debug_) { pass_builder->TurnOnDebug(); } - if (config_.enable_gpu_half_) { + if (config_.enable_gpu_mixed_) { LOG(INFO) << "This model run in Paddle-GPU mixed precision mode."; } } @@ -1294,7 +1294,7 @@ void AnalysisPredictor::PrepareArgument() { // mixed precison. argument_.SetModelPrecision(static_cast(model_precision_)); argument_.SetMixedBlackList(config_.mixed_black_list_); - argument_.SetEnableGPUHalf(config_.enable_gpu_half_); + argument_.SetEnableGPUMixed(config_.enable_gpu_mixed_); argument_.SetMixedPrecisionMode(static_cast( paddle::ConvertPrecision(config_.mixed_precision_mode_))); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b4c5a0d293574d..41eea1fb98c319 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1049,7 +1049,7 @@ struct PD_INFER_DECL AnalysisConfig { bool use_gpu_{false}; int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. 
- bool enable_gpu_half_{false}; + bool enable_gpu_mixed_{false}; bool thread_local_stream_{false}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f7ce5b39ed9015..0cb7191ce7d261 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -245,7 +245,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif // "transpose_flatten_concat_fuse_pass", // "constant_folding_pass", // - "float_to_half_pass", // + "auto_mixed_precision_pass", // }); use_gpu_ = true; From a49aad56962997ed0bdf86c732689c1e89f1a9cd Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 12 Dec 2022 03:46:20 +0000 Subject: [PATCH 4/5] fix --- .../framework/ir/auto_mixed_precision_pass.cc | 12 ++++----- .../inference/analysis/ir_pass_manager.cc | 25 +++++++------------ 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 48b1de077bf200..bc034301989b0a 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -199,7 +199,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { skip_pass_ = !enable_gpu_mixed; - low_precision_ = static_cast(Get("low_precision_mode")); + low_precision_ = static_cast(Get("mixed_precision_mode")); black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); @@ -238,12 +238,12 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the graph " + "During the auto_mixed_precision_pass, the graph " "should not be nullptr.")); PADDLE_ENFORCE_EQ(graph->IsMainGraph(), true, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the graph " + "During the auto_mixed_precision_pass, the graph " "should be main graph.")); FusePassBase::Init("auto_mixed_precision", graph); @@ -252,7 +252,7 @@ void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { VLOG(4) << "Init done"; if (skip_pass_) { - VLOG(3) << "Skip apply auto_low_precision_pass."; + VLOG(3) << "Skip auto_mixed_precision_pass."; return; } @@ -617,7 +617,7 @@ void AutoMixedPrecisionPass::ConvertWeightsData() const { auto* scope = param_scope(); PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the scope " + "During the auto_mixed_precision_pass, the scope " "should not be null.")); auto var_names = scope->LocalVarNames(); @@ -741,5 +741,5 @@ void AutoMixedPrecisionPass::InsertCastOp() const { } // namespace framework } // namespace paddle -REGISTER_PASS(auto_low_precision_pass, +REGISTER_PASS(auto_mixed_precision_pass, paddle::framework::ir::AutoMixedPrecisionPass); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b4c24cf9b8bd77..734c8a60fb86be 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -89,6 +89,15 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_tuned_dynamic_shape(); pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); + // Mixed precision related. 
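// These mixed-precision attributes are now registered for every pass created
// here rather than inside individual branches; auto_mixed_precision_pass reads
// them back via Get<int>("mixed_precision_mode"), Get<bool>("enable_gpu_mixed")
// and Get<std::unordered_set<std::string>>("mixed_black_list").
//
// End to end, the values originate from AnalysisConfig: with the renaming in
// this series, passing Precision::kHalf (or kBf16) to EnableUseGpu sets
// enable_gpu_mixed_, which enables auto_mixed_precision_pass. A hypothetical
// caller, assuming the three-argument EnableUseGpu overload implied by the
// analysis_config.cc hunk above, might look like this sketch:
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void BuildMixedPrecisionPredictor() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");  // hypothetical model directory
  config.EnableUseGpu(100 /*memory_pool_init_size_mb*/,
                      0 /*gpu_device_id*/,
                      paddle::AnalysisConfig::Precision::kHalf);
  auto predictor = paddle_infer::CreatePredictor(config);
  (void)predictor;
}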
+ pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); + pass->Set("mixed_precision_mode", + new int(argument->mixed_precision_mode())); + pass->Set("model_precision", new int(argument->model_precision())); + if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); std::string dot_file_path; @@ -208,12 +217,6 @@ void IRPassManager::CreatePasses(Argument *argument, // not run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); - - // Mixed precision related. - pass->Set("model_precision", new int(argument->model_precision())); - pass->Set( - "mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); } else if (pass_name == "dlnne_subgraph_pass") { auto precision_mode = argument->dlnne_precision_mode(); pass->Set("min_subgraph_size", @@ -296,16 +299,6 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); - } else if (pass_name == "auto_mixed_precision_pass") { - pass->Set( - "mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); - pass->Set("mixed_precision_mode", - new int(argument->mixed_precision_mode())); - } else if (pass_name == "conv_elementwise_add_act_fuse_pass" || - pass_name == "conv2d_fusion_layout_transfer_pass") { - pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); } pre_pass = pass_name; From 18bd36e2970d8086e0d2b1b574c7935218f175ac Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 12 Dec 2022 09:46:24 +0000 Subject: [PATCH 5/5] merge --- .gitignore | 4 +- .../distributed/collective/CMakeLists.txt | 2 +- .../fluid/distributed/collective/NCCLTools.cc | 2 +- .../fluid/distributed/collective/NCCLTools.h | 37 +-- .../collective/ProcessGroupNCCL.cc | 200 +++++++----- .../distributed/collective/ProcessGroupNCCL.h | 4 +- paddle/fluid/distributed/collective/check.cc | 290 ++++++++++++++++++ .../collective/{static_check.h => check.h} | 46 ++- .../distributed/collective/static_check.cc | 155 ---------- .../tests/task_tests/nan_inf_utils_test.cc | 46 ++- .../framework/details/nan_inf_utils_detail.cc | 270 ++++++---------- .../framework/details/nan_inf_utils_detail.cu | 239 ++++++++------- .../framework/details/nan_inf_utils_detail.h | 119 ++++++- .../interpreter/dependency_builder.cc | 14 +- .../interpreter/dependency_builder.h | 2 +- .../interpreter/stream_analyzer.cc | 29 +- .../interpreter/stream_analyzer.h | 3 +- .../framework/new_executor/interpretercore.cc | 21 +- .../new_executor/new_executor_defs.h | 4 +- .../ir_passes/tensorrt_subgraph_pass.cc | 53 +++- paddle/fluid/inference/api/analysis_config.cc | 5 - .../inference/api/paddle_pass_builder.cc | 1 + paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +- paddle/fluid/inference/tensorrt/op_teller.cc | 14 +- paddle/fluid/operators/batch_norm_op.cu | 4 +- paddle/fluid/operators/batch_norm_op.h | 2 +- .../operators/fused/fused_bn_activation_op.cu | 6 +- .../fused/fused_bn_add_activation_op.cu | 6 +- .../fluid/operators/generator/CMakeLists.txt | 129 +++----- .../operators/generator/generate_static_op.py | 153 +++++++++ paddle/fluid/operators/log_loss_op.cc | 129 -------- .../operators/mkldnn/reshape_mkldnn_op.cc | 22 -- 
paddle/fluid/operators/norm_utils.h | 51 --- paddle/fluid/operators/put_along_axis_op.cc | 111 ------- paddle/fluid/operators/searchsorted_op.cc | 72 ----- paddle/fluid/operators/svd_op.cc | 126 -------- .../fluid/operators/sync_batch_norm_op_mlu.cc | 4 +- .../fluid/operators/sync_batch_norm_op_npu.cc | 4 +- paddle/fluid/operators/take_along_axis_op.cc | 106 ------- .../operators/tensorrt/tensorrt_engine_op.h | 39 ++- .../tensorrt/tensorrt_engine_op_test.cc | 5 +- paddle/fluid/platform/dynload/cusolver.h | 23 +- paddle/phi/api/yaml/backward.yaml | 41 +++ paddle/phi/api/yaml/legacy_backward.yaml | 42 --- paddle/phi/api/yaml/legacy_ops.yaml | 50 --- paddle/phi/api/yaml/op_compat.yaml | 38 +++ paddle/phi/api/yaml/ops.yaml | 58 +++- paddle/phi/api/yaml/static_ops.yaml | 7 + paddle/phi/backends/dynload/cusolver.h | 23 +- paddle/phi/core/tensor_utils.cc | 43 ++- .../kernels/funcs}/norm_utils.cu.h | 20 +- .../kernels/funcs/values_vectors_functor.h | 179 ++++++++++- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 37 ++- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 1 - paddle/phi/kernels/gpu/stack_kernel.cu | 164 +++++++--- paddle/phi/kernels/onednn/reshape_kernel.cc | 179 +++++++++++ .../kernels/onednn/transpose_grad_kernel.cc | 8 +- paddle/phi/kernels/onednn/transpose_kernel.cc | 3 +- paddle/phi/ops/compat/log_loss_sig.cc | 29 -- paddle/phi/ops/compat/put_along_axis_sig.cc | 38 --- paddle/phi/ops/compat/svd_sig.cc | 27 -- paddle/phi/ops/compat/take_along_axis_sig.cc | 37 --- paddle/scripts/paddle_build.sh | 27 +- .../sharding/group_sharded_utils.py | 6 +- python/paddle/fluid/dygraph/amp/auto_cast.py | 52 +++- python/paddle/fluid/dygraph/checkpoint.py | 3 +- .../fluid/dygraph/learning_rate_scheduler.py | 15 +- python/paddle/fluid/dygraph/nn.py | 182 ----------- python/paddle/fluid/dygraph/parallel.py | 10 +- python/paddle/fluid/framework.py | 9 +- python/paddle/fluid/optimizer.py | 5 +- .../fleet/dygraph_save_for_auto_infer.py | 3 +- ...parallel_dygraph_control_flow_different.py | 9 +- .../fleet/parallel_dygraph_transformer.py | 21 +- .../test_imperative_auto_mixed_precision.py | 2 +- ...perative_auto_mixed_precision_for_eager.py | 2 +- .../dygraph_to_static/bert_dygraph_model.py | 28 +- .../seq2seq_dygraph_model.py | 26 +- .../dygraph_to_static/simnet_dygraph_model.py | 11 +- .../simnet_dygraph_model_v2.py | 9 +- .../unittests/dygraph_to_static/test_lac.py | 10 +- .../dygraph_to_static/test_ptb_lm.py | 11 +- .../dygraph_to_static/test_ptb_lm_v2.py | 10 +- .../dygraph_to_static/test_sentiment.py | 31 +- .../dygraph_to_static/test_word2vec.py | 14 +- .../transformer_dygraph_model.py | 24 +- .../unittests/ir/inference/test_trt_int64.py | 239 +++++++++++++++ .../parallel_dygraph_sparse_embedding.py | 10 +- .../standalone_executor/CMakeLists.txt | 53 +--- .../test_standalone_cross_step_overlap.py | 82 +++++ .../unittests/test_imperative_auto_prune.py | 9 +- .../test_imperative_load_static_param.py | 6 +- ..._imperative_lod_tensor_to_selected_rows.py | 12 +- .../test_imperative_named_members.py | 2 +- .../test_imperative_ocr_attention_model.py | 6 +- .../unittests/test_imperative_ptb_rnn.py | 10 +- .../unittests/test_imperative_save_load.py | 14 +- .../unittests/test_imperative_save_load_v2.py | 16 +- .../test_imperative_selected_rows.py | 10 +- ..._imperative_selected_rows_to_lod_tensor.py | 11 +- ..._imperative_transformer_sorted_gradient.py | 21 +- .../fluid/tests/unittests/test_layers.py | 36 ++- .../fluid/tests/unittests/test_nan_inf.py | 103 ++++++- .../tests/unittests/test_rnn_decode_api.py | 4 
+- python/paddle/static/__init__.py | 4 +- python/paddle/static/nn/metric.py | 2 +- 107 files changed, 2573 insertions(+), 2181 deletions(-) create mode 100644 paddle/fluid/distributed/collective/check.cc rename paddle/fluid/distributed/collective/{static_check.h => check.h} (65%) delete mode 100644 paddle/fluid/distributed/collective/static_check.cc create mode 100644 paddle/fluid/operators/generator/generate_static_op.py delete mode 100644 paddle/fluid/operators/log_loss_op.cc delete mode 100644 paddle/fluid/operators/norm_utils.h delete mode 100644 paddle/fluid/operators/put_along_axis_op.cc delete mode 100644 paddle/fluid/operators/searchsorted_op.cc delete mode 100644 paddle/fluid/operators/svd_op.cc delete mode 100644 paddle/fluid/operators/take_along_axis_op.cc create mode 100644 paddle/phi/api/yaml/static_ops.yaml rename paddle/{fluid/operators => phi/kernels/funcs}/norm_utils.cu.h (98%) create mode 100644 paddle/phi/kernels/onednn/reshape_kernel.cc delete mode 100644 paddle/phi/ops/compat/log_loss_sig.cc delete mode 100644 paddle/phi/ops/compat/put_along_axis_sig.cc delete mode 100644 paddle/phi/ops/compat/svd_sig.cc delete mode 100644 paddle/phi/ops/compat/take_along_axis_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py create mode 100644 python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py diff --git a/.gitignore b/.gitignore index f1d02f4dd25c69..890aca6fa96ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -73,8 +73,8 @@ tools/nvcc_lazy # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op.cc paddle/fluid/operators/generated_sparse_op.cc -paddle/phi/ops/compat/generated_sig.cc -paddle/phi/ops/compat/generated_sparse_sig.cc +paddle/fluid/operators/generated_static_op.cc +paddle/phi/ops/compat/generated_*.cc paddle/phi/api/yaml/parsed_apis/ paddle/fluid/operators/generator/parsed_ops/ paddle/fluid/pybind/tmp_eager_op_function_impl.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 83b42fd4320706..85efa52c3196a7 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -21,7 +21,7 @@ endif() if(WITH_NCCL OR WITH_RCCL) cc_library( processgroup_nccl - SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc static_check.cc + SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc DEPS processgroup processgroup_stream place diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc index a8c437bb12225d..47c0f547ee79ea 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/NCCLTools.h" -#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 37b1e0f114c3d4..103e56a99d50b1 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -21,42 +21,29 @@ #include #endif -#include - #include #include "paddle/fluid/distributed/collective/Types.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/variable.h" - -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif - -#include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/phi/backends/dynload/rccl.h" #else -#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/phi/backends/dynload/nccl.h" #endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/utils/variant.h" - namespace paddle { namespace distributed { -#define NCCL_CHECK(cmd) \ - do { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - platform::dynload::ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ +#define NCCL_CHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + phi::dynload::ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) ncclRedOp_t ToNCCLRedType(ReduceOp reduction); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index b5c44962dd3a52..13de2625a6eeea 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/NCCLTools.h" -#include "paddle/fluid/distributed/collective/static_check.h" +#include "paddle/fluid/distributed/collective/check.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" @@ -25,6 +25,8 @@ DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); +// set this flag to `true` and recompile to enable dynamic checks +constexpr bool FLAGS_enable_nccl_dynamic_check = false; constexpr int64_t kWaitBlockTImeout = 10; namespace paddle { @@ -89,12 +91,10 @@ ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, : ProcessGroupStream(rank, size, gid), store_(store) {} void ProcessGroupNCCL::GroupStart() { - NCCL_CHECK(platform::dynload::ncclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } -void ProcessGroupNCCL::GroupEnd() { - NCCL_CHECK(platform::dynload::ncclGroupEnd()); -} +void ProcessGroupNCCL::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { @@ -146,7 +146,13 @@ std::shared_ptr ProcessGroupNCCL::AllGather( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclAllGather( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclAllGather( in_tensor_maybe_partial.data(), out_tensor->data(), in_tensor_maybe_partial.numel(), @@ -173,7 +179,13 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclAllReduce( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclAllReduce( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -219,9 +231,10 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CheckSizeOnEachRank(out_dim, out_size_each_rank, size_); CheckSizeOnEachRank(in_dim, 
in_size_each_rank, size_); - // NOTE: Since `all_to_all` needs other processes's participation, it cannot + // NOTE: Since `all_to_all` needs other processes' participation, it cannot // simply be covered by static checks. Factors are set to 0 here to skip the - // shape check. Its shape check will be done by dynamic checks in debug mode. + // shape check. Its shape check will be done by dynamic checks with + // FLAGS_enable_nccl_dynamic_check. CommStaticCheck::CheckShape(*out_tensor, in_tensor, /*dst_rank*/ rank_, @@ -231,6 +244,10 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( /*in_size_factor*/ 0); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape( + *out_tensor, in_tensor, in_size_each_rank, rank_, size_, comm); + } int64_t in_row_size = in_tensor.numel() / in_dim[0], out_row_size = out_tensor->numel() / out_dim[0]; int64_t in_offset = 0, in_numel = 0, out_offset = 0, out_numel = 0; @@ -240,7 +257,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( for (auto i = 0; i < size_; i++) { in_numel = in_size_each_rank[i] * in_row_size; input_partial = GetPartialTensor(in_tensor, in_offset, in_numel); - NCCL_CHECK(platform::dynload::ncclSend( + NCCL_CHECK(phi::dynload::ncclSend( input_partial.data(), in_numel, platform::ToNCCLDataType(input_partial.dtype()), @@ -251,7 +268,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( out_numel = out_size_each_rank[i] * out_row_size; output_partial = GetPartialTensor(*out_tensor, out_offset, out_numel); - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( output_partial.data(), out_numel, platform::ToNCCLDataType(output_partial.dtype()), @@ -304,7 +321,10 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { int root = opts.source_rank + opts.source_root; - NCCL_CHECK(platform::dynload::ncclBroadcast( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, root, rank_, comm); + } + NCCL_CHECK(phi::dynload::ncclBroadcast( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -332,7 +352,13 @@ std::shared_ptr ProcessGroupNCCL::Reduce( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclReduce( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ opts.root_rank, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclReduce( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -361,7 +387,13 @@ std::shared_ptr ProcessGroupNCCL::ReduceScatter( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclReduceScatter( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclReduceScatter( in_tensor.data(), out_tensor->data(), out_tensor->numel(), @@ -389,6 +421,12 @@ std::shared_ptr ProcessGroupNCCL::Scatter( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ opts.root_rank, + rank_, + comm); + } int64_t numel = in_tensor.numel() / size_; if (rank_ == opts.root_rank) { int64_t offset = 0; @@ -396,7 +434,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( GroupStart(); for (auto i = 0; i < size_; i++) { partial_tensor = GetPartialTensor(in_tensor, offset, numel); - 
NCCL_CHECK(platform::dynload::ncclSend( + NCCL_CHECK(phi::dynload::ncclSend( partial_tensor.data(), numel, platform::ToNCCLDataType(partial_tensor.dtype()), @@ -405,7 +443,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); offset += numel; } - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( out_tensor->data(), numel, platform::ToNCCLDataType(out_tensor->dtype()), @@ -414,7 +452,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); GroupEnd(); } else { - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( out_tensor->data(), numel, platform::ToNCCLDataType(out_tensor->dtype()), @@ -443,16 +481,22 @@ std::shared_ptr ProcessGroupNCCL::Recv( tensor = &partial_tensor; } - CommStaticCheck::SingleTensor(*tensor, rank_, size_); + CommStaticCheck::CheckShape(*tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclRecv( - tensor->data(), - tensor->numel(), - platform::ToNCCLDataType(tensor->dtype()), - src_rank, - comm, - stream)); + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*tensor, + /*root_rank*/ src_rank, + rank_, + comm); + } + NCCL_CHECK( + phi::dynload::ncclRecv(tensor->data(), + tensor->numel(), + platform::ToNCCLDataType(tensor->dtype()), + src_rank, + comm, + stream)); }, *tensor, CommType::RECV, @@ -471,10 +515,16 @@ std::shared_ptr ProcessGroupNCCL::Send( const phi::DenseTensor& tensor_maybe_partial = numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor; - CommStaticCheck::SingleTensor(tensor_maybe_partial, rank_, size_); + CommStaticCheck::CheckShape(tensor_maybe_partial, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclSend( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(tensor_maybe_partial, + /*root_rank*/ rank_, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclSend( tensor_maybe_partial.data(), tensor_maybe_partial.numel(), platform::ToNCCLDataType(tensor_maybe_partial.dtype()), @@ -520,7 +570,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, ncclUniqueId nccl_id; if (rank_ == 0) { - NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(phi::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -532,7 +582,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, platform::DeviceContextPool::Instance().Get(place)); auto comm_ctx = std::make_unique(place); ncclComm_t nccl_comm; - NCCL_CHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(phi::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); comm_ctx->set_nccl_comm(nccl_comm); @@ -589,6 +639,10 @@ std::shared_ptr ProcessGroupNCCL::RunFnInNCCLEnv( task->UpdateWaitChain(*comm_ctx); } + if (FLAGS_enable_nccl_dynamic_check) { + task->SetBlockCPUInWait(); + task->Wait(); + } return task; } @@ -633,7 +687,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( ncclUniqueId nccl_id; if (rank_ == 0) { - NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(phi::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -654,7 +708,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( dev_ctx[i].reset(new phi::GPUContext(places[i])); ncclComm_t nccl_comm; - NCCL_CHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(phi::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); dev_ctx[i]->set_nccl_comm(nccl_comm); dev_ctx_raw[i] = dev_ctx[i].get(); @@ 
-791,7 +845,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllReduce( + return phi::dynload::ncclAllReduce( input.data(), output.data(), input.numel(), @@ -821,7 +875,7 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( const gpuStream_t& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - return platform::dynload::ncclBroadcast( + return phi::dynload::ncclBroadcast( input.data(), output.data(), input.numel(), @@ -871,13 +925,12 @@ std::shared_ptr ProcessGroupNCCL::Send( ncclComm_t comm, const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, - comm, - stream); + return phi::dynload::ncclSend(input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); }, dst_rank, CommType::SEND); @@ -894,13 +947,12 @@ std::shared_ptr ProcessGroupNCCL::Recv( ncclComm_t comm, const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), - output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, - comm, - stream); + return phi::dynload::ncclRecv(output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); }, src_rank, CommType::RECV); @@ -925,7 +977,7 @@ std::shared_ptr ProcessGroupNCCL::AllGather( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( + return phi::dynload::ncclAllGather( input.data(), output.data(), input.numel(), @@ -994,14 +1046,14 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( size_t offset = 0; GroupStart(); for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRecv( GetPointerByOffset(output.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), @@ -1030,15 +1082,15 @@ std::shared_ptr ProcessGroupNCCL::Reduce( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - input.data(), - output.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - ToNCCLRedType(opts.reduce_op), - opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclReduce(input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); }, CommType::REDUCE); } @@ -1066,7 +1118,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( if (rank_ == opts.root_rank) { GroupStart(); for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), @@ -1075,22 +1127,22 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); offset += input.numel() / size_; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - 
opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclRecv(output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); GroupEnd(); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclRecv(output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); } }, CommType::SCATTER); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 2a184e182aae9d..3ce77297f56f18 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -33,9 +33,9 @@ #endif #ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/phi/backends/dynload/rccl.h" #elif PADDLE_WITH_NCCL -#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/phi/backends/dynload/nccl.h" #endif namespace paddle { diff --git a/paddle/fluid/distributed/collective/check.cc b/paddle/fluid/distributed/collective/check.cc new file mode 100644 index 00000000000000..9a2ca064024f4c --- /dev/null +++ b/paddle/fluid/distributed/collective/check.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
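// Overview of the new file below (descriptive sketch, not itself part of the
// implementation): CommDynamicCheck adds runtime verification on top of the
// static checks. Before a collective runs, the root rank publishes its tensor
// metadata (first the dtype, then the element count) as an int64 through
// ncclBroadcast, and every other rank compares what it receives with its own
// tensor, raising InvalidArgument on a mismatch. Per rank, roughly:
//
//   int64_t meta = local metadata (dtype or numel) as int64;
//   copy meta into a small device buffer;     // ncclBroadcast needs GPU memory
//   ncclBroadcast(buffer, ..., root, ...);    // root publishes its value
//   copy back; if (cur_rank != root) expect meta to equal the local value;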
+ +#include "paddle/fluid/distributed/collective/check.h" + +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/errors.h" + +#ifdef PADDLE_WITH_HIP +#define gpuMalloc hipMalloc +#define gpuMemcpy hipMemcpy +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuFree hipFree +#else +#define gpuMalloc cudaMalloc +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuFree cudaFree +#endif + +namespace paddle { +namespace distributed { + +// static checks +void CommStaticCheck::CheckRank(int rank, int world_size) { + PADDLE_ENFORCE_GE(rank, + 0, + phi::errors::InvalidArgument( + "Rank should be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + rank, + world_size, + phi::errors::InvalidArgument("Rank is out of the process group.")); +} + +void CommStaticCheck::CheckPlace(const phi::DenseTensor& tensor) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(tensor.place()), + true, + platform::errors::InvalidArgument("Tensor should be in GPU place.")); +} + +void CommStaticCheck::CheckPlace(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor) { + CheckPlace(out_tensor); + CheckPlace(in_tensor); + PADDLE_ENFORCE_EQ( + out_tensor.place(), + in_tensor.place(), + phi::errors::InvalidArgument( + "Input and output tensors should be on the same place.")); +} + +void CommStaticCheck::CheckDataType(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor) { + PADDLE_ENFORCE_EQ( + out_tensor.dtype(), + in_tensor.dtype(), + phi::errors::InvalidArgument( + "Input and output tensors should have the same data type.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor) { + PADDLE_ENFORCE_GT( + tensor.numel(), + 0, + phi::errors::InvalidArgument("Size of tensor should be greater than 0.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int out_size_factor, + int in_size_factor) { + CheckShape(out_tensor); + CheckShape(in_tensor); + int64_t out_size = out_tensor.numel(), in_size = in_tensor.numel(); + PADDLE_ENFORCE_EQ( + out_size * out_size_factor, + in_size * in_size_factor, + phi::errors::InvalidArgument( + "Input and output tensors should have matching sizes.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size, + int out_size_factor, + int in_size_factor) { + CheckRank(dst_rank, world_size); + CheckRank(cur_rank, world_size); + + CheckPlace(out_tensor, in_tensor); + CheckDataType(out_tensor, in_tensor); + + if (dst_rank == cur_rank) { + CheckShape(out_tensor, in_tensor, out_size_factor, in_size_factor); + } else { + CheckShape(out_tensor); + CheckShape(in_tensor); + } +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor, + int rank, + int world_size) { + CheckPlace(tensor); + CheckRank(rank, world_size); +} + +void CommStaticCheck::SameShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ 1); +} + +void CommStaticCheck::ScatterLikeShape(const 
phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ world_size, + /*in_size_factor*/ 1); +} + +void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ world_size); +} + +// dynamic checks +void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, + int64_t dtype) { + PADDLE_ENFORCE_EQ( + static_cast(tensor.dtype()), + dtype, + phi::errors::InvalidArgument( + "Tensors in communication are expected to have the same data type.")); +} + +void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm) { + constexpr int kSize = sizeof(int64_t); + int64_t dtype_host = static_cast(tensor.dtype()); + int64_t* dtype_device; + PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&dtype_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclBroadcast(dtype_device, + dtype_device, + kSize, + ncclInt64, + root_rank, + comm, + kDefaultStream)); + + if (root_rank == cur_rank) { + VLOG(3) << "Dynamic check broadcast metadata, dtype: " << dtype_host; + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(&dtype_host, dtype_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, dtype: " << dtype_host; + CheckDataType(tensor, dtype_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(dtype_device)); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor, + int64_t shape) { + PADDLE_ENFORCE_EQ( + tensor.numel(), + shape, + phi::errors::InvalidArgument( + "Tensors in communication are expected to have matching sizes.")); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm) { + CheckDataType(tensor, root_rank, cur_rank, comm); + + constexpr int kSize = sizeof(int64_t); + int64_t shape_host = tensor.numel(); + int64_t* shape_device; + + PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&shape_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclBroadcast(shape_device, + shape_device, + kSize, + ncclInt64, + root_rank, + comm, + kDefaultStream)); + + if (root_rank == cur_rank) { + VLOG(3) << "Dynamic check broadcast metadata, shape: " << shape_host; + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(&shape_host, shape_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, shape: " << shape_host; + CheckShape(tensor, shape_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(shape_device)); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& in_size_each_rank, + int cur_rank, + int world_size, + ncclComm_t comm) { + CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm); + CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm); + + constexpr int kSize = sizeof(int64_t); + int64_t in_row_size = in_tensor.numel() / in_tensor.dims()[0]; + + for (int rank = 0; rank < world_size; ++rank) { + int64_t in_shape_host = in_size_each_rank[rank] * in_row_size; + int64_t* in_shape_device; + 
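// Note on the surrounding loop: for each destination `rank`, every participant
// contributes the number of elements it is about to send to that rank, and the
// ncclReduce rooted at `rank` sums these counts; the value the root reads back
// is therefore the total number of elements its out_tensor must hold, which
// the final CheckShape call verifies.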
PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( + in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclReduce(in_shape_device, + in_shape_device, + kSize, + ncclInt64, + ncclSum, + rank, + comm, + kDefaultStream)); + if (rank == cur_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( + &in_shape_host, in_shape_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, shape: " << in_shape_host; + CheckShape(out_tensor, in_shape_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(in_shape_device)); + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/static_check.h b/paddle/fluid/distributed/collective/check.h similarity index 65% rename from paddle/fluid/distributed/collective/static_check.h rename to paddle/fluid/distributed/collective/check.h index 5dcb17e505438c..be9bfb5f78f393 100644 --- a/paddle/fluid/distributed/collective/static_check.h +++ b/paddle/fluid/distributed/collective/check.h @@ -14,7 +14,18 @@ #pragma once -// forward declaration to reduce deps +#include +#include + +#include "paddle/phi/backends/gpu/forwards.h" + +#ifdef PADDLE_WITH_HIP +using gpuStream_t = hipStream_t; +#else +using gpuStream_t = cudaStream_t; +#endif + +// forward declarations namespace phi { class DenseTensor; } @@ -49,9 +60,9 @@ struct CommStaticCheck { int in_size_factor); // for p2p - static void SingleTensor(const phi::DenseTensor& tensor, - int rank, - int world_size); + static void CheckShape(const phi::DenseTensor& tensor, + int rank, + int world_size); // for collective static void SameShape(const phi::DenseTensor& out_tensor, @@ -73,5 +84,32 @@ struct CommStaticCheck { int world_size); }; +struct CommDynamicCheck { + static void CheckDataType(const phi::DenseTensor& tensor, int64_t dtype); + + static void CheckDataType(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm); + + static void CheckShape(const phi::DenseTensor& tensor, int64_t shape); + + static void CheckShape(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm); + + static void CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& in_size_each_rank, + int cur_rank, + int world_size, + ncclComm_t comm); + + private: + // `0` represents default stream for both cuda & hip + static constexpr gpuStream_t kDefaultStream = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/static_check.cc b/paddle/fluid/distributed/collective/static_check.cc deleted file mode 100644 index 98336db90d1e29..00000000000000 --- a/paddle/fluid/distributed/collective/static_check.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/distributed/collective/static_check.h" - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" - -namespace paddle { -namespace distributed { - -void CommStaticCheck::CheckRank(int rank, int world_size) { - PADDLE_ENFORCE_GE(rank, - 0, - phi::errors::InvalidArgument( - "Rank should be greater than or equal to 0.")); - PADDLE_ENFORCE_LT( - rank, - world_size, - phi::errors::InvalidArgument("Rank is out of the process group.")); -} - -void CommStaticCheck::CheckPlace(const phi::DenseTensor& tensor) { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(tensor.place()), - true, - platform::errors::InvalidArgument("Tensor should be in GPU place.")); -} - -void CommStaticCheck::CheckPlace(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor) { - CheckPlace(out_tensor); - CheckPlace(in_tensor); - PADDLE_ENFORCE_EQ( - out_tensor.place(), - in_tensor.place(), - phi::errors::InvalidArgument( - "Input and output tensors should be on the same place.")); -} - -void CommStaticCheck::CheckDataType(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor) { - PADDLE_ENFORCE_EQ( - out_tensor.dtype(), - in_tensor.dtype(), - phi::errors::InvalidArgument( - "Input and output tensors should have the same data type.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor) { - PADDLE_ENFORCE_GT( - tensor.numel(), - 0, - phi::errors::InvalidArgument("Size of tensor should be greater than 0.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int out_size_factor, - int in_size_factor) { - CheckShape(out_tensor); - CheckShape(in_tensor); - int64_t out_size = out_tensor.numel(), in_size = in_tensor.numel(); - PADDLE_ENFORCE_EQ( - out_size * out_size_factor, - in_size * in_size_factor, - phi::errors::InvalidArgument( - "Input and output tensors should have matching sizes.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size, - int out_size_factor, - int in_size_factor) { - CheckRank(dst_rank, world_size); - CheckRank(cur_rank, world_size); - - CheckPlace(out_tensor, in_tensor); - CheckDataType(out_tensor, in_tensor); - - if (dst_rank == cur_rank) { - CheckShape(out_tensor, in_tensor, out_size_factor, in_size_factor); - } else { - CheckShape(out_tensor); - CheckShape(in_tensor); - } -} - -void CommStaticCheck::SingleTensor(const phi::DenseTensor& tensor, - int rank, - int world_size) { - CheckPlace(tensor); - CheckRank(rank, world_size); -} - -void CommStaticCheck::SameShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - /*out_size_factor*/ 1, - /*in_size_factor*/ 1); -} - -void CommStaticCheck::ScatterLikeShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - /*out_size_factor*/ world_size, - /*in_size_factor*/ 1); -} - -void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - 
/*out_size_factor*/ 1, - /*in_size_factor*/ world_size); -} - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc index 73d213f71148f7..86f863bdffa6d5 100644 --- a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -30,32 +30,30 @@ PD_DECLARE_KERNEL(strings_empty, CPU, ALL_LAYOUT); namespace egr { -#define CHECK_NAN_INF(tensors) \ - { \ - bool caught_exception = false; \ - try { \ - CheckTensorHasNanOrInf("nan_inf_test", tensors); \ - } catch (paddle::platform::EnforceNotMet & error) { \ - caught_exception = true; \ - std::string ex_msg = error.what(); \ - EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ - std::string::npos); \ - } \ - EXPECT_TRUE(caught_exception); \ +#define CHECK_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are NAN or INF") != std::string::npos); \ + } \ + EXPECT_TRUE(caught_exception); \ } -#define CHECK_NO_NAN_INF(tensors) \ - { \ - bool caught_exception = false; \ - try { \ - CheckTensorHasNanOrInf("nan_inf_test", tensors); \ - } catch (paddle::platform::EnforceNotMet & error) { \ - caught_exception = true; \ - std::string ex_msg = error.what(); \ - EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ - std::string::npos); \ - } \ - EXPECT_FALSE(caught_exception); \ +#define CHECK_NO_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are NAN or INF") != std::string::npos); \ + } \ + EXPECT_FALSE(caught_exception); \ } TEST(NanInfUtils, Functions) { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index f80bb94b30b648..30046b2d1d44e8 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/phi/common/amp_type_traits.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" @@ -24,6 +25,8 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" +DECLARE_int32(check_nan_inf_level); + namespace paddle { namespace framework { namespace details { @@ -90,7 +93,7 @@ static void InitWhiteListFormEnv() { const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE"); const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR"); - if (op_type_skip != NULL) { + if (op_type_skip) { std::stringstream ss(op_type_skip); std::string op_type; while (std::getline(ss, op_type, ',')) { @@ -98,7 +101,7 @@ static void InitWhiteListFormEnv() { } } - if (op_role_skip != NULL) { + if (op_role_skip) { std::stringstream ss(op_role_skip); std::string op_role; while (std::getline(ss, op_role, ',')) { @@ -113,7 +116,7 @@ static void InitWhiteListFormEnv() { } } - if (op_var_skip != NULL) { + 
if (op_var_skip) { std::stringstream ss(op_var_skip); std::string op_var; while (std::getline(ss, op_var, ',')) { @@ -131,175 +134,101 @@ static void InitWhiteListFormEnv() { } } -template -static void PrintNanInf(const T* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name, - bool abort = true) { - T min_value = std::numeric_limits::max(); - T max_value = std::numeric_limits::min(); - size_t nan_count, inf_count, num_count; - nan_count = inf_count = num_count = 0; - - // CPU print num value - for (size_t i = 0; i < numel; ++i) { - size_t count = 0; - if (std::isnan(value[i])) { - count = nan_count++; - } else if (std::isinf(value[i])) { - count = inf_count++; - } else { - count = num_count++; - min_value = std::min(min_value, value[i]); - max_value = std::max(max_value, value[i]); - } - - if (count < static_cast(print_num)) { - printf("numel:%zu index:%zu value:%f\n", - numel, - i, - static_cast(value[i])); - } - } - printf( - "In cpu, there has %zu,%zu,%zu nan,inf,num. " - "And in num, min_value is %f, max_value is %f\n", - nan_count, - inf_count, - num_count, - static_cast(min_value), - static_cast(max_value)); - if (abort) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); - } -} +template < + typename T, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +static void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str) { + using MT = typename phi::dtype::template MPTypeTrait::Type; + +#ifdef _OPENMP + // Use maximum 4 threads to collect the nan and inf information. + int num_threads = std::max(omp_get_num_threads(), 1); + num_threads = std::min(num_threads, 4); +#else + int num_threads = 1; +#endif -// openmp 4.0, reduction with fp16 -#if defined _OPENMP && _OPENMP >= 201307 -// more detail see: 180 page of -// https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf -#pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) -#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ - omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex < \ - float > : omp_out += omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex < \ - double > : omp_out += omp_in) + std::vector thread_num_nan(num_threads, 0); + std::vector thread_num_inf(num_threads, 0); + std::vector thread_min_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_max_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_mean_value(num_threads, static_cast(0)); +#ifdef _OPENMP +#pragma omp parallel num_threads(num_threads) #endif - -template -static void CheckNanInf(const T* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - T sum = static_cast(0.0); -#if defined _OPENMP && _OPENMP >= 201307 -#pragma omp parallel for simd reduction(+ : sum) -#elif defined _OPENMP -#pragma omp parallel for reduction(+ : sum) + { +#ifdef _OPENMP + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = (numel + num_threads - 1) / num_threads; + int64_t begin = tid * chunk_size; + int64_t end = chunk_size + begin > numel ? 
numel : chunk_size + begin; +#else + int64_t tid = 0; + int64_t begin = 0; + int64_t end = numel; #endif - for (size_t i = 0; i < numel; ++i) { - sum += (value[i] - value[i]); - } + for (int64_t i = begin; i < end; ++i) { + MT value = static_cast(value_ptr[i]); - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); - } -} + thread_min_value[tid] = std::min(thread_min_value[tid], value); + thread_max_value[tid] = std::max(thread_max_value[tid], value); + thread_mean_value[tid] += value / static_cast(numel); -#if defined _OPENMP && _OPENMP >= 201307 -// openmp4.0 not need to specialization fp16 -#elif defined _OPENMP -template <> -void CheckNanInf( - const paddle::platform::float16* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float sum = 0.0f; -#pragma omp parallel for reduction(+ : sum) - for (size_t i = 0; i < numel; ++i) { - sum += static_cast(value[i] - value[i]); - } - - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); + if (std::isnan(value)) { + thread_num_nan[tid] += 1; + } else if (std::isinf(value)) { + thread_num_inf[tid] += 1; + } + } } -} -template <> -void CheckNanInf( - const paddle::platform::bfloat16* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float sum = 0.0f; -#pragma omp parallel for reduction(+ : sum) - for (size_t i = 0; i < numel; ++i) { - sum += static_cast(value[i] - value[i]); + int64_t num_nan = 0; + int64_t num_inf = 0; + MT min_value = thread_min_value[0]; + MT max_value = thread_max_value[0]; + MT mean_value = static_cast(0); + for (int i = 0; i < num_threads; ++i) { + num_nan += thread_num_nan[i]; + num_inf += thread_num_inf[i]; + min_value = std::min(thread_min_value[i], min_value); + max_value = std::max(thread_max_value[i], max_value); + mean_value += thread_mean_value[i]; } - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); - } + PrintForDifferentLevel(cpu_hint_str.c_str(), + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + FLAGS_check_nan_inf_level); } -template <> -void CheckNanInf>( - const paddle::platform::complex* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float real_sum = 0.0f; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } - - float imag_sum = 0.0f; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); - } - - if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || - std::isinf(imag_sum)) { - // hot fix for compile failed in gcc4.8 - // here also need print detail info of nan or inf later - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); - } -} +template < + typename T, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str) { + using RealType = typename T::value_type; -template <> - void CheckNanInf < paddle::platform::complex < double >>> - (const paddle::platform::complex* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - 
double real_sum = 0.0; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } + RealType real_sum = 0.0f, imag_sum = 0.0f; - double imag_sum = 0.0; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum) +#endif + for (int64_t i = 0; i < numel; ++i) { + T value = value_ptr[i]; + real_sum += (value.real - value.real); + imag_sum += (value.imag - value.imag); } if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || @@ -307,14 +236,10 @@ template <> // hot fix for compile failed in gcc4.8 // here also need print detail info of nan or inf later PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); + "There are NAN or INF in %s.", cpu_hint_str)); } } -#endif - template <> template void TensorCheckerVisitor::apply( @@ -323,10 +248,9 @@ void TensorCheckerVisitor::apply( std::is_same>::value || std::is_same>::value>::type*) const { - // use env strategy control in future, -1=print_all. - int print_num = 3; - CheckNanInf( - tensor_.data(), tensor_.numel(), print_num, op_type_, var_name_); + std::string cpu_hint_str = + GetCpuHintString(op_type, var_name, tensor.place()); + CheckNanInfCpuImpl(tensor.data(), tensor.numel(), cpu_hint_str); } template <> @@ -371,8 +295,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile with " - "GPU.", + "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile " + "with GPU.", var_name)); #endif return; @@ -406,8 +330,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile with " - "XPU.", + "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile " + "with XPU.", var_name)); #endif return; @@ -440,8 +364,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile with " - "NPU.", + "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile " + "with NPU.", var_name)); #endif return; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index abf575b4ca5453..629ab737055a47 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -138,6 +138,54 @@ __global__ void CheckNanInfKernel(const T* value, PrintNanInfKernel(value, numel, print_num, debug_info); } +template +__device__ T BlockReduce(T value) { + __shared__ T shared_mem[1024]; + + shared_mem[threadIdx.x] = value; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride = stride >> 1) { + if (threadIdx.x < stride) { + T value0 = shared_mem[threadIdx.x]; + T value1 = shared_mem[threadIdx.x + stride]; + T reduce_value; + if (ReduceType == 0) { + // max + reduce_value = value0 > value1 ? value0 : value1; + } else if (ReduceType == 1) { + // min + reduce_value = value0 < value1 ? 
value0 : value1; + } else if (ReduceType == 2) { + // sum + reduce_value = value0 + value1; + } + shared_mem[threadIdx.x] = reduce_value; + } + + if (stride > 16) { + __syncthreads(); + } + } + + __syncthreads(); + return shared_mem[0]; +} + +__device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan, + const int64_t num_inf, + int64_t offset, + int64_t* num_nan_ptr, + int64_t* num_inf_ptr) { + int64_t block_num_nan = BlockReduce(num_nan); + int64_t block_num_inf = BlockReduce(num_inf); + + if (threadIdx.x == 0) { + num_nan_ptr[offset] = block_num_nan; + num_inf_ptr[offset] = block_num_inf; + } +} + template < typename T, std::enable_if_t>::value || @@ -183,15 +231,16 @@ __device__ void BlockReduceMaxMinAndWrite(const T max_value, template __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, const int64_t numel, - int* found_nan_inf_ptr, + int64_t* block_num_nan_ptr, + int64_t* block_num_inf_ptr, MT* tensor_block_max_ptr, MT* tensor_block_min_ptr, MT* tensor_block_mean_ptr) { - bool has_nan = false; - bool has_inf = false; - int64_t i = threadIdx.x + blockIdx.x * blockDim.x; + int64_t num_nan = 0; + int64_t num_inf = 0; + MT max_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); MT min_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); MT mean_value = static_cast(0); @@ -203,25 +252,14 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, mean_value += value / static_cast(numel); if (isnan(value)) { - has_nan = true; - } - if (isinf(value)) { - has_inf = true; - } - - if (has_nan || has_inf) { - if (!tensor_block_max_ptr && !tensor_block_min_ptr && - !tensor_block_mean_ptr) { - break; - } + num_nan += 1; + } else if (isinf(value)) { + num_inf += 1; } } - if (has_nan) { - found_nan_inf_ptr[0] = 1; - } - if (has_inf) { - found_nan_inf_ptr[1] = 1; - } + + BlockReduceNumNanInfAndWrite( + num_nan, num_inf, blockIdx.x, block_num_nan_ptr, block_num_inf_ptr); BlockReduceMaxMinAndWrite(max_value, min_value, @@ -232,32 +270,9 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, tensor_block_mean_ptr); } -template ::value, bool> = true> -__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { - if (check_nan_inf_level >= 3) { - return true; - } else if (check_nan_inf_level >= 2) { - MT fp16_max = - static_cast(std::numeric_limits::max()); - return max_value > fp16_max || min_value < -fp16_max; - } - return false; -} - -template ::value, bool> = true> -__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { - if (check_nan_inf_level >= 3) { - return true; - } - return false; -} - template -__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, +__global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, + const int64_t* block_num_inf_ptr, const MT* tensor_block_max_ptr, const MT* tensor_block_min_ptr, const MT* tensor_block_mean_ptr, @@ -266,8 +281,14 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, int64_t numel_max_min, int check_nan_inf_level) { if (blockIdx.x == 0 && threadIdx.x == 0) { - int has_nan = found_nan_inf_ptr[0]; - int has_inf = found_nan_inf_ptr[1]; + int64_t num_nan = 0; + int64_t num_inf = 0; + + // numel_max_min <= 128 + for (int64_t i = 0; i < numel_max_min; ++i) { + num_nan += block_num_nan_ptr[i]; + num_inf += block_num_inf_ptr[i]; + } MT max_value = static_cast(0); MT min_value = static_cast(0); @@ -289,67 +310,31 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, } } - if (has_nan || has_inf) 
{ - if (check_nan_inf_level == 0) { - PADDLE_ENFORCE(false, - "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " - "find_inf=%d, " - "max=%e, min=%e, mean=%e===\n", - debug_info, - numel, - has_nan, - has_inf, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } else if (check_nan_inf_level >= 1) { - printf( - "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " - "find_inf=%d, " - "max=%e, min=%e, mean=%e===\n", - debug_info, - numel, - has_nan, - has_inf, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } - } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { - printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n", - debug_info, - numel, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } + PrintForDifferentLevel(debug_info, + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + check_nan_inf_level); } } -template <> template -void TensorCheckerVisitor::apply( - typename std::enable_if< - std::is_floating_point::value || - std::is_same>::value || - std::is_same>::value>::type*) - const { - auto* dev_ctx = reinterpret_cast( - platform::DeviceContextPool::Instance().Get(tensor_.place())); - int dev_id = tensor_.place().device; +static char* GetGpuHintStringPtr(const phi::GPUContext& ctx, + const std::string& op_type, + const std::string& var_name, + int dev_id) { PADDLE_ENFORCE_EQ( (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true, platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", multi_op_var2gpu_str_mutex().size())); - std::string dtype_str = DataTypeToString(DataTypeTrait::DataType()); - if (dtype_str == "::paddle::platform::float16") { - dtype_str = "float16"; - } - std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ + - "] [dtype=" + dtype_str + "]"; - char* gpu_str_ptr = NULL; + std::string op_var = + GetCpuHintString(op_type, var_name, ctx.GetPlace(), dev_id); + char* gpu_str_ptr = nullptr; { auto& op_var2gpu_str_mutex = multi_op_var2gpu_str_mutex().at(dev_id); @@ -358,9 +343,9 @@ void TensorCheckerVisitor::apply( std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert auto gpu_str_tensor = paddle::memory::Alloc( - dev_ctx->GetPlace(), + ctx.GetPlace(), op_var.length() + 1, - phi::Stream(reinterpret_cast(dev_ctx->stream()))); + phi::Stream(reinterpret_cast(ctx.stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); @@ -378,13 +363,13 @@ void TensorCheckerVisitor::apply( iter->first.c_str(), op_var.length() + 1, hipMemcpyHostToDevice, - dev_ctx->stream())); + ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, cudaMemcpyHostToDevice, - dev_ctx->stream())); + ctx.stream())); #endif } else { // get auto iter = op_var2gpu_str.find(op_var); @@ -397,6 +382,22 @@ void TensorCheckerVisitor::apply( gpu_str_ptr = reinterpret_cast(iter->second->ptr()); } } + return gpu_str_ptr; +} + +template <> +template +void TensorCheckerVisitor::apply( + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { + auto* dev_ctx = reinterpret_cast( + platform::DeviceContextPool::Instance().Get(tensor.place())); + int dev_id = tensor.place().device; + char* gpu_str_ptr = + GetGpuHintStringPtr(*dev_ctx, op_type, var_name, dev_id); #ifdef __HIPCC__ // 
HIP will throw GPU memory access fault if threads > 256 @@ -406,7 +407,7 @@ void TensorCheckerVisitor::apply( #endif size_t blocks = std::min(static_cast(128), - static_cast((tensor_.numel() + threads - 1) / threads)); + static_cast((tensor.numel() + threads - 1) / threads)); #ifdef __HIPCC__ int print_num = 3; @@ -415,44 +416,46 @@ void TensorCheckerVisitor::apply( dim3(threads), 0, dev_ctx->stream(), - tensor_.data(), - tensor_.numel(), + tensor.data(), + tensor.numel(), print_num, gpu_str_ptr); #else using MT = typename phi::dtype::MPTypeTrait::Type; - phi::DenseTensor found_nan_inf; - found_nan_inf.Resize({2}); - int* found_nan_inf_ptr = found_nan_inf.mutable_data(tensor_.place()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( - found_nan_inf_ptr, 0, 2 * sizeof(int), dev_ctx->stream())); - int64_t numel_max_min = blocks; + phi::DenseTensor block_num_nan_inf; + block_num_nan_inf.Resize({static_cast(2 * numel_max_min)}); + int64_t* block_num_nan_ptr = + block_num_nan_inf.mutable_data(tensor.place()); + int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min; + phi::DenseTensor tensor_block_max_min; tensor_block_max_min.Resize({static_cast(3 * numel_max_min)}); MT* tensor_block_max_ptr = - tensor_block_max_min.mutable_data(tensor_.place()); + tensor_block_max_min.mutable_data(tensor.place()); MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min; MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min; FindNanInfAndBlockMaxMin - <<stream()>>>(tensor_.data(), - tensor_.numel(), - found_nan_inf_ptr, + <<stream()>>>(tensor.data(), + tensor.numel(), + block_num_nan_ptr, + block_num_inf_ptr, tensor_block_max_ptr, tensor_block_min_ptr, tensor_block_mean_ptr); int check_nan_inf_level = FLAGS_check_nan_inf_level; FindGlobalMaxMinAndPrint - <<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr, + <<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr, + block_num_inf_ptr, tensor_block_max_ptr, tensor_block_min_ptr, tensor_block_mean_ptr, gpu_str_ptr, - tensor_.numel(), + tensor.numel(), numel_max_min, check_nan_inf_level); #endif diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 2a25bc7b68f366..0adf23fd029218 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -24,21 +24,114 @@ namespace paddle { namespace framework { namespace details { +template ::value, bool> = true> +HOSTDEVICE bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } else if (check_nan_inf_level >= 2) { + MT fp16_max = + static_cast(std::numeric_limits::max()); + return max_value > fp16_max || min_value < -fp16_max; + } + return false; +} + +template ::value, bool> = true> +HOSTDEVICE bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } + return false; +} + +template +HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, + int64_t numel, + int64_t num_nan, + int64_t num_inf, + MT max_value, + MT min_value, + MT mean_value, + int check_nan_inf_level) { + if (num_nan > 0 || num_inf > 0) { + printf( + "[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, " + "num_inf=%lld, max=%e, min=%e, mean=%e\n", + debug_info, + static_cast(numel), // NOLINT + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + if 
(check_nan_inf_level == 0) { +#if defined(__NVCC__) || defined(__HIPCC__) + PADDLE_ENFORCE(false, + "There are NAN or INF (num_nan=%ld, num_inf=%lld) in %s.", + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + debug_info); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are NAN or INF (num_nan=%lld, num_inf=%lld) in %s.", + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + debug_info)); +#endif + } + } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { + printf("[PRECISION] in %s, numel=%lld, max=%e, min=%e, mean=%e\n", + debug_info, + static_cast(numel), // NOLINT + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + } +} + +template +inline std::string GetCpuHintString(const std::string& op_type, + const std::string& var_name, + const phi::Place& place, + int device_id = -1) { + std::string dtype_str = DataTypeToString(DataTypeTrait::DataType()); + if (dtype_str == "float") { + dtype_str = "fp32"; + } else if (dtype_str == "double") { + dtype_str = "fp64"; + } else if (dtype_str == "::paddle::platform::float16") { + dtype_str = "fp16"; + } else if (dtype_str == "::paddle::platform::bfloat16") { + dtype_str = "bf16"; + } + + std::stringstream ss; + if (platform::is_gpu_place(place)) { + ss << "[device=gpu:" << device_id << ", "; + } else { + ss << "[device=cpu, "; + } + ss << "op=" << op_type << ", tensor=" << var_name << ", dtype=" << dtype_str + << "]"; + return ss.str(); +} + template struct TensorCheckerVisitor { - TensorCheckerVisitor(const std::string& op_type, - const std::string& var_name, - const phi::DenseTensor& tensor, - const platform::Place& place) - : op_type_(op_type), - var_name_(var_name), - tensor_(tensor), - place_(place) {} + TensorCheckerVisitor(const std::string& o, + const std::string& v, + const phi::DenseTensor& t, + const platform::Place& p) + : op_type(o), var_name(v), tensor(t), place(p) {} template void apply( typename std::enable_if::value>::type* = 0) const { - VLOG(10) << var_name_ << " need not to check, it's type is not float point"; + VLOG(10) << var_name << " need not to check, it's type is not float point"; } template @@ -49,10 +142,10 @@ struct TensorCheckerVisitor { std::is_same>::value>::type* = 0) const; - std::string op_type_; - std::string var_name_; - const phi::DenseTensor& tensor_; - const platform::Place& place_; + std::string op_type; + std::string var_name; + const phi::DenseTensor& tensor; + const platform::Place& place; }; template diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index d5207398adca96..7c18f9288c5e70 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -17,6 +17,16 @@ #include #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +// The difference between "sequential_run" and "serial_run": +// "sequential_run" dispatches OPs one by one according to the sequence in the +// Program, while "serial_run" ensures that all Ops are scheduled in a singal +// thread. In standalone executor, "sequential_run" is also "serial_run", while +// "serial_run" is not necessarily "sequential_run". 
+PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, + false, + "Enable sequential execution for standalone " + "executor, only applied to GPU OPs."); + namespace paddle { namespace framework { namespace interpreter { @@ -43,7 +53,7 @@ const std::string StringizeDownstreamMap( } const std::map>& DependencyBuilder::Build( - const std::vector& instructions, bool is_sequential_run) { + const std::vector& instructions) { PADDLE_ENFORCE_EQ( is_build_, false, @@ -56,7 +66,7 @@ const std::map>& DependencyBuilder::Build( BuildOpHappensBefore(); ShrinkDownstreamMap(); - if (is_sequential_run) { + if (FLAGS_new_executor_sequential_run) { AddDependencyForSequentialRun(); } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index ca7331d4b78e47..ec1119e701da3d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -36,7 +36,7 @@ class DependencyBuilder { // build op dependencies and return the mapping from op to its downstream-op // set const std::map>& Build( - const std::vector& instructions, bool is_sequential_run); + const std::vector& instructions); const std::map>& OpDownstreamMap() const; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 88fac23338f543..fa33610096b7cc 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -70,21 +70,30 @@ inline std::string RunTypeToString(DownstreamRunType run_type) { } void StreamAnalyzer::ConstructEvents( - const DependencyBuilder& dependency_builder, std::vector* instructions) const { + std::vector cross_step_merged_instructions = *instructions; + for (const Instruction& instr : *instructions) { + cross_step_merged_instructions.emplace_back(instr); + } + + DependencyBuilder dependency_builder; + dependency_builder.Build(cross_step_merged_instructions); + const std::map>& downstream_map = dependency_builder.OpDownstreamMap(); - const size_t instr_num = instructions->size(); + const size_t instr_num = cross_step_merged_instructions.size(); std::vector>> run_type_info( instr_num, std::vector>( - /*number_of_run_type = */ 3)); // instr_id -> run_type -> + /*number_of_run_type = */ 2)); // instr_id -> run_type -> // next_instr_id - AnalyseAllRunType(*instructions, downstream_map, &run_type_info); + AnalyseAllRunType( + cross_step_merged_instructions, downstream_map, &run_type_info); std::map>> event_info; // DeviceContext -> waiter_instr_id -> recorder_instr_ids - AnalyseAllEventInfo(*instructions, run_type_info, &event_info); + AnalyseAllEventInfo( + cross_step_merged_instructions, run_type_info, &event_info); ShrinkEventInfo(dependency_builder, &event_info); // Construct events @@ -93,7 +102,17 @@ void StreamAnalyzer::ConstructEvents( for (auto& waiter_item : context_item.second) { size_t waiter_instr_id = waiter_item.first; std::set& recorder_instr_ids = waiter_item.second; + + if (waiter_instr_id >= instructions->size()) { + waiter_instr_id -= instructions->size(); + } + for (size_t recorder_instr_id : recorder_instr_ids) { + // Redundant record + if (recorder_instr_id >= instructions->size()) { + continue; + } + Instruction& recorder_instr = instructions->at(recorder_instr_id); Instruction& waiter_instr = instructions->at(waiter_instr_id); 
platform::DeviceType waiter_type = GetWaiterType(waiter_instr); diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h index b9a228869d4c96..de0e6c741c2451 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h @@ -37,8 +37,7 @@ class StreamAnalyzer { ~StreamAnalyzer() {} - void ConstructEvents(const DependencyBuilder& dependency_builder, - std::vector* instructions) const; + void ConstructEvents(std::vector* instructions) const; platform::DeviceContext* ParseDeviceContext( const OpFuncNode& op_func_node) const; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 070230af4d7867..a0aa82102e315d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -33,15 +33,6 @@ #endif #include "paddle/phi/backends/device_manager.h" -// The difference between "sequential_run" and "serial_run": -// "sequential_run" dispatches OPs one by one according to the sequence in the -// Program, while "serial_run" ensures that all Ops are scheduled in a singal -// thread. In standalone executor, "sequential_run" is also "serial_run", while -// "serial_run" is not necessarily "sequential_run". -PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, - false, - "Enable sequential execution for standalone " - "executor, only applied to GPU OPs."); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, false, "Use inplace in new executor"); @@ -519,9 +510,7 @@ void InterpreterCore::BuildOperatorDependences() { // and set the dependecy_count_ size_t instr_num = vec_instruction_.size(); dependecy_count_.resize(instr_num); - auto downstream_map = dependency_builder_.Build( - vec_instruction_, - /*is_sequential_run=*/FLAGS_new_executor_sequential_run); + auto downstream_map = dependency_builder_.Build(vec_instruction_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { Instruction& cur_instr = vec_instruction_[instr_id]; @@ -588,7 +577,13 @@ void InterpreterCore::Convert( BuildOperatorDependences(); - stream_analyzer_.ConstructEvents(dependency_builder_, &vec_instruction_); + // NOTE(Ruibiao): For cross-step stream synchronization, an event may be + // recorded in the first step and waited in the second step. So, in the first + // step, the WaitEvent may be called without RecordEvent. Considering that + // before the first call to RecordEvent, an Event represents an empty set of + // work and WaitEvent always return succeed immediately, we omit the + // prelude-record for the first step here. + stream_analyzer_.ConstructEvents(&vec_instruction_); // add event for the input var of jit program, since there are async copied // from gpu_pinned place to gpu place on compute stream. 
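Note on the cross-step event construction in the interpretercore.cc / stream_analyzer.cc changes above: ConstructEvents now builds dependencies over a doubled instruction list, so that an event recorded near the end of step N can be waited on at the start of step N+1. Waiter ids that land in the duplicated half are folded back into the real (single-step) range, while duplicated recorder ids are skipped as redundant records. The snippet below is a minimal illustrative sketch of that index folding only, not part of this patch; the helper name MapMergedInstrId is hypothetical.

    #include <cassert>
    #include <cstddef>

    // Indices in [0, n) refer to instructions of the current step; indices in
    // [n, 2n) refer to the same instructions replayed in the next step and are
    // folded back onto their single-step counterparts (as the pass does for
    // waiter ids; recorder ids in [n, 2n) are simply dropped as redundant).
    inline size_t MapMergedInstrId(size_t merged_id, size_t n) {
      assert(merged_id < 2 * n);
      return merged_id >= n ? merged_id - n : merged_id;
    }
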
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index f42270f34a2205..9fb4e0b7eebaf5 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -301,7 +301,7 @@ class Instruction { void AddEventToRecord(std::shared_ptr event, platform::DeviceType waiter_type) { - event_to_record_ = std::make_unique(id_, event, waiter_type); + event_to_record_ = std::make_shared(id_, event, waiter_type); } void AddEventToWait(size_t instr_id, @@ -379,7 +379,7 @@ class Instruction { std::vector next_instrs_in_different_thread; std::vector next_instrs_in_same_thread; - std::unique_ptr event_to_record_; + std::shared_ptr event_to_record_; std::vector events_to_wait_; OpFuncNode op_func_node_; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index f67891feccc5ce..f765d9c22bbd5b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -253,6 +253,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // problem, so we filter them out. std::vector params_not_shared; + auto *scope = param_scope(); // The node->inputs contains input tensors and parameters. for (auto *x : node->inputs) { input_names.insert(x->Name()); @@ -264,6 +265,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp( x->outputs.size() <= 1) { params_not_shared.push_back(x->Name()); } + // When TRT Engine's input is INT64, we need do some extra work. + // So we reserved a name for later use when casting INT64 -> INT32. + // We must check whether scope has had the same name var! + if (x->Var()->GetDataType() == framework::proto::VarType::INT64) { + std::string tmp_name = x->Name() + "_cast_to_INT32"; + LOG(WARNING) + << "tensorrt_subgraph's input named " << tmp_name + << " having int64 dtype in pdmodel description, we will cast them to " + "int32 dtype to feed them into paddle-trt."; + PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), + nullptr, + platform::errors::InvalidArgument( + "The var name %s has exists in scope.", tmp_name)); + scope->Var(tmp_name); + } } auto model_precision = @@ -273,13 +289,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::set output_names; std::set output_names_with_id; - std::map origin_name_output_dims; + std::map origin_name_output_rank; std::unordered_set trt_outputs; + // record the origin output data type + std::vector origin_outputs_dtype; + std::map map_origin_outputs_dtype; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); - origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); + origin_name_output_rank[x->Name()] = x->Var()->GetShape().size(); trt_outputs.insert(x); + map_origin_outputs_dtype[x->Name()] = + static_cast(x->Var()->GetDataType()); } OutputProcess( @@ -353,14 +374,34 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // output_mapping help us copy the data from the renamed ITensor // to Tensor. 
std::vector output_mapping; - std::vector renamed_output_dims; + std::vector renamed_output_rank; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); - renamed_output_dims.push_back(origin_name_output_dims[name]); + renamed_output_rank.push_back(origin_name_output_rank[name]); + origin_outputs_dtype.push_back(map_origin_outputs_dtype[name]); + + // When TRT Engine's output is INT64, we need do some extra work. + // So we reserved a name for later use when casting INT32 -> INT64. + // We must check whether scope has had the same name var! + if (static_cast( + map_origin_outputs_dtype[name]) == + framework::proto::VarType::INT64) { + std::string tmp_name = name + "_cast_to_INT64"; + LOG(WARNING) << "tensorrt_subgraph's output named " << tmp_name + << " having int64 dtype in pdmodel description, but in fact " + "it is int32 " + "dtype after executing this tensorrt_subgraph, so we " + "need cast them into int64."; + PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), + nullptr, + platform::errors::InvalidArgument( + "The var name %s has exists in scope.", tmp_name)); + scope->Var(tmp_name); + } } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, @@ -381,11 +422,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); + op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype); op_desc->SetAttr("max_batch_size", max_batch_size); op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); - op_desc->SetAttr("origin_output_dims", renamed_output_dims); + op_desc->SetAttr("origin_output_rank", renamed_output_rank); op_desc->SetAttr("parameters", params); op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); @@ -548,7 +590,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). 
This process may cost a lot of time."; - auto *scope = param_scope(); framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); std::unordered_set param_set(params.begin(), params.end()); inference::Singleton::Global() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index b4d39e687203e3..58b0d2a1189ad7 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -948,12 +948,7 @@ void AnalysisConfig::Update() { #endif } -#ifdef PADDLE_WITH_MKLDNN - // Do not optimize when mkldnn is on - if (enable_memory_optim_ && !use_mkldnn_) { -#else if (enable_memory_optim_) { -#endif pass_builder()->AppendAnalysisPass("memory_optimize_pass"); } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 0cb7191ce7d261..518404ad197756 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -211,6 +211,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "delete_quant_dequant_linear_op_pass", // "delete_weight_dequant_linear_op_pass", // "map_depthwise_conv_to_conv_pass", // + "constant_folding_pass", // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "embedding_eltwise_layernorm_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 255ef5d6d61945..301136d3533e04 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -732,7 +732,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( for (int i = 0; i < weight_tensor.numel(); i++) { int32_data[i] = int64_data[i]; } - weight.SetDataType(phi::DataType::FLOAT32); + weight.SetDataType(phi::DataType::INT32); weight.SetValues(int32_data); } else { paddle::framework::TensorCopySync( diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 91876ab1544e1e..81e4ca89805ab2 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -60,6 +60,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { case FluidDT::VarType_Type_FP32: return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: + case FluidDT::VarType_Type_INT64: return TRT_DT::kINT32; case FluidDT::VarType_Type_FP16: return TRT_DT::kHALF; @@ -68,10 +69,9 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kBOOL; #endif default: - return TRT_DT::kINT32; + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in TRT op converter")); } - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown fluid datatype in TRT op converter")); return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 4367927bb17344..98d865247de54f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -284,6 +284,15 @@ struct SimpleOpTypeSetTeller : public Teller { } } #endif + auto* block = desc.Block(); + if (block) { + auto* filter_var_desc = block->FindVar(desc.Input("Filter")[0]); + if (!filter_var_desc->Persistable()) { + VLOG(3) << "Trt not support filter is a intermediate tensor in " + "conv2d op."; + return false; + } + } } if (op_type == "deformable_conv") { @@ -1890,8 +1899,9 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } else { -#if !IS_TRT_VERSION_GE(8100) - VLOG(3) << "The version of 
TRT must be greater than 8100"; +#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \ + (IS_TRT_VERSION_LT(7200)) + VLOG(3) << "There are some bugs with trt 8.0"; return false; #endif } diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index e643efcb8b9f56..f93cb32a850efb 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -36,7 +36,7 @@ namespace operators { using DataLayout = phi::DataLayout; template -using CudnnDataType = platform::CudnnDataType; +using CudnnDataType = phi::backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 40cdb68329fb27..0e579010a91d79 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 4023aaa8445f95..35d1b45408b1f4 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -21,10 +21,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -91,7 +91,7 @@ class FusedBatchNormActKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, @@ -257,7 +257,7 @@ class FusedBatchNormActGradKernel "The Input dim size should be between 2 and 5")); int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 4c4756b8e19792..4d92a7865eb2c9 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -20,11 +20,11 @@ #include "paddle/fluid/framework/data_layout.h" #include 
"paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -85,7 +85,7 @@ class FusedBatchNormAddActKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); @@ -231,7 +231,7 @@ class FusedBatchNormAddActGradKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt index 53cbb990dc1ee9..62c11faadaf209 100644 --- a/paddle/fluid/operators/generator/CMakeLists.txt +++ b/paddle/fluid/operators/generator/CMakeLists.txt @@ -5,6 +5,7 @@ include(phi) set(op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml) set(legacy_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml) set(bw_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml) +set(static_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/static_ops.yaml) set(legacy_bw_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml) set(sparse_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml) @@ -29,10 +30,14 @@ set(parsed_op_dir ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops) set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) +set(generated_static_op_path + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_static_op.cc) set(generated_sparse_ops_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_sparse_op.cc) set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) +set(generated_static_argument_mapping_path + ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_static_sig.cc) set(generated_sparse_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sparse_sig.cc) @@ -54,6 +59,8 @@ execute_process( COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${legacy_bw_op_yaml_file} --output_path ./parsed_ops/legacy_backward_ops.parsed.yaml --backward + COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${static_op_yaml_file} + --output_path ./parsed_ops/static_ops.parsed.yaml COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${sparse_op_yaml_file} --output_path ./parsed_ops/sparse_ops.parsed.yaml COMMAND @@ -75,7 +82,8 @@ execute_process( COMMAND ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths ./parsed_ops/ops.parsed.yaml ./parsed_ops/legacy_ops.parsed.yaml - --backward_yaml_paths ./parsed_ops/backward_ops.parsed.yaml + ./parsed_ops/static_ops.parsed.yaml --backward_yaml_paths + ./parsed_ops/backward_ops.parsed.yaml ./parsed_ops/legacy_backward_ops.parsed.yaml RESULT_VARIABLE _result) if(${_result}) @@ -113,6 +121,20 @@ if(${_result}) message(FATAL_ERROR "operator codegen failed, exiting.") 
endif() +execute_process( + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator + COMMAND + ${PYTHON_EXECUTABLE} generate_static_op.py --ops_yaml_path + ./parsed_ops/static_ops.parsed.yaml --op_version_yaml_path + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_version.yaml + --op_compat_yaml_path ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml + --output_op_path "${generated_static_op_path}.tmp" --output_arg_map_path + "${generated_static_argument_mapping_path}.tmp" + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "operator codegen failed, exiting.") +endif() + execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator COMMAND @@ -126,84 +148,33 @@ if(${_result}) message(FATAL_ERROR "sparse operator codegen failed, exiting.") endif() -if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${generated_op_path}.tmp" "${generated_op_path}") - message("copy if different ${generated_op_path}.tmp ${generated_op_path}") -elseif(EXISTS "${generated_op_path}.tmp") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" - "${generated_op_path}") - message("copy ${generated_op_path}.tmp ${generated_op_path}") -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}") - message("remove ${generated_op_path}") -endif() - -if(EXISTS "${generated_sparse_ops_path}.tmp" AND EXISTS - "${generated_sparse_ops_path}") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${generated_sparse_ops_path}.tmp" "${generated_sparse_ops_path}") - message( - "copy if different ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}" - ) -elseif(EXISTS "${generated_sparse_ops_path}.tmp") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy "${generated_sparse_ops_path}.tmp" - "${generated_sparse_ops_path}") - message("copy ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}") -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_sparse_ops_path}") - message("remove ${generated_sparse_ops_path}") -endif() - -if(EXISTS "${generated_argument_mapping_path}.tmp" - AND EXISTS "${generated_argument_mapping_path}") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy_if_different - "${generated_argument_mapping_path}.tmp" - "${generated_argument_mapping_path}") - message( - "copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" - ) -elseif(EXISTS "${generated_argument_mapping_path}.tmp") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" - "${generated_argument_mapping_path}") - message( - "copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" - ) -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_argument_mapping_path}") - message("remove ${generated_argument_mapping_path}") -endif() - -if(EXISTS "${generated_sparse_argument_mapping_path}.tmp" - AND EXISTS "${generated_sparse_argument_mapping_path}") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy_if_different - "${generated_sparse_argument_mapping_path}.tmp" - "${generated_sparse_argument_mapping_path}") - message( - "copy if different ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" - ) -elseif(EXISTS "${generated_sparse_argument_mapping_path}.tmp") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy 
"${generated_sparse_argument_mapping_path}.tmp" - "${generated_sparse_argument_mapping_path}") - message( - "copy ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" - ) -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_sparse_argument_mapping_path}") - message("remove ${generated_sparse_argument_mapping_path}") -endif() +set(generated_static_files + "${generated_op_path}" + "${generated_static_op_path}" + "${generated_sparse_ops_path}" + "${generated_argument_mapping_path}" + "${generated_static_argument_mapping_path}" + "${generated_sparse_argument_mapping_path}") + +foreach(generated_static_file ${generated_static_files}) + if(EXISTS "${generated_static_file}.tmp" AND EXISTS + "${generated_static_file}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${generated_static_file}.tmp" "${generated_static_file}") + message( + "copy if different ${generated_static_file}.tmp ${generated_static_file}") + elseif(EXISTS "${generated_static_file}.tmp") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy "${generated_static_file}.tmp" + "${generated_static_file}") + message("copy ${generated_static_file}.tmp ${generated_static_file}") + else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f + "${generated_static_file}") + message("remove ${generated_static_file}") + endif() +endforeach() # op extra info file set(ops_extra_info_gen_file diff --git a/paddle/fluid/operators/generator/generate_static_op.py b/paddle/fluid/operators/generator/generate_static_op.py new file mode 100644 index 00000000000000..b24e60dc4da1ad --- /dev/null +++ b/paddle/fluid/operators/generator/generate_static_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +from pathlib import Path + +import yaml +from filters import ( + cartesian_prod_mapping, + to_input_name, + to_int_array_tensor_name, + to_int_array_tensors_name, + to_op_attr_type, + to_opmaker_name, + to_opmaker_name_cstr, + to_pascal_case, + to_scalar_tensor_name, +) +from generate_op import replace_compat_name +from jinja2 import Environment, FileSystemLoader, StrictUndefined +from parse_utils import to_named_dict +from tests import ( + is_base_op, + is_initializer_list, + is_scalar, + is_vec, + supports_inplace, + supports_no_need_buffer, +) + +file_loader = FileSystemLoader(Path(__file__).parent / "templates") +env = Environment( + loader=file_loader, + keep_trailing_newline=True, + trim_blocks=True, + lstrip_blocks=True, + undefined=StrictUndefined, + extensions=['jinja2.ext.do'], +) +env.filters["to_op_attr_type"] = to_op_attr_type +env.filters["to_opmaker_name"] = to_opmaker_name +env.filters["to_pascal_case"] = to_pascal_case +env.filters["to_scalar_tensor_name"] = to_scalar_tensor_name +env.filters["to_int_array_tensor_name"] = to_int_array_tensor_name +env.filters["to_int_array_tensors_name"] = to_int_array_tensors_name +env.filters["to_input_name"] = to_input_name +env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr +env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping +env.tests["base_op"] = is_base_op +env.tests["vec"] = is_vec +env.tests["scalar"] = is_scalar +env.tests["initializer_list"] = is_initializer_list +env.tests["supports_inplace"] = supports_inplace +env.tests["supports_no_need_buffer"] = supports_no_need_buffer + + +def restruct_io(op): + op["input_dict"] = to_named_dict(op["inputs"]) + op["attr_dict"] = to_named_dict(op["attrs"]) + op["output_dict"] = to_named_dict(op["outputs"]) + return op + + +def main( + ops_yaml_path, + op_compat_yaml_path, + op_version_yaml_path, + output_op_path, + output_arg_map_path, +): + with open(ops_yaml_path, "rt") as f: + ops = yaml.safe_load(f) + ops = [restruct_io(op) for op in ops] + forward_op_dict = to_named_dict(ops) + + with open(op_version_yaml_path, "rt") as f: + op_versions = yaml.safe_load(f) + + # add op version info into op + for op_version in op_versions: + if op_version['op'] in forward_op_dict: + forward_op_dict[op_version['op']]['version'] = op_version['version'] + + with open(op_compat_yaml_path, "rt") as f: + op_op_map = yaml.safe_load(f) + + for op in ops: + op['op_name'] = op['name'] + + replace_compat_name(op_op_map, forward_op_dict, {}) + + if len(ops) == 0: + if os.path.isfile(output_op_path): + os.remove(output_op_path) + if os.path.isfile(output_arg_map_path): + os.remove(output_arg_map_path) + return + + op_template = env.get_template('op.c.j2') + with open(output_op_path, "wt") as f: + msg = op_template.render( + ops=ops, backward_ops=[], op_dict=forward_op_dict + ) + f.write(msg) + + ks_template = env.get_template('ks.c.j2') + with open(output_arg_map_path, 'wt') as f: + msg = ks_template.render(ops=ops, backward_ops=[]) + f.write(msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate operator file from op yaml." + ) + parser.add_argument( + '--ops_yaml_path', type=str, help="parsed static ops yaml file." + ) + parser.add_argument( + '--op_compat_yaml_path', type=str, help="ops args compat yaml file." + ) + parser.add_argument( + '--op_version_yaml_path', type=str, help="ops version yaml file." + ) + parser.add_argument( + "--output_op_path", type=str, help="path to save generated operators." 
+ ) + parser.add_argument( + "--output_arg_map_path", + type=str, + help="path to save generated argument mapping functions.", + ) + + args = parser.parse_args() + main( + args.ops_yaml_path, + args.op_compat_yaml_path, + args.op_version_yaml_path, + args.output_op_path, + args.output_arg_map_path, + ) diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc deleted file mode 100644 index ebb588e8996b8c..00000000000000 --- a/paddle/fluid/operators/log_loss_op.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class LogLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Predicted", - "The input value (Predicted) of Log loss op." - "Predicted is a 2-D tensor with shape [batch_size, 1]."); - AddInput("Labels", - "The target value (Labels) of Log loss op." - "Labels is a 2-D tensor with shape [batch_size, 1]."); - AddOutput("Loss", - "The output tensor with shape [batch_size, 1] " - "which represents the log loss."); - AddAttr("epsilon", "Epsilon in log loss."); - AddComment(R"DOC( -LogLoss Operator. - -Log loss is a loss function used for binary classification. Log Loss quantifies -the accuracy of a classifier by penalising false classifications. Minimising the -Log Loss is equivalent to maximising the accuracy of the classifier. We define -Predicted as the values predicted by our model and Labels as the target ground -truth value. Log loss can evaluate how close the predicted values are to the -target. The shapes of Predicted and Labels are both [batch_size, 1]. 
-The equation is: - -$$ -Loss = - Labels * log(Predicted + \epsilon) - - (1 - Labels) * log(1 - Predicted + \epsilon) -$$ - -)DOC"); - } -}; - -class LogLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Predicted"), "Input", "Predicted", "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Loss")), - "Input", - framework::GradVarName("Loss"), - "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Predicted")), - "Output", - framework::GradVarName("Predicted"), - "LogLossGrad"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); - PADDLE_ENFORCE_EQ(loss_grad_dims, - pred_dims, - platform::errors::InvalidArgument( - "The dimensions of loss_grad must be equal to the " - "dimensions of Predicted," - "But received dimensions of loss_grad is [%s], " - "received Predicted is " - "[%s]", - loss_grad_dims, - pred_dims)); - - auto pred_grad_name = framework::GradVarName("Predicted"); - ctx->SetOutputDim(pred_grad_name, pred_dims); - } -}; - -template -class LogLossGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("log_loss_grad"); - op->SetInput("Predicted", this->Input("Predicted")); - op->SetInput("Labels", this->Input("Labels")); - op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); - op->SetOutput(framework::GradVarName("Predicted"), - this->InputGrad("Predicted")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(log_loss, - LogLossInferShapeFunctor, - PD_INFER_META(phi::LogLossInferMeta)); -REGISTER_OPERATOR(log_loss, - ops::LogLossOp, - ops::LogLossOpMaker, - ops::LogLossGradMaker, - ops::LogLossGradMaker, - LogLossInferShapeFunctor); -REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 65a49dab27df25..5a540b802e60bd 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -99,9 +99,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { case ReshapeKernelOpName::reshape: InferShapeReshapeOp(ctx, x_dims, out_dims); break; - case ReshapeKernelOpName::reshape2: - InferShapeReshape2Op(ctx, x_dims, out_dims); - break; case ReshapeKernelOpName::squeeze: InferShapeSqueezeOp(ctx, x_dims, out_dims); break; @@ -127,17 +124,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); } - void InferShapeReshape2Op(const framework::ExecutionContext& ctx, - framework::DDim& x_dims, // NOLINT - framework::DDim& out_dims) const { // NOLINT - auto* out = ctx.Output("Out"); - auto* xshape = ctx.Output("XShape"); - auto xshape_dims = xshape->dims(); - x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - out_dims = out->dims(); - ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); - } - // in reshape1/2 ops "ShapeTensor" has highest priority and "Shape" has // second highest priority void ChangeReshapeOutDimsIfNeeded( @@ 
-400,14 +386,6 @@ REGISTER_OP_KERNEL( ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL( - reshape2, - MKLDNN, - paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - REGISTER_OP_KERNEL( reshape2_grad, MKLDNN, diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h deleted file mode 100644 index edaf19f68f1b38..00000000000000 --- a/paddle/fluid/operators/norm_utils.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; - -inline void ExtractNCWHD(const framework::DDim &dims, - const DataLayout &data_layout, - int *N, - int *C, - int *H, - int *W, - int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) - : 1; - } -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc deleted file mode 100644 index 551dba6d839ed4..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/ternary.h" - -namespace paddle { -namespace operators { - -class PutAlongAxisOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context()); - } -}; - -class PutAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor of PutAlongAxisOp"); - AddInput("Index", "The index tensor of PutAlongAxisOp"); - AddInput("Value", "The value tensor of PutAlongAxisOp"); - AddOutput("Result", "The result tensor of PutAlongAxisOp"); - AddAttr("Axis", "The axis that we do PutAlongAxis operation"); - AddAttr("Reduce", "The reduce operation for scatter") - .SetDefault("assign"); - AddComment(R"DOC( - PutAlongAxis Operator.) - )DOC"); - } -}; - -class PutAlongAxisGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Result")), - ctx.device_context()); - } -}; - -template -class PutAlongAxisGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("put_along_axis_grad"); - op->SetInput("Index", this->Input("Index")); - op->SetInput("Input", this->Input("Input")); - - op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetOutput(framework::GradVarName("Value"), this->InputGrad("Value")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(PutAlongAxisInplaceInferer, {"Input", "Result"}); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(put_along_axis, - PutAlongAxisInferShapeFunctor, - PD_INFER_META(phi::PutAlongAxisInferMeta)); -REGISTER_OPERATOR(put_along_axis, - ops::PutAlongAxisOp, - ops::PutAlongAxisOpMaker, - ops::PutAlongAxisGradOpMaker, - ops::PutAlongAxisGradOpMaker, - paddle::operators::PutAlongAxisInplaceInferer, - PutAlongAxisInferShapeFunctor); - -REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc deleted file mode 100644 index 1beb06366ea919..00000000000000 --- a/paddle/fluid/operators/searchsorted_op.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class SearchSortedOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "SortedSequence"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("SortedSequence", - "(Tensor), N-D or 1-D tensor, The value of the tensor" - "monotonically increases in the innermost dimension."); - AddInput("Values", "(Tensor), N-D tensor given values."); - AddOutput("Out", "(Tensor), The output tensor of searchsorted op."); - AddAttr("out_int32", - "the output tensor is int64 type if False and on the" - "contrary for int32") - .SetDefault(false); - AddAttr( - "right", - "corresponding to lower bound if False and upper bound if True") - .SetDefault(false); - - AddComment(R"DOC( - Searchsorted Operator. - - This OP is used to find the index of the corresponding sorted_sequence in the innermost dimension based on the given values. - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, - SearchsortedInferShapeFunctor, - PD_INFER_META(phi::SearchsortedInferMeta)); -REGISTER_OPERATOR(searchsorted, - ops::SearchSortedOp, - ops::SearchSortedOpMaker, - SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc deleted file mode 100644 index afbfd80b8d5379..00000000000000 --- a/paddle/fluid/operators/svd_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SvdOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class SvdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of svd op."); - AddOutput("U", "(Tensor), The output U tensor of svd op."); - AddOutput("S", "(Tensor), The output S tensor of svd op."); - AddOutput("VH", "(Tensor), The output VH tensor of svd op."); - AddAttr("full_matrices", - "(bool, default false) Only Compute the thin U and V" - "when set as True, the gradient have some random " - "attribute.") - .SetDefault(false); - AddComment(R"DOC( -Svd Operator. - -This operator is used to perform SVD operation for batched matrics $X$. -$$U, S, VH = svd(X)$$ - -)DOC"); - } -}; - -class SvdGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("U")), - "Input", - "U@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("VH")), - "Input", - "VH@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("S")), - "Input", - "S@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("S"), "Input", "S", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("VH"), "Input", "VH", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@Grad", - "SvdGrad"); - - auto d_x = ctx->GetInputDim(("X")); - ctx->SetOutputDim(framework::GradVarName("X"), d_x); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(dtype, ctx.GetPlace()); - } -}; - -template -class SvdGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("svd_grad"); - retv->SetInput(framework::GradVarName("U"), this->OutputGrad("U")); - retv->SetInput(framework::GradVarName("VH"), this->OutputGrad("VH")); - retv->SetInput(framework::GradVarName("S"), this->OutputGrad("S")); - retv->SetInput("U", this->Output("U")); - retv->SetInput("VH", this->Output("VH")); - retv->SetInput("S", this->Output("S")); - retv->SetInput("X", this->Input("X")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(svd, - SvdInferShapeFunctor, - PD_INFER_META(phi::SvdInferMeta)); - -REGISTER_OPERATOR(svd, - ops::SvdOp, - ops::SvdOpMaker, - ops::SvdGradMaker, - ops::SvdGradMaker, - SvdInferShapeFunctor); - -REGISTER_OPERATOR(svd_grad, ops::SvdGradOp); diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc index 2d037a7c3ecc1a..6d9e161806d820 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -72,7 
+72,7 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { "The Input dim size should be less than 6.")); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); y->mutable_data(ctx.GetPlace()); mean_out->mutable_data(ctx.GetPlace()); @@ -320,7 +320,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { "The Input X dim size should be less than 6.")); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); PADDLE_ENFORCE_EQ(scale->dims()[0], C, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index 46b1ccc140ddb8..b25ca5b3823cef 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -344,7 +344,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { x_dims.size())); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); int x_numel = x->numel(); auto place = ctx.GetPlace(); @@ -598,7 +598,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } int N, C, H, W, D; - ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); int x_numel = x->numel(); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc deleted file mode 100644 index 5e3424a552bf92..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class TakeAlongAxisOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context()); - } -}; - -class TakeAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor of TakeAlongAxisOp"); - AddInput("Index", "The index tensor of TakeAlongAxisOp"); - AddOutput("Result", "The result tensor of TakeAlongAxisOp"); - AddAttr("Axis", - "The Tensor which contains the axis that we do TakeAlongAxis " - "operation."); - AddComment(R"DOC( - Take_along_axis Operator.) 
- )DOC"); - } -}; - -class TakeAlongAxisGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Result")), - ctx.device_context()); - } -}; - -template -class TakeAlongAxisGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("take_along_axis_grad"); - op->SetInput("Index", this->Input("Index")); - op->SetInput("Input", this->Input("Input")); - - op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(take_along_axis, - TakeAlongAxisInferShapeFunctor, - PD_INFER_META(phi::TakeAlongAxisInferMeta)); -REGISTER_OPERATOR(take_along_axis, - ops::TakeAlongAxisOp, - ops::TakeAlongAxisOpMaker, - ops::TakeAlongAxisGradOpMaker, - ops::TakeAlongAxisGradOpMaker, - TakeAlongAxisInferShapeFunctor); - -REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 38cf5e2b823466..579549a4c3ec47 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -21,13 +21,13 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_CUDA - #include #include #include #include #include #include +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/executor.h" @@ -596,7 +596,14 @@ class TensorRTEngineOp : public framework::OperatorBase { if (type == framework::proto::VarType::FP32) { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT64) { - buffers[bind_index] = static_cast(t.data()); + auto int32_tensor = + scope.FindVar(x + "_cast_to_INT32")->GetMutable(); + *int32_tensor = phi::Cast( + reinterpret_cast(dev_ctx), + t, + phi::DataType::INT32); + buffers[bind_index] = + static_cast(int32_tensor->data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::FP16) { @@ -614,8 +621,8 @@ class TensorRTEngineOp : public framework::OperatorBase { // Bind output tensor to TRT. 
int output_index = 0; - std::vector origin_output_dims = - Attr>("origin_output_dims"); + std::vector origin_output_rank = + Attr>("origin_output_rank"); VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { const int bind_index = @@ -636,7 +643,7 @@ class TensorRTEngineOp : public framework::OperatorBase { for (; nb_dims > 0; nb_dims--) { // some 'x 1' of shape is normal, no need to remove it if (dims.d[nb_dims - 1] != 1 || - nb_dims == origin_output_dims[output_index]) + nb_dims == origin_output_rank[output_index]) break; } for (int i = 0; i < nb_dims; i++) ddim.push_back(dims.d[i]); @@ -694,6 +701,28 @@ class TensorRTEngineOp : public framework::OperatorBase { } // Execute the engine. engine->Execute(runtime_batch, &buffers, stream); + + std::vector origin_outputs_dtype = + Attr>("origin_outputs_dtype"); + for (size_t i = 0; i < Outputs("Ys").size(); i++) { + auto type = + static_cast(origin_outputs_dtype[i]); + + if (type == framework::proto::VarType::INT64) { + auto y = Outputs("Ys")[i]; + auto *fluid_v = scope.FindVar(y); + auto *fluid_t = fluid_v->GetMutable(); + auto int32_tensor = + scope.FindVar(y + "_cast_to_INT64")->GetMutable(); + int32_tensor->Resize(fluid_t->dims()); + dev_ctx.Alloc(int32_tensor); + framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor); + *fluid_t = phi::Cast( + reinterpret_cast(dev_ctx), + *int32_tensor, + phi::DataType::INT64); + } + } } TensorRTEngine *GetEngine(const framework::Scope &scope, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 6d37290d151485..49a74cc8800975 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -104,6 +104,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + engine_op_desc.SetAttr("origin_outputs_dtype", std::vector{5}); engine_op_desc.SetBlockAttr("sub_block", &block_desc); engine_op_desc.SetAttr("max_batch_size", static_cast(2)); @@ -119,7 +120,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); - engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("origin_output_rank", std::vector({2})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; @@ -274,7 +275,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); - engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("origin_output_rank", std::vector({2})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 854de23150cad7..c49c30eb65c42d 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -96,13 +96,22 @@ CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif #if 
CUDA_VERSION >= 9020 -#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ - __macro(cusolverDnCreateSyevjInfo); \ - __macro(cusolverDnSsyevj_bufferSize); \ - __macro(cusolverDnDsyevj_bufferSize); \ - __macro(cusolverDnSsyevj); \ - __macro(cusolverDnDsyevj); \ - __macro(cusolverDnDestroySyevjInfo); +#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnDestroySyevjInfo); \ + __macro(cusolverDnXsyevjSetSortEig); \ + __macro(cusolverDnSsyevjBatched_bufferSize); \ + __macro(cusolverDnDsyevjBatched_bufferSize); \ + __macro(cusolverDnCheevjBatched_bufferSize); \ + __macro(cusolverDnZheevjBatched_bufferSize); \ + __macro(cusolverDnSsyevjBatched); \ + __macro(cusolverDnDsyevjBatched); \ + __macro(cusolverDnCheevjBatched); \ + __macro(cusolverDnZheevjBatched); CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 2e5ca9ff4916e2..4cd0c5ab341c72 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -676,6 +676,16 @@ backward : log_double_grad inplace : (out_grad -> x_grad) +- backward_op : log_loss_grad + forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) + args : (Tensor input, Tensor label, Tensor out_grad, float epsilon) + output : Tensor(input_grad) + infer_meta : + func : UnchangedInferMeta + param : [input] + kernel : + func : log_loss_grad + - backward_op : logit_grad forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) args : (Tensor x, Tensor out_grad, float eps) @@ -779,6 +789,16 @@ kernel : func : poisson_grad +- backward_op : put_along_axis_grad + forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce = "assign") -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + output : Tensor(arr_grad), Tensor(value_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [arr, indices] + kernel : + func : put_along_axis_grad + - backward_op : qr_grad forward : qr (Tensor x, str mode = "reduced") -> Tensor(q), Tensor(r) args : (Tensor x, Tensor q, Tensor r, Tensor q_grad, Tensor r_grad, str mode) @@ -1062,6 +1082,27 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : svd_grad + forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh) + args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : svd_grad + optional: u_grad, vh_grad, s_grad + +- backward_op : take_along_axis_grad + forward : take_along_axis (Tensor arr, Tensor indices, int axis) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis) + output : Tensor(arr_grad) + infer_meta : + func : UnchangedInferMeta + param : [arr] + kernel : + func : take_along_axis_grad + - backward_op : tan_grad forward : tan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4001a75d0fa1d4..51e49ef831c473 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -745,16 +745,6 @@ func : linear_interp_grad data_type : 
output_grad -- backward_op : log_loss_grad - forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) - args : (Tensor input, Tensor label, Tensor out_grad, float epsilon) - output : Tensor(input_grad) - infer_meta : - func : UnchangedInferMeta - param : [input] - kernel : - func : log_loss_grad - - backward_op : log_softmax_grad forward : log_softmax(Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) @@ -1195,17 +1185,6 @@ data_type : x optional : boxes_num -# output is optional -- backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce) -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) - output : Tensor(arr_grad), Tensor(value_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [arr, indices] - kernel : - func : put_along_axis_grad - - backward_op : real_grad forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -1573,17 +1552,6 @@ no_need_buffer : x backward : sum_double_grad -- backward_op : svd_grad - forward : svd (Tensor x, bool full_matrices) -> Tensor(u), Tensor(s), Tensor(vh) - args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : svd_grad - optional: u_grad, vh_grad, s_grad - - backward_op : swish_grad forward : swish (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad, float bete=1.0) @@ -1607,16 +1575,6 @@ data_type : out_grad optional : reserve_space -- backward_op : take_along_axis_grad - forward : take_along_axis (Tensor arr, Tensor indices, int axis) -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis) - output : Tensor(arr_grad) - infer_meta : - func : UnchangedInferMeta - param : [arr] - kernel : - func : take_along_axis_grad - - backward_op : temporal_shift_grad forward : temporal_shift(Tensor x, int seg_num, float shift_ratio, str data_format_str) -> Tensor(out) args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format_str) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 2382739377eece..4b697f0182be29 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1068,15 +1068,6 @@ data_type : dtype backend : place -- op : log_loss - args : (Tensor input, Tensor label, float epsilon) - output : Tensor - infer_meta : - func : LogLossInferMeta - kernel : - func : log_loss - backward : log_loss_grad - - op : log_softmax args : (Tensor x, int axis) output : Tensor(out) @@ -1555,18 +1546,6 @@ optional : boxes_num backward : psroi_pool_grad -- op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [arr] - kernel : - func : put_along_axis - data_type : arr - inplace : (arr -> out) - backward : put_along_axis_grad - - op : randint args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) output : Tensor(out) @@ -1750,15 +1729,6 @@ func : scatter_nd_add backward : scatter_nd_add_grad -- op : searchsorted - args : (Tensor sorted_sequence, Tensor values, bool out_int32, bool right) - output : Tensor(out) - infer_meta : - func : SearchsortedInferMeta - kernel : - func : searchsorted - data_type : sorted_sequence - - op : segment_pool args : (Tensor x, Tensor 
segment_ids, str pooltype) output : Tensor(out), Tensor(summed_ids) @@ -1968,15 +1938,6 @@ data_type : x backward : sum_grad -- op : svd - args : (Tensor x, bool full_matrices) - output : Tensor(u), Tensor(s), Tensor(vh) - infer_meta : - func : SvdInferMeta - kernel : - func : svd - backward : svd_grad - - op : swish args : (Tensor x) output : Tensor(out) @@ -1998,17 +1959,6 @@ backward : sync_batch_norm_grad inplace : (mean -> mean_out), (variance -> variance_out) -- op : take_along_axis - args : (Tensor arr, Tensor indices, int axis) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [indices] - kernel : - func : take_along_axis - data_type : arr - backward : take_along_axis_grad - - op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio, str data_format_str) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 4945794dc58f5c..63c6c5c38f54f7 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -750,6 +750,13 @@ extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] +- op : log_loss + backward : log_loss_grad + inputs : + {input : Predicted, label : Labels} + outputs : + out : Loss + - op : log_softmax backward : log_softmax_grad extra : @@ -916,6 +923,15 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] +- op : put_along_axis + backward : put_along_axis_grad + inputs : + {arr : Input, indices : Index, values : Value} + outputs : + out : Result + attrs : + {axis : Axis, reduce : Reduce} + - op : qr backward : qr_grad inputs : @@ -1029,6 +1045,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : searchsorted + inputs : + {sorted_sequence : SortedSequence, values : Values} + outputs : + out : Out + - op : seed extra : attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] @@ -1176,6 +1198,13 @@ attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] +- op : svd + backward : svd_grad + inputs : + x : X + outputs : + {u : U, s : S, vh : VH} + - op : swish backward : swish_grad extra : @@ -1186,6 +1215,15 @@ extra : attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] +- op : take_along_axis + backward : take_along_axis_grad + inputs : + {arr : Input, indices : Index} + outputs : + out : Result + attrs : + axis : Axis + - op : tan backward : tan_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e51f23dda220fa..32fe25624fe7b9 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -628,6 +628,15 @@ func : log2 backward: log2_grad +- op : log_loss + args : (Tensor input, Tensor label, float epsilon) + output : Tensor + infer_meta : + func : LogLossInferMeta + kernel : + func : log_loss + backward : log_loss_grad + - op : logit args : (Tensor x, float eps = 1e-6f) output : Tensor @@ -741,6 +750,18 @@ func : poisson backward : poisson_grad +- op : put_along_axis + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [arr] + kernel : + func : put_along_axis + data_type : arr + inplace : (arr -> out) + backward : put_along_axis_grad + - op : qr args : (Tensor x, str mode = "reduced") output : Tensor(q), Tensor(r) @@ -800,6 +821,15 @@ inplace : (x -> 
out) backward : rsqrt_grad +- op : searchsorted + args : (Tensor sorted_sequence, Tensor values, bool out_int32 = false, bool right = false) + output : Tensor(out) + infer_meta : + func : SearchsortedInferMeta + kernel : + func : searchsorted + data_type : sorted_sequence + - op : send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -907,6 +937,26 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : svd + args : (Tensor x, bool full_matrices = false) + output : Tensor(u), Tensor(s), Tensor(vh) + infer_meta : + func : SvdInferMeta + kernel : + func : svd + backward : svd_grad + +- op : take_along_axis + args : (Tensor arr, Tensor indices, int axis) + output : Tensor + infer_meta : + func : TakeAlongAxisInferMeta + param : [arr, indices, axis] + kernel : + func : take_along_axis + data_type : arr + backward : take_along_axis_grad + - op : tan args : (Tensor x) output : Tensor @@ -981,11 +1031,3 @@ kernel : func : unfold backward : unfold_grad - -- op: share_buffer - args : (Tensor[] x, bool[] share_dims_and_dtype={}) - output : Tensor[](out){x.size()}, Tensor[](xout){x.size()} - infer_meta : - func : ShareBufferInferMeta - kernel : - func : share_buffer diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml new file mode 100644 index 00000000000000..1849b9f6c1e48e --- /dev/null +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -0,0 +1,7 @@ +- op : share_buffer + args : (Tensor[] x, bool[] share_dims_and_dtype={}) + output : Tensor[](out){x.size()}, Tensor[](xout){x.size()} + infer_meta : + func : ShareBufferInferMeta + kernel : + func : share_buffer diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h index 1354e310554804..a86e85144fd7fb 100644 --- a/paddle/phi/backends/dynload/cusolver.h +++ b/paddle/phi/backends/dynload/cusolver.h @@ -108,13 +108,22 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif #if CUDA_VERSION >= 9020 -#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ - __macro(cusolverDnCreateSyevjInfo); \ - __macro(cusolverDnSsyevj_bufferSize); \ - __macro(cusolverDnDsyevj_bufferSize); \ - __macro(cusolverDnSsyevj); \ - __macro(cusolverDnDsyevj); \ - __macro(cusolverDnDestroySyevjInfo); +#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnDestroySyevjInfo); \ + __macro(cusolverDnXsyevjSetSortEig); \ + __macro(cusolverDnSsyevjBatched_bufferSize); \ + __macro(cusolverDnDsyevjBatched_bufferSize); \ + __macro(cusolverDnCheevjBatched_bufferSize); \ + __macro(cusolverDnZheevjBatched_bufferSize); \ + __macro(cusolverDnSsyevjBatched); \ + __macro(cusolverDnDsyevjBatched); \ + __macro(cusolverDnCheevjBatched); \ + __macro(cusolverDnZheevjBatched); CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 6e87f40ed0ab07..467552032f0ad6 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -56,6 +56,9 @@ void Copy(const Context& dev_ctx, void* dst_ptr = nullptr; if (paddle::platform::is_cpu_place(dst_place)) { dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_layout(src.layout()); +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if 
(paddle::platform::is_gpu_place(dst_place) || paddle::platform::is_cuda_pinned_place(dst_place)) { @@ -81,7 +84,7 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( dst->place(), dst_place, - phi::errors::Unavailable( + errors::Unavailable( "The Dst Tensor's place and dst_place do not match, Tensor's place " "place is %s, dst_place is %s.", dst->place(), @@ -112,13 +115,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - phi::errors::Unavailable( + errors::Unavailable( "Source place and context place do not match, source " "place is %s, context place is %s.", src_gpu_place, @@ -137,17 +140,17 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_gpu_place, - ctx_gpu_place, - phi::errors::Unavailable( - "Destination place and context place do not match, " - "destination place is %s, context place is %s.", - dst_gpu_place, - ctx_gpu_place)); + PADDLE_ENFORCE_EQ( + dst_gpu_place, + ctx_gpu_place, + errors::Unavailable("Destination place and context place do not match, " + "destination place is %s, context place is %s.", + dst_gpu_place, + ctx_gpu_place)); auto stream = blocking ? nullptr : reinterpret_cast(dev_ctx).stream(); @@ -161,7 +164,7 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto stream = @@ -184,7 +187,7 @@ void Copy(const Context& dev_ctx, paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - PADDLE_THROW(phi::errors::Unavailable( + PADDLE_THROW(errors::Unavailable( "Context place dose not match the source and destination place.")); } } @@ -196,13 +199,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - phi::errors::Unavailable( + errors::Unavailable( "Source place and context place do not match, source " "place is %s, context place is %s.", src_gpu_place, @@ -259,7 +262,7 @@ void Copy(const Context& dev_ctx, paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); #endif } else { - PADDLE_THROW(phi::errors::Unimplemented( + PADDLE_THROW(errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); } } @@ -411,4 +414,12 @@ template void Copy(const CustomContext& dev_ctx, bool blocking, DenseTensor* dst); #endif + +#ifdef PADDLE_WITH_MKLDNN +template void Copy(const OneDNNContext& dev_ctx, + const DenseTensor& src, + Place dst_place, + bool blocking, + DenseTensor* dst); +#endif } // namespace phi diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h similarity index 98% rename from paddle/fluid/operators/norm_utils.cu.h rename to paddle/phi/kernels/funcs/norm_utils.cu.h 
index 2412913995b95c..0971db10529a96 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -24,8 +24,7 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef __HIPCC__ @@ -34,8 +33,8 @@ namespace cub = hipcub; #define LAUNCH_BOUNDS(BlockDim) #endif -namespace paddle { -namespace operators { +namespace phi { +namespace funcs { using DataLayout = phi::DataLayout; @@ -464,7 +463,8 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, const int sample_size = num / N / C; phi::DenseTensor scale_tmp; if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); + scale_tmp.Resize({C}); + ctx.template Alloc(&scale_tmp); set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); @@ -495,7 +495,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); + T *dx_data = ctx.template Alloc(dX); set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -552,7 +552,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } if (dScale) { - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); + T *dscale_data = ctx.template Alloc(dScale); set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -605,7 +605,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); + T *ddy_data = ctx.template Alloc(ddY); set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -670,5 +670,5 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } } -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 88bef61fa921ff..63202ca4a484d1 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -13,10 +13,10 @@ // limitations under the License. 
#pragma once - #include "paddle/fluid/memory/memory.h" #ifdef PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/core/errors.h" #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -54,6 +54,137 @@ static void CheckEighResult(const int batch, const int info) { info)); } +#ifdef PADDLE_WITH_CUDA + +#if CUDA_VERSION >= 11031 +static bool use_cusolver_syevj_batched = true; +#else +static bool use_cusolver_syevj_batched = false; +#endif + +#define CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(scalar_t, value_t) \ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, \ + int n, const scalar_t *A, int lda, const value_t *W, int *lwork, \ + syevjInfo_t params, int batchsize + +template +void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(scalar_t, value_t)) { + PADDLE_THROW(phi::errors::InvalidArgument( + "syevjBatched_bufferSize: not implemented for %s", + typeid(scalar_t).name())); +} + +template <> +inline void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(float, float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevjBatched_bufferSize( + handle, jobz, uplo, n, A, lda, W, lwork, params, batchsize)); +} + +template <> +inline void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(double, double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevjBatched_bufferSize( + handle, jobz, uplo, n, A, lda, W, lwork, params, batchsize)); +} + +template <> +inline void syevjBatched_bufferSize, float>( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(phi::dtype::complex, + float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCheevjBatched_bufferSize( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + lwork, + params, + batchsize)); +} + +template <> +inline void syevjBatched_bufferSize, double>( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(phi::dtype::complex, + double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZheevjBatched_bufferSize( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + lwork, + params, + batchsize)); +} + +#define CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(scalar_t, value_t) \ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, \ + int n, scalar_t *A, int lda, value_t *W, scalar_t *work, int lwork, \ + int *info, syevjInfo_t params, int batchsize + +template +void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(scalar_t, value_t)) { + PADDLE_THROW(phi::errors::InvalidArgument( + "syevjBatched: not implemented for %s", typeid(scalar_t).name())); +} + +template <> +inline void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(float, + float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevjBatched( + handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchsize)); +} + +template <> +inline void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(double, + double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevjBatched( + handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchsize)); +} + +template <> +inline void syevjBatched, float>( + CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(phi::dtype::complex, float)) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCheevjBatched(handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + reinterpret_cast(work), + lwork, + info, + params, + batchsize)); +} + +template <> +inline void syevjBatched, double>( + 
CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(phi::dtype::complex, double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZheevjBatched( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + reinterpret_cast(work), + lwork, + info, + params, + batchsize)); +} +#endif + #ifdef PADDLE_WITH_CUDA static void CheckEighResult(const GPUContext &dev_ctx, const int64_t batch_size, @@ -232,17 +363,33 @@ struct MatrixEighFunctor { DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); T *input_vector = input_trans.data(); - // Once input data type is float32, and the last dimension of - // input is located in range [32, 512], Syevj works better. - bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && - values_stride >= 32 && values_stride <= 512); + // Precision loss will occur in some cases while using + // cusolverDnZheevjBatched to calculate in Paddle(cuda11.7) but it works + // well in Paddle(cuda10.2) + use_cusolver_syevj_batched = (use_cusolver_syevj_batched) && + (batch_size > 1) && + (input.dtype() != phi::DataType::COMPLEX128); + bool use_cusolver_syevj = (input.dtype() == phi::DataType::FLOAT32 && + last_dim >= 32 && last_dim <= 512); auto handle = dev_ctx.cusolver_dn_handle(); syevjInfo_t syevj_params; - if (use_syevj) { + if (use_cusolver_syevj_batched) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + syevjBatched_bufferSize(handle, + jobz, + uplo, + last_dim, + input_vector, + lda, + out_value, + &workspace_size, + syevj_params, + batch_size); + } else if (use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCreateSyevjInfo(&syevj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, @@ -272,7 +419,21 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; auto *value_data = out_value + i * values_stride; - if (use_syevj) { + if (use_cusolver_syevj_batched) { + syevjBatched(handle, + jobz, + uplo, + last_dim, + input_data, + lda, + value_data, + work_ptr, + workspace_size, + &info_ptr[i], + syevj_params, + batch_size); + break; + } else if (use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnSsyevj(handle, jobz, @@ -300,7 +461,7 @@ struct MatrixEighFunctor { } CheckEighResult(dev_ctx, batch_size, info_ptr); - if (use_syevj) { + if (use_cusolver_syevj_batched || use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnDestroySyevjInfo(syevj_params)); } diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index cfad86506c9099..1c6d1debbabd93 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -13,7 +13,6 @@ // limitations under the License. 
#include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/reduce_function.h" @@ -1352,24 +1352,23 @@ void BatchNormDoubleGradKernel( running_mean = mean.get_ptr(); running_variance = variance.get_ptr(); } - paddle::operators::NormDoubleGradFunctor( - ctx, - data_layout, - &x, - &scale, - &y_grad, - &saved_mean, - &saved_variance, - running_mean, - running_variance, - epsilon, - use_global_stats, - x_grad_grad.get_ptr(), - scale_grad_grad.get_ptr(), - bias_grad_grad.get_ptr(), - x_grad, - scale_grad, - y_grad_grad); + phi::funcs::NormDoubleGradFunctor(ctx, + data_layout, + &x, + &scale, + &y_grad, + &saved_mean, + &saved_variance, + running_mean, + running_variance, + epsilon, + use_global_stats, + x_grad_grad.get_ptr(), + scale_grad_grad.get_ptr(), + bias_grad_grad.get_ptr(), + x_grad, + scale_grad, + y_grad_grad); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index d01397c1fa0665..01e4f08c29bdd5 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -21,7 +21,6 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu index c079b61c06e944..5cad80288bf691 100644 --- a/paddle/phi/kernels/gpu/stack_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -18,30 +18,101 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fast_divmod.h" namespace phi { -template -__global__ void StackCUDAKernel(T** input_ptrs, - IntType split_size, - IntType rows, - IntType cols, +template +struct DivmodWarpper { + public: + void SetDivden(IndexT dividen) { divmoder = phi::funcs::FastDivMod(dividen); } + __device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) { + return divmoder.Divmod(val); + } + + private: + phi::funcs::FastDivMod divmoder; +}; + +template <> +struct DivmodWarpper { + public: + using DivModT = phi::AlignedVector; + + void SetDivden(int64_t dividen) { dividen_ = dividen; } + __device__ inline DivModT div_mod(int64_t val) { + DivModT data; + data[0] = val / dividen_; + data[1] = val - data[0] * dividen_; + return data; + } + + private: + int64_t dividen_; +}; + +constexpr int kWarpperSize = 64; +template +struct PointerArray : public DivmodWarpper { + public: + const T* data[kWarpperSize]; + PointerArray(const std::vector& x, + int num, + int64_t dividen) { + this->SetDivden(dividen); + for (auto i = 0; i < num; ++i) { + data[i] = x[i]->data(); + } + } +}; + +template +struct PointerToPointer : public DivmodWarpper { + public: + T** data; + PointerToPointer(const Context& ctx, + const std::vector& x, + int num, + int64_t dividen) { + 
this->SetDivden(dividen); + auto byte_len = num * sizeof(T*); + std::vector x_datas(num); + for (int i = 0; i < num; ++i) { + x_datas[i] = x[i]->data(); + } + auto tmp_x_data = paddle::memory::Alloc( + ctx.GetPlace(), + byte_len, + phi::Stream(reinterpret_cast(ctx.stream()))); + paddle::memory::Copy(ctx.GetPlace(), + tmp_x_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(x_datas.data()), + x_datas.size() * sizeof(T*), + ctx.stream()); + data = reinterpret_cast(tmp_x_data->ptr()); + } +}; + +template +__global__ void StackCUDAKernel(WarpT input_warpper, + IndexT split_size, + IndexT rows, + IndexT cols, T* __restrict__ output) { - IntType grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - IntType grid_x_stride = static_cast(blockDim.x) * gridDim.x; - IntType grid_y_stride = static_cast(blockDim.y) * gridDim.y; + IndexT grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + IndexT grid_x_stride = static_cast(blockDim.x) * gridDim.x; + IndexT grid_y_stride = static_cast(blockDim.y) * gridDim.y; for (; grid_x < cols; grid_x += grid_x_stride) { - IntType grid_y = - static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + IndexT grid_y = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; - IntType split = grid_x / split_size; - const T* input_ptr = input_ptrs[split]; - IntType col_offset = grid_x % split_size; + auto divmod_rslt = input_warpper.div_mod(grid_x); + const T* input_ptr = input_warpper.data[divmod_rslt[0]]; #pragma unroll for (; grid_y < rows; grid_y += grid_y_stride) { output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; + input_ptr[grid_y * split_size + divmod_rslt[1]]; } } } @@ -52,24 +123,8 @@ void StackKernel(const Context& dev_ctx, int axis, DenseTensor* out) { if (axis < 0) axis += (x[0]->dims().size() + 1); - int n = static_cast(x.size()); T* y_data = dev_ctx.template Alloc(out); - std::vector x_datas(n); - for (int i = 0; i < n; i++) { - x_datas[i] = x[i]->data(); - } - - auto tmp_x_data = paddle::memory::Alloc( - dev_ctx.GetPlace(), - x_datas.size() * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - paddle::memory::Copy(dev_ctx.GetPlace(), - tmp_x_data->ptr(), - phi::CPUPlace(), - reinterpret_cast(x_datas.data()), - x_datas.size() * sizeof(T*), - dev_ctx.stream()); // Split x dim from axis to matrix int64_t x_row = 1, x_col = 1; @@ -78,33 +133,40 @@ void StackKernel(const Context& dev_ctx, } x_col = x[0]->numel() / x_row; int64_t out_col = x_col * n; - auto config = phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); - if (out->numel() < std::numeric_limits::max()) { - StackCUDAKernel - <<>>(reinterpret_cast(tmp_x_data->ptr()), - static_cast(x_col), - static_cast(x_row), - static_cast(out_col), - y_data); +#define IMPL_STACK_CUDA_KERNEL(index_t, input_warpper) \ + StackCUDAKernel \ + <<>>(input_warpper, \ + static_cast(x_col), \ + static_cast(x_row), \ + static_cast(out_col), \ + y_data); + + bool use_int32 = out->numel() < std::numeric_limits::max(); + if (n <= kWarpperSize) { + if (use_int32) { + PointerArray ptr_array(x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array); + } else { + PointerArray ptr_array(x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array); + } } else { - StackCUDAKernel - <<>>(reinterpret_cast(tmp_x_data->ptr()), - x_col, - x_row, - out_col, - y_data); + if (use_int32) { + PointerToPointer ptr_array(dev_ctx, x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array); + } else { + PointerToPointer ptr_array(dev_ctx, x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int64_t, 
ptr_array); + } } +#undef IMPL_STACK_CUDA_KERNEL } - } // namespace phi PD_REGISTER_KERNEL(stack, diff --git a/paddle/phi/kernels/onednn/reshape_kernel.cc b/paddle/phi/kernels/onednn/reshape_kernel.cc new file mode 100644 index 00000000000000..4d8adc4b9a6e1d --- /dev/null +++ b/paddle/phi/kernels/onednn/reshape_kernel.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +static DDim ValidateShape(const std::vector& shape, + const DDim& in_dims) { + const int64_t in_size = product(in_dims); + auto in_dims_vec = vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), + in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); + // only one dimension can be set to -1, whose size will be automatically + // infered + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE_EQ( + unk_dim_idx, + -1, + errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + make_ddim(shape), + i)); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), + in_dims.size(), + errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + make_ddim(shape), + i, + in_dims, + in_dims.size())); + } else { + PADDLE_ENFORCE_GT( + shape[i], + 0, + errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negative except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + make_ddim(shape), + i, + shape[i])); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + if (all_positive) { + // in_size < 0 and is un-determinate in compile time, skip the check, + // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], + // capacity = -24, in_size = -8, output_shape[0] = 0 + // the following check will fail. + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ( + output_shape[unk_dim_idx] * capacity, + -in_size, + errors::InvalidArgument( + "The 'shape' attribute in ReshapeOp is invalid. " + "The input tensor X'size must be divisible by known " + "capacity of 'shape'. 
" + "But received X's shape = [%s], X's size = %d, " + "'shape' is [%s], known capacity of 'shape' is %d.", + in_dims, + in_size, + make_ddim(shape), + capacity)); + } else { + output_shape[unk_dim_idx] = -1; + } + } else { + if (all_positive) { + PADDLE_ENFORCE_EQ( + capacity, + in_size, + errors::InvalidArgument( + "The 'shape' in ReshapeOp is invalid. " + "The input tensor X'size must be equal to the capacity of " + "'shape'. " + "But received X's shape = [%s], X's size = %d, 'shape' is " + "[%s], the capacity of 'shape' is %d.", + in_dims, + in_size, + make_ddim(shape), + capacity)); + } + } + return make_ddim(output_shape); +} + +template +void ExecuteReshape(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + const DDim& x_dims, + DenseTensor* out) { + auto out_dims = ValidateShape(shape.GetData(), x_dims); + auto x_vec_dims = vectorize(x_dims); + + funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, + x.dtype(), + funcs::ToOneDNNDataType(x.dtype()), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), funcs::to_void_cast(x.data())); + out->Resize(x_dims); // to match x numel, format is changed later + // reorder is done into a plain tag to allow usage with blocked formats + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, funcs::GetPlainOneDNNFormat(x_dims.size()), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + + astream.wait(); + + out->Resize(out_dims); + out->set_mem_desc( + reorder_dst_memory_p->get_desc().reshape(vectorize(out_dims))); +} + +template +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + auto x_dims = x.dims(); + ExecuteReshape(dev_ctx, x, shape, x_dims, out); +} + +template +void ReshapeWithXShape(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out, + DenseTensor* xshape) { + auto x_dims = slice_ddim(xshape->dims(), 1, xshape->dims().size()); + ExecuteReshape(dev_ctx, x, shape, x_dims, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + reshape, OneDNN, ONEDNN, phi::ReshapeKernel, float, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(reshape_with_xshape, + OneDNN, + ONEDNN, + phi::ReshapeWithXShape, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc index 64f1f9f610861b..dafbb75dc07ac5 100644 --- a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc @@ -13,8 +13,6 @@ // limitations under the License. 
#include "paddle/phi/kernels/transpose_grad_kernel.h" - -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" @@ -24,16 +22,16 @@ void TransposeGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& axis, DenseTensor* x_grad) { - PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == AllocationType::CPU, true, errors::PreconditionNotMet( - "Operator DNNL TransposeGrad must use CPUPlace")); + "oneDNN TransposeGrad kernel must use CPUPlace")); if (!x_grad) return; const auto& onednn_engine = dev_ctx.GetEngine(); if (axis.size() == 1) { - paddle::framework::TensorCopy(out_grad, out_grad.place(), x_grad); + Copy(dev_ctx, out_grad, out_grad.place(), false, x_grad); x_grad->set_mem_desc(out_grad.mem_desc()); return; } diff --git a/paddle/phi/kernels/onednn/transpose_kernel.cc b/paddle/phi/kernels/onednn/transpose_kernel.cc index 26c89197e0d7f4..a36d5e4493a549 100644 --- a/paddle/phi/kernels/onednn/transpose_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" @@ -80,7 +79,7 @@ void TransposeKernel(const Context& dev_ctx, dev_ctx, const_cast(&x), x.mem_desc()); if (axis.size() == 1) { - paddle::framework::TensorCopy(x, x.place(), out); + Copy(dev_ctx, x, x.place(), false, out); out->set_mem_desc(x.mem_desc()); return; } diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc deleted file mode 100644 index adf40bac000e3f..00000000000000 --- a/paddle/phi/ops/compat/log_loss_sig.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature LogLossGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("log_loss_grad", - {"Predicted", "Labels", "Loss@GRAD"}, - {"epsilon"}, - {"Predicted@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc deleted file mode 100644 index 83f0e5f65a0c51..00000000000000 --- a/paddle/phi/ops/compat/put_along_axis_sig.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
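The ValidateShape helper in the new oneDNN reshape kernel above follows the usual reshape semantics: a 0 entry copies the input dimension at the same index, at most one -1 entry is inferred from the remaining capacity, and every other entry must be positive. A minimal Python sketch, assuming every input dimension is known and positive (the kernel additionally handles the compile-time-unknown case and rejects other negative entries):

def infer_reshape_shape(shape, in_dims):
    in_size = 1
    for d in in_dims:
        in_size *= d
    # A 0 entry copies the input dimension at the same index.
    out = [in_dims[i] if s == 0 else s for i, s in enumerate(shape)]
    if out.count(-1) > 1:
        raise ValueError("only one entry of 'shape' may be -1")
    known = 1
    for d in out:
        if d != -1:
            known *= d
    if -1 in out:
        if in_size % known != 0:
            raise ValueError("input size must be divisible by the known capacity")
        out[out.index(-1)] = in_size // known
    elif known != in_size:
        raise ValueError("'shape' capacity must equal the input size")
    return out

print(infer_reshape_shape([0, -1, 4], [2, 3, 8]))  # [2, 6, 4]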
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("put_along_axis", - {"Input", "Index", "Value"}, - {"Axis", "Reduce"}, - {"Result"}); -} - -KernelSignature PutAlongAxisGradArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("put_along_axis_grad", - {"Input", "Index", "Result@GRAD"}, - {"Axis", "Reduce"}, - {"Input@GRAD", "Value@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad, - phi::PutAlongAxisGradArgumentMapping); diff --git a/paddle/phi/ops/compat/svd_sig.cc b/paddle/phi/ops/compat/svd_sig.cc deleted file mode 100644 index 2b97d23f8b85f8..00000000000000 --- a/paddle/phi/ops/compat/svd_sig.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SvdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("svd_grad", - {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}, - {"full_matrices"}, - {"X@GRAD"}); -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(svd_grad, phi::SvdGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc deleted file mode 100644 index a35c1c2db44800..00000000000000 --- a/paddle/phi/ops/compat/take_along_axis_sig.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature TakeAlongAxisArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); -} - -KernelSignature TakeAlongAxisGradArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("take_along_axis_grad", - {"Input", "Index", "Result@GRAD"}, - {"Axis"}, - {"Input@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, - phi::TakeAlongAxisGradArgumentMapping); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 47e64afb16c57e..23895ef14c90f3 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -769,8 +769,8 @@ function run_linux_cpu_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install hypothesis - if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then - pip install ${PADDLE_ROOT}/build/python/dist/*whl + if [ -d "${PADDLE_ROOT}/dist/" ]; then + pip install ${PADDLE_ROOT}/dist/*whl fi cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python @@ -3463,6 +3463,12 @@ function check_coverage_build() { } function run_setup(){ rm -rf ${PADDLE_ROOT}/build + # Build script will not fail if *.deb does not exist + rm *.deb 2>/dev/null || true + # Delete previous built egg packages + rm -rf ${PADDLE_ROOT}/dist 2>/dev/null || true + # Delete previous built paddle cache + rm -rf ${PADDLE_ROOT}/build/python/paddle 2>/dev/null || true startTime_s=`date +%s` SYSTEM=`uname -s` @@ -3477,7 +3483,6 @@ function run_setup(){ export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib - pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -3525,15 +3530,7 @@ function run_setup(){ else if [ "$1" != "" ]; then echo "using python abi: $1" - if [ "$1" == "cp36-cp36m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} - export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} - #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export - export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.6.0/bin/python3 - export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.6.0/include/python3.6m - export PYTHON_LIBRARIES=/opt/_internal/cpython-3.6.0/lib/libpython3.so - pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp37-cp37m" ]; then + if [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export @@ -3651,7 +3648,7 @@ function run_setup(){ # reset ccache zero stats for collect PR's actual hit rate ccache -z - python setup.py install;build_error=$? + python setup.py $2;build_error=$? 
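+    # The second argument selects the setup.py command; the run_setup call
+    # sites in main() below pass "bdist_wheel" (cicheck_py37) or "install"
+    # (build_gpubox).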
# ci will collect ccache hit rate collect_ccache_hits @@ -3871,7 +3868,7 @@ function main() { build_mac ;; cicheck_py37) - cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + run_setup ${PYTHON_ABI:-""} bdist_wheel run_linux_cpu_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; test_cicheck_py37) @@ -3884,7 +3881,7 @@ function main() { parallel_test ;; build_gpubox) - run_setup ${PYTHON_ABI:-""} + run_setup ${PYTHON_ABI:-""} install ;; check_xpu) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index c12381c894e794..d845f3b78c6345 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -220,7 +220,8 @@ def unscale_method(self, optimizer): temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) - device = "cpu" if optimizer.offload else "gpu" + device = paddle.get_device().split(":")[0] + device = "cpu" if optimizer.offload else device dev_id = ( 0 if device == "cpu" else int(paddle.get_device().split(":")[1]) ) @@ -245,8 +246,9 @@ def unscale_method(self, optimizer): is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") paddle.distributed.all_reduce( - is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None + is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None ) + self._found_inf = is_found_inf.numpy()[0] scaler._unscale = MethodType(unscale_method, scaler) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 721d10d76b1286..1f644147a209bd 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -252,14 +252,26 @@ def check_models(models): ) +def _is_valid_optimizer(optimizer): + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, + ) + + return isinstance( + optimizer, + ( + paddle.optimizer.Optimizer, + paddle.fluid.optimizer.Optimizer, + DygraphShardingOptimizer, + ), + ) + + def check_optimizers(optimizers): for optimizer in optimizers: - if not isinstance( - optimizer, - (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), - ): + if not _is_valid_optimizer(optimizer): raise RuntimeError( - "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".format( + "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format( type(optimizer) ) ) @@ -477,6 +489,20 @@ def __call__(self, state_dict): state_dict[key] = param_applied +def _set_multi_precision(optimizer, multi_precision): + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, + ) + + optimizer = ( + optimizer._inner_optimizer + if isinstance(optimizer, DygraphShardingOptimizer) + else optimizer + ) + if hasattr(optimizer, "_multi_precision"): + optimizer._multi_precision = multi_precision + + @dygraph_only def amp_decorate( models, @@ -582,10 +608,7 @@ def amp_decorate( if optimizers is not None: # check optimizers optimizers_is_list = False - if isinstance( - optimizers, - 
(paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), - ): + if _is_valid_optimizer(optimizers): optimizers_is_list = False optimizers = [optimizers] check_optimizers(optimizers) @@ -596,13 +619,10 @@ def amp_decorate( raise TypeError( "optimizers must be either a single optimizer or a list of optimizers." ) - # supprot master_weight - for idx_opt in range(len(optimizers)): - if hasattr(optimizers[idx_opt], '_multi_precision'): - if master_weight is False: - optimizers[idx_opt]._multi_precision = False - else: - optimizers[idx_opt]._multi_precision = True + # support master_weight + use_multi_precision = not (master_weight is False) + for opt in optimizers: + _set_multi_precision(opt, use_multi_precision) if save_dtype is not None: if not (save_dtype in ['float16', 'bfloat16', 'float32', 'float64']): diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 9515a8bd1704e5..d3e91295d43a29 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -83,9 +83,10 @@ def save_dygraph(state_dict, model_path): .. code-block:: python import paddle.fluid as fluid + import paddle with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 77d4f2c2573f94..cf794ad4cef899 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -170,10 +170,11 @@ class PiecewiseDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( [10, 10] ) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0), parameter_list = emb.parameters() ) @@ -240,9 +241,10 @@ class NaturalExpDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle base_lr = 0.1 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.NaturalExpDecay( learning_rate=base_lr, @@ -403,9 +405,10 @@ class InverseTimeDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle base_lr = 0.1 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.InverseTimeDecay( learning_rate=base_lr, @@ -487,11 +490,12 @@ class PolynomialDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle start_lr = 0.01 total_step = 5000 end_lr = 0 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( [10, 10]) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.PolynomialDecay( start_lr, total_step, end_lr, power=1.0), @@ -639,10 +643,11 @@ class NoamDecay(LearningRateDecay): .. 
code-block:: python import paddle.fluid as fluid + import paddle warmup_steps = 100 learning_rate = 0.01 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.NoamDecay( 1/(warmup_steps *(learning_rate ** 2)), diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index f0b761fff82905..77436e9293d644 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -51,7 +51,6 @@ __all__ = [ 'BatchNorm', - 'Embedding', ] @@ -360,187 +359,6 @@ def forward(self, input): return self._helper.append_activation(batch_norm_out, self._act) -class Embedding(layers.Layer): - r""" - :alias_main: paddle.nn.Embedding - :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding - :old_api: paddle.fluid.dygraph.Embedding - - **Embedding Layer** - - This interface is used to construct a callable object of the ``Embedding`` class. - For specific usage, refer to code examples. It implements the function of the Embedding Layer. - This layer is used to lookup embeddings vector of ids provided by :attr:`input` . - It automatically constructs a 2D embedding matrix based on the - input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . - - The shape of output Tensor is generated by appending an emb_size dimension to the - last dimension of the input Tensor shape. - - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , - otherwise the program will throw an exception and exit. - - .. code-block:: text - - Case 1: - - input is a Tensor. padding_idx = -1 - input.data = [[1, 3], [2, 4], [4, 127] - input.shape = [3, 2] - Given size = [128, 16] - output is a Tensor: - out.shape = [3, 2, 16] - out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], - [0.345421456, 0.524563927, ..., 0.144534654]], - - [[0.345249859, 0.124939536, ..., 0.194353745], - [0.945345345, 0.435394634, ..., 0.435345365]], - - [[0.945345345, 0.435394634, ..., 0.435345365], - [0.0, 0.0, ..., 0.0 ]]] # padding data - The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 - It will pad all-zero data when ids is 127. - - Parameters: - size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size - of the dictionary of embeddings and the size of each embedding vector respectively. - is_sparse(bool): The flag indicating whether to use sparse update. This parameter only - affects the performance of the backwards gradient update. It is recommended to set - True because sparse update is faster. But some optimizer does not support sparse update, - such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , - :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , - :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: False. - is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used - in multi-machine distributed CPU training. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). - If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . 
It will output all-zero padding data whenever lookup - encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. - If set None, it makes no effect to output. Default: None. - param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, - user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. - The local word vector needs to be transformed into numpy format, and the shape of local word - vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` - is used to load custom or pre-trained word vectors. See code example 2 for details. - dtype(np.dtype|core.VarDesc.VarType|str): It refers to the data type of output Tensor. - It must be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - Returns: - Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.dygraph.base as base - import numpy as np - - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - static_rlt3 = emb(base.to_variable(inp_word)) - static_rlt3.shape # [2, 3, 32] - - # example 2: load custom or pre-trained word vectors - weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format - w_param_attrs = fluid.ParamAttr( - name="emb_weight", - learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), - trainable=True) - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( - size=[128, 100], - param_attr= w_param_attrs, - is_sparse=False) - static_rlt3 = emb(base.to_variable(inp_word)) - """ - - def __init__( - self, - size, - is_sparse=False, - is_distributed=False, - padding_idx=None, - param_attr=None, - dtype='float32', - ): - super().__init__() - self._size = size - self._is_sparse = is_sparse - self._is_distributed = is_distributed - self._padding_idx = ( - -1 - if padding_idx is None - else padding_idx - if padding_idx >= 0 - else (size[0] + padding_idx) - ) - - self._param_attr = param_attr - self._dtype = dtype - self._remote_prefetch = self._is_sparse and (not self._is_distributed) - if self._remote_prefetch: - assert self._is_sparse is True and self._is_distributed is False - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False, - ) - - def forward(self, input): - if _non_static_mode(): - return _legacy_C_ops.lookup_table_v2( - self.weight, - input, - 'is_sparse', - self._is_sparse, - 'is_distributed', - self._is_distributed, - 'remote_prefetch', - self._remote_prefetch, - 'padding_idx', - self._padding_idx, - ) - - check_variable_and_dtype( - input, - 'input', - ['uint8', 'int8', 'int16', 'int32', 'int64'], - 'Embedding', - ) - attrs = { - 'is_sparse': self._is_sparse, - 'is_distributed': self._is_distributed, - 'remote_prefetch': self._remote_prefetch, - 'padding_idx': self._padding_idx, - } - - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type='lookup_table_v2', - inputs={'Ids': input, 
'W': self.weight}, - outputs={'Out': out}, - attrs=attrs, - ) - - return out - - class RowConv(layers.Layer): """ ***Row-convolution operator*** diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index cb030f71a45bc5..3432baf442e2ad 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -723,10 +723,6 @@ def init_reducer(self): def check_layer_sparse(sublayer): if isinstance(sublayer, paddle.nn.layer.common.Embedding): return sublayer._sparse - # NOTE(shenliang03):This is for compatibility. If paddle.fluid.dygraph.Embedding - # is removed in the future, the check will also be removed here. - if isinstance(sublayer, paddle.fluid.dygraph.Embedding): - return sublayer._is_sparse return False is_sparse_gradient = [ @@ -875,8 +871,8 @@ def state_dict( dist.init_parallel_env() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb) + emb = paddle.nn.Embedding(10, 10) + emb = paddle.fluid.dygraph.DataParallel(emb) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") @@ -910,7 +906,7 @@ def set_state_dict(self, state_dict, use_structured_name=True): dist.init_parallel_env() emb = paddle.nn.Embedding(10, 10) - emb = fluid.dygraph.DataParallel(emb) + emb = paddle.fluid.dygraph.DataParallel(emb) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index da4f609c401ac3..7393c6104f38ed 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1660,10 +1660,11 @@ def gradient(self): # example2: return tuple of ndarray with fluid.dygraph.guard(): - embedding = fluid.dygraph.Embedding( - size=[20, 32], - param_attr='emb.w', - is_sparse=True) + embedding = paddle.nn.Embedding( + 20, + 32, + weight_attr='emb.w', + sparse=True) x_data = np.arange(12).reshape(4, 3).astype('int64') x_data = x_data.reshape((-1, 3, 1)) x = fluid.dygraph.base.to_variable(x_data) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 71167f30e026a6..42c57193941f30 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -214,9 +214,10 @@ def state_dict(self): .. 
code-block:: python import paddle.fluid as fluid + import paddle with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = fluid.optimizer.Adam(0.001, parameter_list=emb.parameters()) state_dict = adam.state_dict() @@ -582,7 +583,7 @@ def current_step_lr(self): # example1: LearningRateDecay is not used, return value is all the same with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = fluid.optimizer.Adam(0.001, parameter_list = emb.parameters()) lr = adam.current_step_lr() print(lr) # 0.001 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index f63cfc089ed8a5..ef85aab80f6c95 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -39,7 +39,6 @@ from paddle.distributed.sharding.group_sharded import group_sharded_parallel from paddle.distributed.utils.log_utils import get_logger from paddle.fluid.dataloader.dataset import IterableDataset -from paddle.fluid.dygraph.nn import Embedding from paddle.incubate.distributed.utils.io import save_for_auto_inference from paddle.nn import Linear @@ -131,7 +130,7 @@ def __init__( bias_attr=None, ): super(MLP, self).__init__() - self.embedding = Embedding((embedding_size, linear_size)) + self.embedding = paddle.nn.Embedding(embedding_size, linear_size) self._linear1 = Linear(linear_size, linear_size) self._linear2 = Linear(linear_size, linear_size) self._linear3 = Linear(linear_size, 10) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py index 48ec09552d7f98..deaf9779d44f60 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.dygraph.nn import Embedding paddle.seed(123) np.random.seed(2021) @@ -29,10 +28,10 @@ def __init__(self, hidden_size, vocab_size, is_sparse=False): super().__init__() self.hidden_size = hidden_size self.vocab_size = vocab_size - self.embedding = Embedding( - size=[self.vocab_size, self.hidden_size], - dtype='float32', - is_sparse=is_sparse, + self.embedding = paddle.nn.Embedding( + self.vocab_size, + self.hidden_size, + sparse=is_sparse, ) self.lin_a = paddle.nn.Linear(self.hidden_size, self.vocab_size) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 8d49434ac54e8b..a8ddeb0bfdbede 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.dygraph import Embedding, Layer, to_variable +from paddle.fluid.dygraph import Layer, to_variable from paddle.optimizer.lr import NoamDecay """ @@ -513,11 +513,11 @@ def __init__( self._src_emb_dim = src_emb_dim 
self._src_vocab_size = src_vocab_size self._dropout_rate = dropout_rate - self._input_emb = Embedding( - size=[src_vocab_size, src_emb_dim], - is_sparse=is_sparse, - padding_idx=0, - param_attr=fluid.ParamAttr( + self._input_emb = paddle.nn.Embedding( + src_vocab_size, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), ), @@ -527,10 +527,11 @@ def __init__( pos_inp = pos_inp1 else: pos_inp = pos_inp2 - self._pos_emb = Embedding( - size=[self._src_max_len, src_emb_dim], - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._src_max_len, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=pos_enc_param_name, initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index d30466d9fc957b..2af1f4adec9bb8 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -344,7 +344,7 @@ def nan_inf(self): scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) - self.assertEqual(scaler._found_inf.numpy() == 1, True) + self.assertEqual(scaler._found_inf.numpy() >= 1, True) for param in model.parameters(): # param not update when tensor contains nan or inf diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index f688d28b856031..b5d36dfebaad4a 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -343,7 +343,7 @@ def nan_inf(self): scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) - self.assertEqual(scaler._found_inf.numpy() == 1, True) + self.assertEqual(scaler._found_inf.numpy() >= 1, True) for param in model.parameters(): # param not update when tensor contains nan or inf diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 43f7f0f6d2b5e1..a6e4f09564dfa0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -16,7 +16,7 @@ import paddle import paddle.fluid as fluid -from paddle.fluid.dygraph import Embedding, Layer +from paddle.fluid.dygraph import Layer from paddle.jit.api import declarative from paddle.nn import Linear @@ -208,29 +208,29 @@ def __init__(self, config, return_pooled_out=True, use_fp16=False): self._param_initializer = fluid.initializer.TruncatedNormal( scale=config['initializer_range'] ) - - self._src_emb = Embedding( - size=[self._voc_size, self._emb_size], - param_attr=fluid.ParamAttr( + paddle.set_default_dtype(self._dtype) + self._src_emb = paddle.nn.Embedding( + self._voc_size, + self._emb_size, + weight_attr=fluid.ParamAttr( 
name=self._word_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) - self._pos_emb = Embedding( - size=[self._max_position_seq_len, self._emb_size], - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._max_position_seq_len, + self._emb_size, + weight_attr=fluid.ParamAttr( name=self._pos_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) - self._sent_emb = Embedding( - size=[self._sent_types, self._emb_size], - param_attr=fluid.ParamAttr( + self._sent_emb = paddle.nn.Embedding( + self._sent_types, + self._emb_size, + weight_attr=fluid.ParamAttr( name=self._sent_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) self.pooled_fc = Linear( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index bbca449bde67a3..eceba1198fa474 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -21,8 +21,8 @@ from paddle.fluid import ParamAttr, layers from paddle.fluid.dygraph import Layer from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.jit.api import declarative +from paddle.nn import Embedding INF = 1.0 * 1e5 alpha = 0.6 @@ -122,16 +122,18 @@ def __init__( forget_bias = 1.0 self.src_embeder = Embedding( - size=[self.src_vocab_size, self.hidden_size], - param_attr=fluid.ParamAttr( + self.src_vocab_size, + self.hidden_size, + weight_attr=fluid.ParamAttr( initializer=uniform_initializer(init_scale) ), ) self.tar_embeder = Embedding( - size=[self.tar_vocab_size, self.hidden_size], - is_sparse=False, - param_attr=fluid.ParamAttr( + self.tar_vocab_size, + self.hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( initializer=uniform_initializer(init_scale) ), ) @@ -545,17 +547,19 @@ def __init__( forget_bias = 1.0 self.src_embeder = Embedding( - size=[self.src_vocab_size, self.hidden_size], - param_attr=fluid.ParamAttr( + self.src_vocab_size, + self.hidden_size, + weight_attr=fluid.ParamAttr( name='source_embedding', initializer=uniform_initializer(init_scale), ), ) self.tar_embeder = Embedding( - size=[self.tar_vocab_size, self.hidden_size], - is_sparse=False, - param_attr=fluid.ParamAttr( + self.tar_vocab_size, + self.hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='target_embedding', initializer=uniform_initializer(init_scale), ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index d16f07d9a2e343..b6baf7ddf2913c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -17,7 +17,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.param_attr as attr -from paddle.fluid.dygraph import Embedding, Layer +from paddle.fluid.dygraph import Layer from paddle.jit.api import declarative from paddle.static import Variable @@ -42,11 +42,12 @@ def ops(self): """ # TODO(huihuangzheng): The original code set the is_sparse=True, but it # causes crush in dy2stat. Set it to True after fixing it. 
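The replacement just below, like the other model and test updates in this patch, migrates from the removed fluid.dygraph.Embedding to paddle.nn.Embedding: the size=[vocab, dim] list becomes two positional arguments, is_sparse becomes sparse, param_attr becomes weight_attr, and the dtype argument is dropped (the layer follows the default dtype). A before/after sketch with placeholder names (vocab_size, emb_dim, pad_id, w_attr):

# old API, removed from paddle.fluid.dygraph.nn in this patch
emb = fluid.dygraph.Embedding(size=[vocab_size, emb_dim],
                              is_sparse=True,
                              padding_idx=pad_id,
                              param_attr=w_attr,
                              dtype='float32')
# new API, as used by the updated files
emb = paddle.nn.Embedding(vocab_size, emb_dim,
                          padding_idx=pad_id,
                          sparse=True,
                          weight_attr=w_attr)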
- emb = Embedding( - size=[self.dict_size, self.emb_dim], - is_sparse=True, + emb = paddle.nn.Embedding( + self.dict_size, + self.emb_dim, + sparse=True, padding_idx=self.padding_idx, - param_attr=attr.ParamAttr( + weight_attr=attr.ParamAttr( name=self.name, initializer=fluid.initializer.Xavier() ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 06f460912b45be..99fe330c692410 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -38,11 +38,12 @@ def ops(self): """ # TODO(huihuangzheng): The original code set the is_sparse=True, but it # causes crush in dy2stat. Set it to True after fixing it. - emb = paddle.fluid.dygraph.Embedding( - size=[self.dict_size, self.emb_dim], - is_sparse=True, + emb = paddle.nn.Embedding( + self.dict_size, + self.emb_dim, + sparse=True, padding_idx=self.padding_idx, - param_attr=paddle.ParamAttr( + weight_attr=paddle.ParamAttr( name=self.name, initializer=paddle.nn.initializer.XavierUniform(), ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 7a5fbbc0842434..1ec320317d4c54 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -25,7 +25,7 @@ import paddle import paddle.fluid as fluid from paddle import _legacy_C_ops -from paddle.fluid.dygraph import Embedding, to_variable +from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.framework import _non_static_mode from paddle.jit import ProgramTranslator @@ -371,10 +371,10 @@ def __init__(self, args, length=None): self.bigru_num = args.bigru_num self.init_bound = 0.1 - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( + self.word_embedding = paddle.nn.Embedding( + self.vocab_size, + self.word_emb_dim, + weight_attr=fluid.ParamAttr( learning_rate=self.emb_lr, name="word_emb", initializer=fluid.initializer.Uniform( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 46970eaa27bb6c..49e7c32d6e3186 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -21,7 +21,6 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.optimizer import SGDOptimizer from paddle.jit import ProgramTranslator from paddle.jit.api import declarative @@ -156,11 +155,11 @@ def __init__( init_scale=init_scale, dropout=dropout, ) - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py index f589c37c2fbfd1..60712aeda7aac9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py @@ -150,11 +150,11 @@ def __init__( init_scale=init_scale, dropout=dropout, ) - self.embedding = paddle.fluid.dygraph.nn.Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=paddle.ParamAttr( + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=False, + weight_attr=paddle.ParamAttr( name='embedding_para', initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 12e5099f257fe7..d9eb993f720707 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -20,10 +20,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn import Linear +from paddle.nn import Embedding, Linear SEED = 2020 program_translator = ProgramTranslator() @@ -73,9 +72,9 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + sparse=False, ) self._simple_conv_pool_1 = SimpleConvPool( self.channels, @@ -124,9 +123,9 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + sparse=False, ) self._fc1 = Linear(self.hid_dim, self.hid_dim) self._fc2 = Linear(self.hid_dim, self.fc_hid_dim) @@ -167,10 +166,10 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + weight_attr=fluid.ParamAttr(learning_rate=30), + sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") h_0 = to_variable(h_0) @@ -213,10 +212,10 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + weight_attr=fluid.ParamAttr(learning_rate=30), + sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") h_0 = to_variable(h_0) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py index 13f1a9b882fed3..e546e26a2304f3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py @@ -20,9 +20,9 @@ import paddle import paddle.fluid as fluid -from 
paddle.fluid.dygraph.nn import Embedding from paddle.jit import ProgramTranslator from paddle.jit.api import declarative +from paddle.nn import Embedding def fake_text(): @@ -227,9 +227,9 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): self.embedding_size = embedding_size self.embedding = Embedding( - size=[self.vocab_size, self.embedding_size], - dtype='float32', - param_attr=fluid.ParamAttr( + self.vocab_size, + self.embedding_size, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-0.5 / self.embedding_size, @@ -239,9 +239,9 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): ) self.embedding_out = Embedding( - size=[self.vocab_size, self.embedding_size], - dtype='float32', - param_attr=fluid.ParamAttr( + self.vocab_size, + self.embedding_size, + weight_attr=fluid.ParamAttr( name='embedding_out_para', initializer=fluid.initializer.UniformInitializer( low=-0.5 / self.embedding_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 88cc415b4bbab3..f8641dd2ac4f33 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.nn.functional as F -from paddle.fluid.dygraph import Embedding, Layer, to_variable +from paddle.fluid.dygraph import Layer, to_variable from paddle.fluid.layers.utils import map_structure from paddle.jit.api import dygraph_to_static_func from paddle.nn import Linear @@ -276,10 +276,10 @@ def forward(self, enc_input, attn_bias): class Embedder(Layer): def __init__(self, vocab_size, emb_dim, bos_idx=0): super().__init__() - self.word_embedder = Embedding( - size=[vocab_size, emb_dim], - padding_idx=bos_idx, - param_attr=fluid.ParamAttr( + self.word_embedder = paddle.nn.Embedding( + vocab_size, + emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.Normal(0.0, emb_dim**-0.5) ), ) @@ -311,9 +311,10 @@ def __init__( self.emb_dropout = prepostprocess_dropout self.emb_dim = d_model self.word_embedder = word_embedder - self.pos_encoder = Embedding( - size=[max_length, self.emb_dim], - param_attr=fluid.ParamAttr( + self.pos_encoder = paddle.nn.Embedding( + max_length, + self.emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( position_encoding_init(max_length, self.emb_dim) ), @@ -499,9 +500,10 @@ def __init__( self.emb_dropout = prepostprocess_dropout self.emb_dim = d_model self.word_embedder = word_embedder - self.pos_encoder = Embedding( - size=[max_length, self.emb_dim], - param_attr=fluid.ParamAttr( + self.pos_encoder = paddle.nn.Embedding( + max_length, + self.emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( position_encoding_init(max_length, self.emb_dim) ), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py new file mode 100644 index 00000000000000..f83cffa3400a7d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from typing import Any, Dict, List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtInt64Test1(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + out_shape = list(inputs['input_data'].shape) + for x in range(len(attrs[0]["axes"])): + start = 0 + end = 0 + if attrs[0]["starts"][x] < 0: + start = ( + attrs[0]["starts"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) + else: + start = attrs[0]["starts"][x] + if attrs[0]["ends"][x] < 0: + end = ( + attrs[0]["ends"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) + else: + end = attrs[0]["ends"][x] + start = max(0, start) + end = max(0, end) + out_shape[attrs[0]["axes"][x]] = end - start + if start >= end: + return False + for x in attrs[0]["decrease_axis"]: + if x < 0: + return False + if out_shape[x] != 1: + return False + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return (10 * np.random.random([6, 6, 64, 64])).astype(np.int64) + + for axes in [[0, 1], [1, 3], [2, 3]]: + for starts in [[0, 1]]: + for ends in [[2, 2], [5, 5], [1, -1]]: + for decrease_axis in [[], [1], [2], [-1], [-100]]: + for infer_flags in [[-1]]: + dics = [ + { + "axes": axes, + "starts": starts, + "ends": ends, + "decrease_axis": decrease_axis, + "infer_flags": infer_flags, + } + ] + + ops_config = [ + { + "op_type": "slice", + "op_inputs": {"Input": ["input_data"]}, + "op_outputs": { + "Out": ["slice_output_data"] + }, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["slice_output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [8, 8, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [6, 6, 64, 64]} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-3 + + def 
test(self): + self.run_test() + + +class TrtInt64Test2(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(shape, op_type): + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int64 + ) + + for shape in [[2, 32, 16], [1, 8, 16, 32]]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + ]: + for axis in [0, -1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4], + "input_data2": [1, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [128, 128, 256], + "input_data2": [128, 128, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 32, 16], + "input_data2": [2, 32, 16], + } + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4, 4], + "input_data2": [1, 4, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [8, 128, 64, 128], + "input_data2": [8, 128, 64, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 64, 32, 32], + "input_data2": [2, 64, 32, 32], + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index e1103c1d595c0c..11aacd02439e99 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding +from paddle.nn import Embedding class SimpleNet(fluid.Layer): @@ -37,10 +37,10 @@ def __init__( self.init_scale = init_scale self.num_steps = num_steps self.embedding = Embedding( - size=[self.vocab_size, self.hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self.vocab_size, + self.hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale ) diff --git 
a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt index ee215ebf27a391..a9832154200435 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt @@ -5,49 +5,13 @@ file( string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") foreach(target ${TEST_INTERP_CASES}) - py_test_modules( - ${target} - MODULES - ${target} - ENVS - FLAGS_host_trace_level=10 - FLAGS_static_executor_perfstat_filepath=./perfstat - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=false - FLAGS_eager_delete_tensor_gb=0) - - py_test_modules( - ${target}_non_eager_deletion - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=false - FLAGS_eager_delete_tensor_gb=0.000001) - - py_test_modules( - ${target}_fast_gc - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=true - FLAGS_eager_delete_tensor_gb=0) - - py_test_modules( - ${target}_fast_gc_non_eager_deletion - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=true - FLAGS_eager_delete_tensor_gb=0.000001) + py_test_modules(${target} MODULES ${target}) endforeach() +py_test_modules( + test_standalone_executor_no_fast_gc MODULES test_standalone_executor ENVS + FLAGS_fast_eager_deletion_mode=false) + py_test_modules( test_standalone_executor_sequential_run MODULES test_standalone_executor ENVS FLAGS_new_executor_sequential_run=true) @@ -56,5 +20,8 @@ py_test_modules( test_standalone_executor_serial_run MODULES test_standalone_executor ENVS FLAGS_new_executor_serial_run=true) -py_test_modules(test_convert_graph_to_program MODULES test_standalone_executor - ENVS FLAGS_CONVERT_GRAPH_TO_PROGRAM=true) +py_test_modules( + test_standalone_executor_stats MODULES test_standalone_executor ENVS + FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat) + +set_tests_properties(test_standalone_cross_step_overlap PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py new file mode 100644 index 00000000000000..a4fe9f9d258499 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import static
+
+paddle.enable_static()
+
+
+class TestCrossStepOverlap(unittest.TestCase):
+    def setUp(self):
+        self.shape = [16, 513, 513, 19]
+        self.x_value = 2
+        self.y_value = 3
+        self.overlap_op_num = 1500
+        self.step_num = 3
+
+    def test_cross_step_overlap(self):
+        if not paddle.fluid.core.is_compiled_with_cuda():
+            return
+
+        # In this test case, z=x+y is calculated in the default stream,
+        # and at the same time, numerous reduce_min ops that output to y
+        # are executed in another stream (i.e., the custom stream).
+        # These reduce_min ops are carefully designed so that their kernel
+        # calculation overlaps with the fill_constant kernels (which output
+        # to x and y) in the next step, and therefore cross-step multi-stream
+        # synchronization is required. An Event should be recorded after the
+        # last reduce_min in the first step and waited on before the
+        # fill_constant in the second step. Otherwise, the result of z will
+        # be wrong.
+        program = static.Program()
+        with static.program_guard(program):
+            x = paddle.full(
+                self.shape, fill_value=self.x_value, dtype='float64'
+            )
+            y = paddle.full(
+                self.shape, fill_value=self.y_value, dtype='float64'
+            )
+            z = paddle.add(x, y)
+
+            block = program.global_block()
+            block.var(x.name).desc.set_persistable(True)
+            block.var(y.name).desc.set_persistable(True)
+            for i in range(self.overlap_op_num):
+                block.append_op(
+                    type='reduce_min',
+                    inputs={'X': x.name},
+                    outputs={'Out': y.name},
+                    attrs={'axis': 0, 'keepdim': True},
+                )
+                block.ops[-1].dist_attr.execution_stream = "custom"
+
+        exe = static.Executor()
+        results = []
+        for i in range(self.step_num):
+            result = exe.run(program, fetch_list=[z])
+            results.append(result)
+
+        for result in results:
+            self.assertAlmostEqual(
+                np.sum(result),
+                (self.x_value + self.y_value) * np.prod(self.shape),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 5a301d3f0a5adc..e4dca329fe7c23 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -19,6 +19,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import _test_eager_guard
+from paddle.nn import Embedding
 from paddle.tensor import random
 
 
@@ -122,8 +123,8 @@ def forward(self, x, label, test_num):
 class MyLayer(fluid.Layer):
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super().__init__(dtype=dtype)
-        self.embed0 = fluid.Embedding(size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(size=(vocab_size, size))
+        self.embed0 = Embedding(vocab_size, size)
+        self.embed1 = Embedding(vocab_size, size)
         self.linear_0 = paddle.nn.Linear(input_size, size)
         self.linear_1 = paddle.nn.Linear(input_size, size)
 
@@ -144,8 +145,8 @@ def embed_linear0(self, x):
 class MyLayer2(fluid.Layer):
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super().__init__(dtype=dtype)
-        self.embed0 = fluid.Embedding(size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(size=(vocab_size, size))
+        self.embed0 = Embedding(vocab_size, size)
+        self.embed1 = Embedding(vocab_size, size)
         self.linear_0 = paddle.nn.Linear(input_size, size)
         self.linear_1 = paddle.nn.Linear(input_size, size)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py 
b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index f864e2829046b7..2003e685327b8f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import BatchNorm, Embedding +from paddle.fluid.dygraph.nn import BatchNorm from paddle.nn import Linear @@ -206,8 +206,8 @@ def __init__(self): self.batch_norm_1 = BatchNorm(10) self.batch_norm_2 = BatchNorm(10) - self.emb1 = Embedding([1000, 100]) - self.emb2 = Embedding([2000, 200]) + self.emb1 = paddle.nn.Embedding(1000, 100) + self.emb2 = paddle.nn.Embedding(2000, 200) self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index fe706a78f8fe69..0027cbfa2a9bff 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -22,7 +22,6 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer @@ -42,11 +41,12 @@ def __init__( self.vocab_size = vocab_size self.init_scale = init_scale self.num_steps = num_steps - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + paddle.set_default_dtype(dtype) + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py index faaa02ea46a5d0..0984104269c420 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py @@ -101,7 +101,7 @@ def __init__(self): self.linear1 = paddle.nn.Linear(10, 10) self.linear2 = paddle.nn.Linear(5, 5) self.conv2d = paddle.nn.Conv2D(3, 2, 3) - self.embedding = fluid.dygraph.Embedding(size=[128, 16]) + self.embedding = paddle.nn.Embedding(128, 16) self.h_0 = fluid.dygraph.to_variable( np.zeros([10, 10]).astype('float32') ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 1df0a4148c9c63..12118beaffe3b9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Embedding +from paddle.fluid.dygraph.nn import BatchNorm from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear @@ -371,8 +371,8 @@ def __init__(self): Config.decoder_size, bias_attr=False, ) - self.embedding = Embedding( - 
[Config.num_classes + 2, Config.word_vector_dim], dtype='float32' + self.embedding = paddle.nn.Embedding( + Config.num_classes + 2, Config.word_vector_dim ) self.gru_decoder_with_attention = GRUDecoderWithAttention( Config.decoder_size, Config.num_classes diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 2a59dd396f000b..6bbf0a70c2e347 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -23,9 +23,9 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer +from paddle.nn import Embedding class SimpleLSTMRNN(fluid.Layer): @@ -172,10 +172,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 2e30ea41a18cd2..4e30f591686dc1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -22,9 +22,9 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import Adam +from paddle.nn import Embedding class SimpleLSTMRNN(fluid.Layer): @@ -167,10 +167,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale @@ -991,7 +991,7 @@ def func_testSetNumpyBeforeTrain(self): def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) @@ -1011,7 +1011,7 @@ def func_testOnlyLoadParams(self): def func_test_load_compatible_with_keep_name_table(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 4a3c6c64a6f6e5..a567a443e44859 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -23,8 +23,8 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from 
paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard +from paddle.nn import Embedding from paddle.optimizer import Adam @@ -168,10 +168,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale @@ -1015,7 +1015,7 @@ def func_testSetNumpyBeforeTrain(self): def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, @@ -1028,7 +1028,7 @@ def func_testOnlyLoadParams(self): def func_test_no_state_in_input_dict(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, @@ -1044,7 +1044,7 @@ def func_test_no_state_in_input_dict(self): def func_test_state_shape_mismatch(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 72c77e753f54b6..498317b2a33f9f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -27,11 +27,11 @@ class SimpleNet(paddle.nn.Layer): def __init__(self, vocab_size, hidden_size, dtype): super().__init__() - self.emb = fluid.dygraph.Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - param_attr='emb.w', - is_sparse=True, + self.emb = paddle.nn.Embedding( + vocab_size, + hidden_size, + weight_attr='emb.w', + sparse=True, ) def forward(self, input): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index dd490e8d5553bb..220bde8e5b235f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -22,9 +22,9 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer +from paddle.nn import Embedding class SimpleNet(fluid.Layer): @@ -42,11 +42,12 @@ def __init__( self.vocab_size = vocab_size self.init_scale = init_scale self.num_steps = num_steps + paddle.set_default_dtype(dtype) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 3cc07ee6a3378b..f73e94363844cd 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -20,7 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid import Embedding, Layer, core +from paddle.fluid import Layer, core from paddle.fluid.dygraph import guard, to_variable from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard from paddle.nn import Linear @@ -664,11 +664,11 @@ def __init__( self._src_emb_dim = src_emb_dim self._src_vocab_size = src_vocab_size self._dropout_rate = dropout_rate - self._input_emb = Embedding( - size=[src_vocab_size, src_emb_dim], - is_sparse=is_sparse, - padding_idx=0, - param_attr=fluid.ParamAttr( + self._input_emb = paddle.nn.Embedding( + src_vocab_size, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), ), @@ -678,10 +678,11 @@ def __init__( pos_inp = pos_inp1 else: pos_inp = pos_inp2 - self._pos_emb = Embedding( - size=[self._src_max_len, src_emb_dim], - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._src_max_len, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=pos_enc_param_name, initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 01a9c098b0e913..83cec6d60443fa 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -26,7 +26,7 @@ import paddle.fluid.nets as nets import paddle.nn.functional as F from paddle.fluid import core -from paddle.fluid.dygraph import base, nn, to_variable +from paddle.fluid.dygraph import base, to_variable from paddle.fluid.framework import ( Program, _test_eager_guard, @@ -732,8 +732,8 @@ def test_embeding(self): )[0] with self.static_graph(): data_t = layers.data(name='word', shape=[1], dtype='int64') - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr='emb.w', sparse=False ) emb_rlt = emb2(data_t) static_rlt2 = self.get_static_graph_result( @@ -741,16 +741,17 @@ def test_embeding(self): )[0] with self.dynamic_graph(): with _test_eager_guard(): - emb2 = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, + emb2 = paddle.nn.Embedding( + dict_size, + 32, + weight_attr='eager_emb.w', + sparse=False, ) dy_eager_rlt = emb2(base.to_variable(inp_word)) dy_eager_rlt_value = dy_eager_rlt.numpy() - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr='emb.w', sparse=False ) dy_rlt = emb2(base.to_variable(inp_word)) dy_rlt_value = dy_rlt.numpy() @@ -767,11 +768,12 @@ def test_embeding(self): custom_weight ) ) - emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False) - emb2 = nn.Embedding( - size=[dict_size, 32], - param_attr=weight_attr, - is_sparse=False, + emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) + emb2 = paddle.nn.Embedding( + dict_size, + 32, + weight_attr=weight_attr, + sparse=False, ) rep1 = emb1(base.to_variable(inp_word)) rep2 = emb2(base.to_variable(inp_word)) @@ -797,9 +799,9 @@ def test_embeding(self): custom_weight ) ) - emb1 = nn.Embedding(size=[dict_size, 32], 
is_sparse=False) - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr=weight_attr, is_sparse=False + emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr=weight_attr, sparse=False ) rep1 = emb1(base.to_variable(inp_word)) rep2 = emb2(base.to_variable(inp_word)) diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 289b5ec40f9d5f..4f0e02fdf613c0 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -17,9 +17,9 @@ import sys import unittest -import paddle +import numpy as np -paddle.enable_static() +import paddle class TestNanInf(unittest.TestCase): @@ -47,12 +47,7 @@ def check_nan_inf(self): print(err) # in python3, type(out+err) is 'bytes', need use encode - if paddle.fluid.core.is_compiled_with_cuda(): - assert (out + err).find('find_nan=1, find_inf=1'.encode()) != -1 - else: - assert (out + err).find( - 'There are `nan` or `inf` in tensor'.encode() - ) != -1 + assert (out + err).find('There are NAN or INF'.encode()) != -1 def test_nan_inf_in_static_mode(self): self._python_interp += " check_nan_inf_base.py" @@ -75,5 +70,97 @@ def setUp(self): ) +class TestNanInfCheckResult(unittest.TestCase): + def generate_inputs(self, shape, dtype="float32"): + data = np.random.random(size=shape).astype(dtype) + # [-10, 10) + x = (data * 20 - 10) * np.random.randint( + low=0, high=2, size=shape + ).astype(dtype) + y = np.random.randint(low=0, high=2, size=shape).astype(dtype) + return x, y + + def get_reference_num_nan_inf(self, x): + out = np.log(x) + num_nan = np.sum(np.isnan(out)) + num_inf = np.sum(np.isinf(out)) + print("[reference] num_nan={}, num_inf={}".format(num_nan, num_inf)) + return num_nan, num_inf + + def get_num_nan_inf(self, x_np, use_cuda=True, add_assert=False): + num_nan = 0 + num_inf = 0 + try: + if use_cuda: + paddle.device.set_device("gpu:0") + else: + paddle.device.set_device("cpu") + x = paddle.to_tensor(x_np) + out = paddle.log(x) + sys.stdout.flush() + if add_assert: + assert False + except Exception as e: + # Cannot catch the log in CUDA kernel. 
+ err_str_list = ( + str(e) + .replace("(", " ") + .replace(")", " ") + .replace(",", " ") + .split(" ") + ) + for err_str in err_str_list: + if "num_nan" in err_str: + num_nan = int(err_str.split("=")[1]) + elif "num_inf" in err_str: + num_inf = int(err_str.split("=")[1]) + print("[paddle] num_nan={}, num_inf={}".format(num_nan, num_inf)) + return num_nan, num_inf + + def test_num_nan_inf(self): + def _check_num_nan_inf(use_cuda): + shape = [32, 32] + x_np, _ = self.generate_inputs(shape) + num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np) + add_assert = (num_nan_np + num_inf_np) > 0 + num_nan, num_inf = self.get_num_nan_inf(x_np, use_cuda, add_assert) + if not use_cuda: + assert num_nan == num_nan_np and num_inf == num_inf_np + + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0} + ) + _check_num_nan_inf(use_cuda=False) + if paddle.fluid.core.is_compiled_with_cuda(): + _check_num_nan_inf(use_cuda=True) + + def check_nan_inf_level(self, use_cuda, dtype): + shape = [8, 8] + x_np, y_np = self.generate_inputs(shape, dtype) + + if use_cuda: + paddle.device.set_device("gpu:0") + else: + paddle.device.set_device("cpu") + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + out = paddle.log(x * 1e6) / y + + def test_check_nan_inf_level_float32(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 2} + ) + self.check_nan_inf_level(use_cuda=False, dtype="float32") + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float32") + + def test_check_nan_inf_level_float16(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} + ) + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float16") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index bfa6966e543b09..f990c2171b92e3 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -689,9 +689,7 @@ def model_init( beam_size=4, max_step_num=20, ): - embedder = paddle.fluid.dygraph.Embedding( - size=[vocab_size, embed_dim], dtype="float64" - ) + embedder = paddle.nn.Embedding(vocab_size, embed_dim) output_layer = nn.Linear(hidden_size, vocab_size) cell = nn.LSTMCell(embed_dim, hidden_size) self.max_step_num = max_step_num diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 5413b5983cd19b..bdb88e7290b570 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -75,8 +75,8 @@ from ..fluid.layers import exponential_decay # noqa: F401 from .nn.common import batch_norm # noqa: F401 -from paddle.static.nn.metric import auc # noqa: F401 -from paddle.static.nn.metric import accuracy # noqa: F401 +from .nn.metric import auc # noqa: F401 +from .nn.metric import accuracy # noqa: F401 __all__ = [ # noqa 'append_backward', diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py index 3ed54ddd2cceee..9b386c38c8e68a 100644 --- a/python/paddle/static/nn/metric.py +++ b/python/paddle/static/nn/metric.py @@ -22,7 +22,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import tensor -__all__ = ['accuracy', 'auc'] +__all__ = [] def accuracy(input, label, k=1, correct=None, total=None):
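
Note on the Embedding updates above: every test in this patch migrates from the legacy paddle.fluid.dygraph.nn.Embedding to paddle.nn.Embedding, and the rewrite is mechanical: the size=[vocab, dim] list becomes two positional arguments, param_attr becomes weight_attr, is_sparse becomes sparse, and the explicit dtype argument is dropped (the layer follows the default dtype, which is why some tests call paddle.set_default_dtype). The sketch below summarizes the mapping; the sizes, parameter name, and input tensor are illustrative only and are not taken from any particular test.

# Minimal sketch of the old vs. new Embedding construction (illustrative values).
import paddle
import paddle.fluid as fluid

vocab_size, hidden_size = 1000, 64

# Legacy dygraph API removed throughout this patch:
#     emb = fluid.dygraph.nn.Embedding(
#         size=[vocab_size, hidden_size],
#         dtype='float32',
#         is_sparse=True,
#         param_attr=fluid.ParamAttr(name='embedding_para'),
#     )

# Replacement used in the updated tests:
emb = paddle.nn.Embedding(
    vocab_size,
    hidden_size,
    sparse=True,
    weight_attr=fluid.ParamAttr(name='embedding_para'),
)

ids = paddle.to_tensor([[1, 2, 3]], dtype='int64')
out = emb(ids)  # shape [1, 3, hidden_size], default dtype (float32)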