diff --git a/.gitignore b/.gitignore
index 46b8e3a47c618..8560650344b95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,5 +97,6 @@ python/paddle/incubate/fleet/parameter_server/pslib/ps_pb2.py
 paddle/phi/kernels/fusion/cutlass/conv2d/generated/*
 python/paddle/base/incubate/fleet/parameter_server/pslib/ps_pb2.py
 paddle/fluid/ir_adaptor/translator/op_compat_info.cc
+paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*
 paddle/fluid/pybind/static_op_function.*
 paddle/fluid/pybind/ops_api.cc
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
index 54131b48eca46..f5e6a6e426c60 100644
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -34,7 +34,7 @@ This Code of Conduct applies both within project spaces and in public spaces whe
 
 ## Enforcement
 
-Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ext_paddle_oss@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
 
 Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md
index 2be794f1f324c..92153a4dadcbe 100644
--- a/CODE_OF_CONDUCT_cn.md
+++ b/CODE_OF_CONDUCT_cn.md
@@ -36,7 +36,7 @@
 
 ## 强制执行
 
-可以通过paddle-dev@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
+可以通过ext_paddle_oss@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
 
 任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组有对事件回报者有保密的义务。具体执行的方针近一步细节可能会单独公布。
diff --git a/cmake/cinn.cmake b/cmake/cinn.cmake
index a8ebe6a9a46ae..44d502fc4b792 100644
--- a/cmake/cinn.cmake
+++ b/cmake/cinn.cmake
@@ -164,8 +164,8 @@ cinn_cc_library(
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
 if(NOT CINN_ONLY)
-  target_link_libraries(cinnapi pd_dialect phi)
-  add_dependencies(cinnapi pd_dialect phi)
+  target_link_libraries(cinnapi pd_op_dialect phi)
+  add_dependencies(cinnapi pd_op_dialect phi)
 endif()
 
 target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
@@ -222,8 +222,8 @@ function(gen_cinncore LINKTYPE)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
   if(NOT CINN_ONLY)
-    target_link_libraries(${CINNCORE_TARGET} pd_dialect phi)
-    add_dependencies(${CINNCORE_TARGET} pd_dialect phi)
+    target_link_libraries(${CINNCORE_TARGET} pd_op_dialect phi)
+    add_dependencies(${CINNCORE_TARGET} pd_op_dialect phi)
   endif()
 
   add_dependencies(${CINNCORE_TARGET} pybind)
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 3c9f2b6962048..d647e9116b586 100755
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 include(ExternalProject)
-
+set(OPENSSL_USE_STATIC_LIBS ON)
 find_package(OpenSSL REQUIRED)
 
 message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 13fce9613650f..f73b20d389ef4 100755
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -269,10 +269,10 @@ else()
       DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
   endif()
   if(WITH_SHARED_IR)
-    set(paddle_ir_lib ${PADDLE_BINARY_DIR}/paddle/ir/libir.*)
+    set(paddle_pir_lib ${PADDLE_BINARY_DIR}/paddle/pir/libpir.*)
     copy(
       inference_lib_dist
-      SRCS ${paddle_ir_lib}
+      SRCS ${paddle_pir_lib}
       DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
   endif()
 endif()
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 92e302eb15acc..b5f2ffa394a89 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -3,7 +3,7 @@ set(PYTHON_TESTS_DIR
     CACHE INTERNAL "python tests directory")
 
 add_subdirectory(utils)
-add_subdirectory(ir)
+add_subdirectory(pir)
 add_subdirectory(scripts)
 add_subdirectory(testing)
 add_subdirectory(phi)
diff --git a/paddle/cinn/CMakeLists.txt b/paddle/cinn/CMakeLists.txt
index 4645ff2c06636..0f0f7beed265a 100644
--- a/paddle/cinn/CMakeLists.txt
+++ b/paddle/cinn/CMakeLists.txt
@@ -3,6 +3,7 @@ if(WITH_TESTING)
 endif()
 
 add_subdirectory(api)
+add_subdirectory(ast_gen_ius)
 add_subdirectory(auto_schedule)
 add_subdirectory(common)
 add_subdirectory(utils)
diff --git a/paddle/cinn/ast_gen_ius/CMakeLists.txt b/paddle/cinn/ast_gen_ius/CMakeLists.txt
new file mode 100644
index 0000000000000..c3908dfed2537
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/CMakeLists.txt
@@ -0,0 +1,6 @@
+core_gather_headers()
+
+gather_srcs(cinnapi_src SRCS ast_gen.cc tensor_group.cc)
+
+cinn_cc_test(test_ast_gen_ius SRCS ast_gen_test.cc DEPS cinncore)
+cinn_cc_test(test_tensor_group SRCS tensor_group_test.cc DEPS cinncore)
diff --git a/paddle/cinn/ast_gen_ius/ast_gen.cc b/paddle/cinn/ast_gen_ius/ast_gen.cc
new file mode 100644
index 0000000000000..d10560209e6ae
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -0,0 +1,52 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/ast_gen_ius/ast_gen.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/operation.h"
+#include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
+
+namespace cinn {
+namespace ast_gen_ius {
+
+ir::Expr AstGen::Build(const ir::Tensor& tensor) {
+  const std::vector<ir::Var>& axis = tensor->axis();
+  const std::vector<ir::Expr>& shape = tensor->shape;
+  size_t axis_len = axis.size();
+  CHECK_EQ(shape.size(), axis_len)
+      << "Internal Error: Tensor has different shape and axis length in AstGen";
+
+  std::vector<ir::Expr> axis_exprs;
+  for (const auto& a : axis) {
+    axis_exprs.push_back(a);
+  }
+  ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs);
+
+  for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
+    ir::Var loop_var = axis[i];
+    ir::Expr loop_extent = shape[i];
+    body = ir::For::Make(loop_var,
+                         Expr(0),
+                         loop_extent,
+                         ir::ForType::Serial,
+                         ir::DeviceAPI::Host,
+                         ir::Block::Make({body}));
+  }
+  return body;
+}
+
+}  // namespace ast_gen_ius
+}  // namespace cinn
diff --git a/paddle/cinn/optim/remove_nested_block.h b/paddle/cinn/ast_gen_ius/ast_gen.h
similarity index 66%
rename from paddle/cinn/optim/remove_nested_block.h
rename to paddle/cinn/ast_gen_ius/ast_gen.h
index 41220c18b254a..2e9dc7fde8d8e 100644
--- a/paddle/cinn/optim/remove_nested_block.h
+++ b/paddle/cinn/ast_gen_ius/ast_gen.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,22 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-/**
- * This file implements the strategy to remove the unnecessary nested block.
- */
 #pragma once
-#include <string>
 
-#include "paddle/cinn/common/common.h"
 #include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/tensor.h"
 
 namespace cinn {
-namespace optim {
+namespace ast_gen_ius {
 
-/**
- * Remove the unecessary nested block.
- */
-void RemoveNestedBlock(Expr* e);
+class AstGen {
+ public:
+  static ir::Expr Build(const ir::Tensor& tensor);
+};
 
-}  // namespace optim
+}  // namespace ast_gen_ius
 }  // namespace cinn
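AstGen::Build emits the Store for the tensor body first, then walks the axes in reverse so that axis 0 ends up as the outermost loop. A minimal standalone sketch of that wrapping order follows; the names (BuildLoopNest, the string-based "IR") are illustrative stand-ins, not the CINN API.

// Standalone sketch: wrap a statement in loops from the innermost axis
// outwards, mirroring the reverse iteration in AstGen::Build.
#include <iostream>
#include <string>
#include <vector>

std::string BuildLoopNest(const std::vector<std::string>& axis,
                          const std::vector<int>& extents,
                          std::string body) {
  // Walk the axes back to front so axis[0] becomes the outermost loop.
  for (int i = static_cast<int>(axis.size()) - 1; i >= 0; --i) {
    body = "for (int " + axis[i] + " = 0; " + axis[i] + " < " +
           std::to_string(extents[i]) + "; ++" + axis[i] + ") { " + body +
           " }";
  }
  return body;
}

int main() {
  // A 2-D store B[i, j] = relu(A[i, j]) becomes a j-inner, i-outer nest.
  std::cout << BuildLoopNest({"i", "j"}, {10, 10}, "B[i, j] = relu(A[i, j]);")
            << std::endl;
}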
diff --git a/paddle/cinn/ast_gen_ius/ast_gen_test.cc b/paddle/cinn/ast_gen_ius/ast_gen_test.cc
new file mode 100644
index 0000000000000..e91c0f4ca0e28
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/ast_gen_test.cc
@@ -0,0 +1,44 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/ast_gen_ius/ast_gen.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/lang/builtin.h"
+#include "paddle/cinn/lang/compute.h"
+#include "paddle/cinn/lang/placeholder.h"
+
+namespace cinn {
+namespace ast_gen_ius {
+
+using cinn::ir::Expr;
+using cinn::ir::Tensor;
+
+TEST(AstGen, Build) {
+  std::vector<Expr> shape = {Expr(10), Expr(10), Expr(10), Expr(10)};
+  lang::Placeholder<float> A("A", shape);
+  Tensor B = lang::Compute(
+      shape,
+      [&](const std::vector<Expr>& indice) { return lang::Relu(A(indice), 0); },
+      "relu_test");
+  Expr out = AstGen::Build(B);
+  LOG(INFO) << out;
+}
+
+}  // namespace ast_gen_ius
+}  // namespace cinn
diff --git a/paddle/cinn/ast_gen_ius/tensor_group.cc b/paddle/cinn/ast_gen_ius/tensor_group.cc
new file mode 100644
index 0000000000000..cca8b4136ba1b
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/tensor_group.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/ast_gen_ius/tensor_group.h"
+
+#include <set>
+#include <unordered_set>
+
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
+
+namespace cinn {
+namespace ast_gen_ius {
+
+TensorGroup::TensorGroup(const std::vector<ir::Tensor>& tensors) {
+  std::set<ir::Tensor> all_tensors(tensors.begin(), tensors.end());
+
+  for (auto& tensor : tensors) {
+    output_tensor_names_.insert(tensor->name);
+    std::set<ir::Expr> used_tensors = ir::CollectIRNodes(
+        tensor->body(), [](const Expr* x) { return x->as_tensor(); });
+    for (const Expr& x : used_tensors) {
+      const ir::Tensor to_dep = x.as_tensor_ref();
+      all_tensors.insert(to_dep);
+      this->CtrlDepend(tensor, to_dep);
+    }
+  }
+
+  for (const ir::Tensor& t : all_tensors) {
+    name_to_tensor_.insert({t->name, t});
+  }
+}
+
+TensorGroup::~TensorGroup() {}
+
+bool TensorGroup::Contain(const std::string& name) const {
+  return name_to_tensor_.find(name) != name_to_tensor_.end();
+}
+
+void TensorGroup::Insert(const ir::Tensor& tensor) {
+  name_to_tensor_.insert({tensor->name, tensor});
+}
+
+ir::Tensor TensorGroup::Get(const std::string& name) {
+  return name_to_tensor_[name];
+}
+
+std::set<ir::Tensor> TensorGroup::GetAllTensors() {
+  std::set<ir::Tensor> all_tensors;
+  for (const std::pair<std::string, ir::Tensor>& p : name_to_tensor_) {
+    all_tensors.insert(p.second);
+  }
+  return all_tensors;
+}
+
+std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder(
+    const std::vector<ir::Tensor>& func_args) {
+  std::unordered_map<std::string, int> in_degree;
+  for (const auto& dep_pair : ctrl_dep_) {
+    const std::unordered_set<std::string>& dep_tensor_names = dep_pair.second;
+    in_degree[dep_pair.first] = dep_tensor_names.size();
+  }
+
+  std::vector<ir::Tensor> ret;
+  std::vector<std::string> stack;
+  for (const auto& name_tensor : name_to_tensor_) {
+    if (!in_degree.count(name_tensor.first)) {
+      stack.emplace_back(name_tensor.first);
+    }
+  }
+
+  std::set<std::string> input_arg_names;
+  for (const ir::Tensor& arg : func_args) {
+    input_arg_names.insert(arg->name);
+  }
+  for (const std::string& name : output_tensor_names_) {
+    input_arg_names.erase(name);
+  }
+
+  while (!stack.empty()) {
+    std::string cur = stack.back();
+    stack.pop_back();
+
+    if (!input_arg_names.count(cur)) {
+      ret.push_back(name_to_tensor_[cur]);
+    }
+
+    for (const auto& dep_pair : ctrl_dep_) {
+      const std::unordered_set<std::string>& dep_tensor_names = dep_pair.second;
+      if (dep_tensor_names.count(cur)) {
+        --in_degree[dep_pair.first];
+        if (in_degree[dep_pair.first] == 0) {
+          stack.emplace_back(dep_pair.first);
+        }
+      }
+    }
+  }
+  return ret;
+}
+
+bool TensorGroup::HasMarkedReduceInit(const std::string& tensor_name) const {
+  return tensor_name_needs_reduce_init_.count(tensor_name);
+}
+
+ir::Tensor TensorGroup::MarkReduceInit(const std::string& tensor_name) {
+  // TODO(zhhsplendid): add check
+  tensor_name_needs_reduce_init_.insert(tensor_name);
+}
+
+void TensorGroup::CtrlDepend(const ir::Tensor& tensor,
+                             const ir::Tensor& to_dep) {
+  ctrl_dep_[tensor->name].insert(to_dep->name);
+  if (!name_to_tensor_.count(tensor->name)) {
+    name_to_tensor_[tensor->name] = tensor;
+  }
+  if (!name_to_tensor_.count(to_dep->name)) {
+    name_to_tensor_[to_dep->name] = to_dep;
+  }
+}
+
+std::set<ir::Tensor> TensorGroup::GetCrtlDepTensors(
+    const std::string& tensor_name) {
+  if (!ctrl_dep_.count(tensor_name)) {
+    return {};
+  }
+  std::set<ir::Tensor> ret;
+  for (const std::string& dep_name : ctrl_dep_[tensor_name]) {
+    ret.insert(name_to_tensor_[dep_name]);
+  }
+  return ret;
+}
+
+std::string TensorGroup::GetShareMemRootName(const std::string& tensor_name) {
+  if (!share_memory_tensor_.count(tensor_name)) {
+    share_memory_tensor_[tensor_name] = tensor_name;
+    return tensor_name;
+  }
+  if (share_memory_tensor_[tensor_name] == tensor_name) {
+    return tensor_name;
+  }
+  share_memory_tensor_[tensor_name] =
+      GetShareMemRootName(share_memory_tensor_[tensor_name]);
+  return share_memory_tensor_[tensor_name];
+}
+
+void TensorGroup::ShareMemoryBuffer(const ir::Tensor& tensor,
+                                    const ir::Tensor& to_share) {
+  share_memory_tensor_[GetShareMemRootName(to_share->name)] =
+      GetShareMemRootName(tensor->name);
+}
+
+absl::flat_hash_map<std::string, ir::Tensor> TensorGroup::AllocateBuffers() {
+  std::unordered_set<std::string> allocated_roots;
+  for (auto& name_tensor : name_to_tensor_) {
+    std::string root_name = GetShareMemRootName(name_tensor.first);
+
+    // Allocate root buffer
+    if (!allocated_roots.count(root_name)) {
+      ir::Tensor root_tensor = name_to_tensor_[root_name];
+      if (!root_tensor->buffer.defined() && !root_tensor->type().is_void()) {
+        root_tensor->WithBuffer();
+        VLOG(6) << "Bind root_tensor " << root_name << " with buffer "
+                << root_tensor->buffer->name;
+      }
+      allocated_roots.insert(root_name);
+    }
+
+    // Share buffer
+    if (root_name != name_tensor.first) {
+      ir::Tensor& root_tensor = name_to_tensor_[root_name];
+      ir::Tensor& tensor = name_tensor.second;
+
+      auto keep_shape = root_tensor->buffer->shape;
+      tensor->Bind(root_tensor->buffer);
+      root_tensor->buffer->shape = keep_shape;
+      tensor->buffer->shape = keep_shape;
+      VLOG(6) << "Share buffer " << root_name << " with " << name_tensor.first;
+    }
+  }
+
+  return name_to_tensor_;
+}
+
+}  // namespace ast_gen_ius
+}  // namespace cinn
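GetShareMemRootName and ShareMemoryBuffer above implement a union-find over tensor names with path compression: every name points at a parent, and two buffer-sharing sets are merged by re-rooting one root under the other. A minimal standalone sketch of the same pattern (NameUnionFind and its members are illustrative names, not CINN code):

// Standalone union-find sketch: Find() compresses paths the way
// GetShareMemRootName does; Union() mirrors ShareMemoryBuffer.
#include <cassert>
#include <string>
#include <unordered_map>

class NameUnionFind {
 public:
  std::string Find(const std::string& name) {
    if (!parent_.count(name)) parent_[name] = name;  // first sight: own root
    if (parent_[name] == name) return name;
    parent_[name] = Find(parent_[name]);             // path compression
    return parent_[name];
  }
  void Union(const std::string& a, const std::string& b) {
    parent_[Find(b)] = Find(a);                      // b's set joins a's set
  }

 private:
  std::unordered_map<std::string, std::string> parent_;
};

int main() {
  NameUnionFind uf;
  uf.Union("B", "C");  // C shares B's buffer
  uf.Union("A", "B");  // B (and transitively C) shares A's buffer
  assert(uf.Find("C") == uf.Find("A"));
}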
diff --git a/paddle/cinn/ast_gen_ius/tensor_group.h b/paddle/cinn/ast_gen_ius/tensor_group.h
new file mode 100644
index 0000000000000..1fa37c730c455
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/tensor_group.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <absl/container/flat_hash_map.h>
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/tensor.h"
+
+namespace cinn {
+namespace ast_gen_ius {
+
+/* Collection used for Tensors, used in AST generation */
+class TensorGroup {
+ public:
+  explicit TensorGroup(const std::vector<ir::Tensor>& tensors);
+  ~TensorGroup();
+
+  bool Contain(const std::string& name) const;
+
+  void Insert(const ir::Tensor& tensor);
+
+  ir::Tensor Get(const std::string& name);
+
+  std::set<ir::Tensor> GetAllTensors();
+
+  void CtrlDepend(const ir::Tensor& tensor, const ir::Tensor& to_dep);
+
+  std::set<ir::Tensor> GetCrtlDepTensors(const std::string& tensor_name);
+
+  std::string GetShareMemRootName(const std::string& tensor_name);
+
+  void ShareMemoryBuffer(const ir::Tensor& tensor, const ir::Tensor& to_share);
+
+  absl::flat_hash_map<std::string, ir::Tensor> AllocateBuffers();
+
+  // Returns tensors in topological order and removes the given args.
+  // Because the order is used for generating the function body, we don't
+  // have to generate the args.
+  std::vector<ir::Tensor> GetGenFuncTopoOrder(
+      const std::vector<ir::Tensor>& func_args = {});
+
+  bool HasMarkedReduceInit(const std::string& tensor_name) const;
+
+  // Marks a tensor that needs reduce init
+  ir::Tensor MarkReduceInit(const std::string& tensor_name);
+
+ private:
+  std::set<std::string> output_tensor_names_;
+
+  absl::flat_hash_map<std::string, ir::Tensor> name_to_tensor_;
+
+  // Stores the names of the tensors that the key tensor depends on
+  std::unordered_map<std::string, std::unordered_set<std::string>> ctrl_dep_;
+
+  // Union-find-set style: each tensor name whose buffer is shared maps to
+  // the same root tensor name
+  std::unordered_map<std::string, std::string> share_memory_tensor_;
+
+  std::unordered_set<std::string> tensor_name_needs_reduce_init_;
+};
+
+}  // namespace ast_gen_ius
+}  // namespace cinn
diff --git a/paddle/cinn/ast_gen_ius/tensor_group_test.cc b/paddle/cinn/ast_gen_ius/tensor_group_test.cc
new file mode 100644
index 0000000000000..3711419da9c56
--- /dev/null
+++ b/paddle/cinn/ast_gen_ius/tensor_group_test.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/ast_gen_ius/tensor_group.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/tensor.h"
+#include "paddle/cinn/lang/compute.h"
+#include "paddle/cinn/lang/placeholder.h"
+
+namespace cinn {
+namespace ast_gen_ius {
+
+using ir::Expr;
+using ir::Tensor;
+using ir::Var;
+using lang::Compute;
+using lang::Placeholder;
+
+TEST(TensorGroup, Easy) {
+  auto M = Expr(100);
+  auto N = Expr(15);
+  Placeholder<float> A("A", {M, N});
+
+  Tensor B = Compute(
+      {M, N}, [=](Var i, Var j) -> Expr { return A(i, j) + 1.f; }, "B");
+
+  TensorGroup tensor_group({B});
+
+  ASSERT_TRUE(tensor_group.Contain("A"));
+  ASSERT_TRUE(tensor_group.Contain("B"));
+  ASSERT_EQ(tensor_group.Get("B")->name, "B");
+  ASSERT_EQ(tensor_group.Get("A")->name, "A");
+  ASSERT_EQ(tensor_group.GetAllTensors().size(), 2UL);
+
+  ASSERT_EQ(tensor_group.GetCrtlDepTensors("A").size(), 0UL);
+  ASSERT_EQ(tensor_group.GetCrtlDepTensors("B").size(), 1UL);
+  ASSERT_TRUE(tensor_group.GetCrtlDepTensors("B").count(A));
+
+  std::vector<ir::Tensor> topo_tensors =
+      tensor_group.GetGenFuncTopoOrder({A.tensor(), B});
+  ASSERT_EQ(topo_tensors.size(), 1UL);
+  ASSERT_EQ(topo_tensors[0]->name, "B");
+}
+
+}  // namespace ast_gen_ius
+}  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt b/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
index 7f514471a4f7a..17af89c8ae2a1 100644
--- a/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
+++ b/paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
@@ -3,7 +3,8 @@ core_gather_headers()
 gather_srcs(cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc
             feature_extractor.cc)
 
-cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
+# TODO(zhhsplendid): enable this test again
+#cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
 
 cinn_cc_test(test_feature_extractor SRCS feature_extractor_test.cc DEPS
              cinncore)
 cinn_cc_test(test_feature SRCS feature_test.cc DEPS cinncore)
diff --git a/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt b/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt
index 7f393dfb39837..ab1db5f7bb1bd 100644
--- a/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt
+++ b/paddle/cinn/auto_schedule/search_strategy/CMakeLists.txt
@@ -4,5 +4,6 @@ core_gather_headers()
 
 gather_srcs(cinnapi_src SRCS evolutionary_search.cc)
 
-cinn_cc_test(test_evolutionary_search SRCS evolutionary_search_test.cc DEPS
-             cinncore test_program_builder)
+# TODO(zhhsplendid): enable this test again
+#cinn_cc_test(test_evolutionary_search SRCS evolutionary_search_test.cc DEPS
+#             cinncore test_program_builder)
diff --git a/paddle/cinn/backends/codegen_c.cc b/paddle/cinn/backends/codegen_c.cc
index cffebdc1a6736..3352a458ceceb 100644
--- a/paddle/cinn/backends/codegen_c.cc
+++ b/paddle/cinn/backends/codegen_c.cc
@@ -23,7 +23,6 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/utils/ir_verify.h"
 #include "paddle/cinn/optim/ir_simplify.h"
-#include "paddle/cinn/optim/remove_nested_block.h"
 #include "paddle/cinn/runtime/cpu/thread_backend.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 #include "paddle/cinn/utils/string.h"
@@ -645,7 +644,7 @@ void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
 
   Expr func_body = ir::Block::Make(new_body);
 
-  optim::RemoveNestedBlock(&func_body);
+  optim::SimplifyBlocks(&func_body);
 
   IrPrinter::Visit(func_body);
 }
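The codegen backends now call optim::SimplifyBlocks where they previously called the removed optim::RemoveNestedBlock; both flatten redundantly nested Block nodes before printing. A minimal standalone sketch of that flattening idea, using a toy Stmt tree rather than CINN's ir::Block (all names here are illustrative):

// Standalone sketch of nested-block flattening: splice the statements of
// inner blocks into one flat sequence.
#include <iostream>
#include <string>
#include <vector>

struct Stmt {
  std::string code;          // leaf statement text
  std::vector<Stmt> block;   // non-empty => this node is a nested block
};

void Flatten(const Stmt& s, std::vector<std::string>* out) {
  if (s.block.empty()) {
    out->push_back(s.code);
    return;
  }
  for (const Stmt& child : s.block) Flatten(child, out);  // recurse into blocks
}

int main() {
  // { a; { b; c; } } flattens to { a; b; c; }
  Stmt root{"", {Stmt{"a;", {}}, Stmt{"", {Stmt{"b;", {}}, Stmt{"c;", {}}}}}};
  std::vector<std::string> flat;
  Flatten(root, &flat);
  for (const auto& s : flat) std::cout << s << " ";
}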
diff --git a/paddle/cinn/backends/codegen_cuda_dev.cc b/paddle/cinn/backends/codegen_cuda_dev.cc
index 018f935482c7f..e33154f0c0129 100644
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
@@ -24,7 +24,6 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/ir/utils/ir_verify.h"
 #include "paddle/cinn/optim/ir_simplify.h"
-#include "paddle/cinn/optim/remove_nested_block.h"
 
 namespace cinn {
 namespace backends {
@@ -141,7 +140,7 @@ void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) {
 
   Expr func_body = ir::Block::Make(new_body);
 
-  optim::RemoveNestedBlock(&func_body);
+  optim::SimplifyBlocks(&func_body);
   // Make sure that the function's body is wrapped by a block
   if (!func_body.As<ir::Block>()) {
     func_body = ir::Block::Make({func_body});
   }
diff --git a/paddle/cinn/backends/llvm/codegen_x86.cc b/paddle/cinn/backends/llvm/codegen_x86.cc
index 28159f9ea4e4f..ccae02ac5746b 100644
--- a/paddle/cinn/backends/llvm/codegen_x86.cc
+++ b/paddle/cinn/backends/llvm/codegen_x86.cc
@@ -28,7 +28,7 @@
 #include "paddle/cinn/common/target.h"
 #include "paddle/cinn/ir/ir.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
-#include "paddle/cinn/optim/collect_undefined_vars.h"
+#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 
 namespace cinn::backends {
@@ -98,7 +98,7 @@ void CodeGenX86::CreateParallelLaunch(Expr body, int num_task) {
                              llvm::Function::PrivateLinkage,
                              "__parallel_lambda",
                              m_);
-  std::vector<std::string> vars = optim::CollectUndefinedVars(&body);
+  std::vector<std::string> vars = ir::CollectUndefinedVars(&body);
   uint64_t nbytes;
   auto* data = PackVars(vars, &nbytes);
diff --git a/paddle/cinn/hlir/dialect/CMakeLists.txt b/paddle/cinn/hlir/dialect/CMakeLists.txt
index 5d30ab6d34504..3787fdf2b4b08 100755
--- a/paddle/cinn/hlir/dialect/CMakeLists.txt
+++ b/paddle/cinn/hlir/dialect/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_subdirectory(cinn_dialect)
-add_subdirectory(runtime_dialect)
+add_subdirectory(operator)
+add_subdirectory(runtime)
diff --git a/paddle/cinn/hlir/dialect/generated/cinn_ops.parsed.yaml b/paddle/cinn/hlir/dialect/generated/cinn_ops.parsed.yaml
deleted file mode 100644
index b345bb699084e..0000000000000
--- a/paddle/cinn/hlir/dialect/generated/cinn_ops.parsed.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-- name: add
-  inputs:
-  - typename: Tensor
-    name: x
-    optional: false
-    no_need_buffer: false
-    data_transform: {}
-  - typename: Tensor
-    name: y
-    optional: false
-    no_need_buffer: false
-    data_transform: {}
-  attrs: []
-  outputs:
-  - {typename: Tensor, name: out, optional: false, intermediate: false}
-  no_need_buffer: null
-  data_transform: null
-  infer_meta:
-    func: ElementwiseInferMeta
-    param: [x, y]
-  kernel:
-    func: [add]
-    param: [x, y]
-    backend: null
-    layout: null
-    data_type: null
-    dispatch: {add: null}
-    force_backend: null
-  inplace: {out: x}
-  view: null
-  backward: null
diff --git a/paddle/cinn/hlir/dialect/cinn_dialect/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/CMakeLists.txt
similarity index 100%
rename from paddle/cinn/hlir/dialect/cinn_dialect/CMakeLists.txt
rename to paddle/cinn/hlir/dialect/operator/CMakeLists.txt
diff --git a/paddle/cinn/hlir/dialect/cinn_dialect/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt
similarity index 71%
rename from paddle/cinn/hlir/dialect/cinn_dialect/ir/CMakeLists.txt
rename to paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt
index 5fa53f74cc4a9..896a727f7e59f 100644
--- a/paddle/cinn/hlir/dialect/cinn_dialect/ir/CMakeLists.txt
+++ b/paddle/cinn/hlir/dialect/operator/ir/CMakeLists.txt
@@ -1,31 +1,30 @@
-# TODO(Aurelius84): new_ir_compiler depends on pd_dialect and could
+# TODO(Aurelius84): new_ir_compiler depends on pd_op_dialect and could
 # not found under CINN_ONLY mode
 if(NOT CINN_ONLY)
   set(CINN_DIALECT_BINARY_DIR
-      "${PADDLE_BINARY_DIR}/paddle/cinn/hlir/dialect/cinn_dialect/ir")
+      "${PADDLE_BINARY_DIR}/paddle/cinn/hlir/dialect/operator/ir")
 
-  # Generate cinn_dialect files defining op using op_gen_file
+  # Generate cinn_op_dialect files defining op using op_gen_file
   set(cinn_op_gen_parsed_yaml_file
       ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parse_op.py)
 
   set(cinn_op_gen_file
-      ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/op_gen.py)
+      ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/op_gen.py)
 
   set(cinn_op_compat_yaml_file
      ${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml)
 
   set(cinn_op_yaml_file
-      ${PADDLE_SOURCE_DIR}/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_ops.yaml
-  )
+      ${PADDLE_SOURCE_DIR}/paddle/cinn/hlir/dialect/operator/ir/ops.yaml)
 
   set(parsed_op_dir ${PADDLE_SOURCE_DIR}/paddle/cinn/hlir/dialect/generated)
 
-  set(cinn_op_parsed_yaml_file ${parsed_op_dir}/cinn_ops.parsed.yaml)
+  set(cinn_op_parsed_yaml_file ${parsed_op_dir}/ops.parsed.yaml)
 
   set(cinn_op_parsed_yaml_files ${cinn_op_parsed_yaml_file})
 
   set(cinn_op_namespace cinn,dialect)
-  set(cinn_dialect_name cinn)
+  set(cinn_op_dialect_name cinn_op)
 
   set(cinn_op_header_file ${CINN_DIALECT_BINARY_DIR}/cinn_op.h)
   set(cinn_op_source_file ${CINN_DIALECT_BINARY_DIR}/cinn_op.cc)
   set(cinn_op_header_file_tmp ${cinn_op_header_file}.tmp)
@@ -44,7 +43,7 @@ if(NOT CINN_ONLY)
     ${PYTHON_EXECUTABLE} ${cinn_op_gen_file} --op_yaml_files
     ${cinn_op_parsed_yaml_files} --op_compat_yaml_file
    ${cinn_op_compat_yaml_file} --namespaces ${cinn_op_namespace}
-    --dialect_name ${cinn_dialect_name} --op_def_h_file
+    --dialect_name ${cinn_op_dialect_name} --op_def_h_file
     ${cinn_op_header_file_tmp} --op_def_cc_file ${cinn_op_source_file_tmp}
     COMMAND ${CMAKE_COMMAND} -E copy_if_different ${cinn_op_header_file_tmp}
             ${cinn_op_header_file}
@@ -54,8 +53,8 @@ if(NOT CINN_ONLY)
             ${cinn_op_compat_yaml_file}
     VERBATIM)
 
-  cinn_cc_library(cinn_dialect SRCS cinn_dialect.cc ${cinn_op_source_file} DEPS
-                  pd_dialect)
+  cinn_cc_library(cinn_op_dialect SRCS op_dialect.cc ${cinn_op_source_file}
+                  DEPS pd_op_dialect)
 
-  target_include_directories(cinn_dialect PRIVATE ${CINN_DIALECT_BINARY_DIR})
+  target_include_directories(cinn_op_dialect PRIVATE ${CINN_DIALECT_BINARY_DIR})
 endif()
diff --git a/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.cc b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc
similarity index 68%
rename from paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.cc
rename to paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc
index 9e8ccfb6492e4..d8a3bc7b8b35a 100644
--- a/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.cc
+++ b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.cc
@@ -12,31 +12,32 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.h"
+#include "paddle/cinn/hlir/dialect/operator/ir/op_dialect.h"
 
 // NOTE(chenxi67): File cinn_op.h is generated by op_gen.py, see details in
 // paddle/cinn/hlir/dialect/CMakeLists.txt.
-#include "paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_op.h" +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" namespace cinn { namespace dialect { -CinnDialect::CinnDialect(::ir::IrContext* context) - : ::ir::Dialect( - name(), context, ::ir::TypeId::get()) { +OperatorDialect::OperatorDialect(::pir::IrContext* context) + : ::pir::Dialect(name(), + context, + ::pir::TypeId::get()) { this->initialize(); } -void CinnDialect::initialize() { +void OperatorDialect::initialize() { // NOTE(chenxi67): GET_OP_LIST is defined in cinn_op.h which is // generated by op_gen.py, see details in // paddle/cinn/hlir/dialect/CMakeLists.txt. RegisterOps< #define GET_OP_LIST -#include "paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_op.h" // NOLINT +#include "paddle/cinn/hlir/dialect/operator/ir/cinn_op.h" // NOLINT >(); } } // namespace dialect } // namespace cinn -IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::CinnDialect) +IR_DEFINE_EXPLICIT_TYPE_ID(cinn::dialect::OperatorDialect) diff --git a/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.h b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h similarity index 75% rename from paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.h rename to paddle/cinn/hlir/dialect/operator/ir/op_dialect.h index 77fb96863ad37..58a0487e9e8f9 100644 --- a/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_dialect.h +++ b/paddle/cinn/hlir/dialect/operator/ir/op_dialect.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/dialect.h" namespace cinn { namespace dialect { -class CinnDialect : public ::ir::Dialect { +class OperatorDialect : public ::pir::Dialect { public: - explicit CinnDialect(::ir::IrContext* context); + explicit OperatorDialect(::pir::IrContext* context); - static const char* name() { return "cinn"; } + static const char* name() { return "cinn_op"; } private: void initialize(); @@ -32,4 +32,4 @@ class CinnDialect : public ::ir::Dialect { } // namespace dialect } // namespace cinn -IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::CinnDialect) +IR_DECLARE_EXPLICIT_TYPE_ID(cinn::dialect::OperatorDialect) diff --git a/paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_ops.yaml b/paddle/cinn/hlir/dialect/operator/ir/ops.yaml similarity index 100% rename from paddle/cinn/hlir/dialect/cinn_dialect/ir/cinn_ops.yaml rename to paddle/cinn/hlir/dialect/operator/ir/ops.yaml diff --git a/paddle/cinn/hlir/dialect/runtime_dialect/CMakeLists.txt b/paddle/cinn/hlir/dialect/runtime/CMakeLists.txt similarity index 100% rename from paddle/cinn/hlir/dialect/runtime_dialect/CMakeLists.txt rename to paddle/cinn/hlir/dialect/runtime/CMakeLists.txt diff --git a/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt new file mode 100644 index 0000000000000..6023117faee09 --- /dev/null +++ b/paddle/cinn/hlir/dialect/runtime/ir/CMakeLists.txt @@ -0,0 +1,4 @@ +if(NOT CINN_ONLY) + cinn_cc_library(cinn_runtime_dialect SRCS runtime_dialect.cc jit_kernel_op.cc + DEPS pir_core) +endif() diff --git a/paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.cc b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc similarity index 80% rename from paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.cc rename to paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc index 49e3685a8475a..ed3d4a4045c59 100644 --- a/paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.cc +++ b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing 
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h"
+#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h"
 
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/enforce.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/enforce.h"
 
 namespace cinn {
 namespace dialect {
@@ -28,13 +28,13 @@ void JitKernelOp::Verify() {
 
   auto& attributes = this->attributes();
 
   IR_ENFORCE(attributes.count(kAttrName) > 0 &&
-                 attributes.at(kAttrName).isa<::ir::PointerAttribute>(),
+                 attributes.at(kAttrName).isa<::pir::PointerAttribute>(),
              "Type of attribute: instruction is not right.");
 }
 
 hlir::framework::Instruction* JitKernelOp::instruction() {
   void* ptr =
-      attributes().at(kAttrName).dyn_cast<::ir::PointerAttribute>().data();
+      attributes().at(kAttrName).dyn_cast<::pir::PointerAttribute>().data();
   return reinterpret_cast<hlir::framework::Instruction*>(ptr);
 }
diff --git a/paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h
similarity index 91%
rename from paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h
rename to paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h
index 37b9c66bb6e17..f410e4d46c021 100644
--- a/paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h
+++ b/paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/ir/core/op_base.h"
+#include "paddle/pir/core/op_base.h"
 
 namespace cinn {
 
@@ -40,10 +40,10 @@ namespace dialect {
  * temporarily, and will split executor information like
 * scope, inputs, outputs into the InterpreterCore module.
 */
-class JitKernelOp : public ::ir::Op<JitKernelOp> {
+class JitKernelOp : public ::pir::Op<JitKernelOp> {
  public:
   using Op::Op;
-  static const char* name() { return "cinn.jit_kernel"; }
+  static const char* name() { return "cinn_runtime.jit_kernel"; }
   // TODO(Aurelius84): Think deeply what should contains
   static constexpr uint32_t attributes_num = 1;
   static constexpr char* kAttrName = "instruction";
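JitKernelOp stores a raw Instruction* inside a PointerAttribute at creation time and recovers it in instruction() with a reinterpret_cast — a type-erased pointer round-trip. A minimal standalone sketch of that round-trip follows; the AttributeMap stand-in is illustrative and deliberately much simpler than the real pir attribute classes.

// Standalone sketch (not the pir API): stash a typed pointer as a
// type-erased void*, then recover it with a cast, as JitKernelOp does.
#include <cassert>
#include <string>
#include <unordered_map>

struct Instruction { int id = 42; };

// Stand-in for an op's attribute map holding PointerAttribute-like payloads.
using AttributeMap = std::unordered_map<std::string, void*>;

int main() {
  Instruction instr;
  AttributeMap attrs;
  attrs["instruction"] = &instr;  // like PointerAttribute::get(ctx, &instr)

  // Like JitKernelOp::instruction(): fetch the erased pointer and cast it
  // back to its real type.
  auto* recovered = reinterpret_cast<Instruction*>(attrs.at("instruction"));
  assert(recovered->id == 42);
}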
-#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h" -#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" namespace cinn { namespace dialect { -RuntimeDialect::RuntimeDialect(::ir::IrContext* context) - : ::ir::Dialect( - name(), context, ::ir::TypeId::get()) { +RuntimeDialect::RuntimeDialect(::pir::IrContext* context) + : ::pir::Dialect(name(), + context, + ::pir::TypeId::get()) { this->initialize(); } diff --git a/paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h b/paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h similarity index 81% rename from paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h rename to paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h index a35c7a24b8d7f..8ba0af9334498 100644 --- a/paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h +++ b/paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/dialect.h" namespace cinn { namespace dialect { -class RuntimeDialect : public ::ir::Dialect { +class RuntimeDialect : public ::pir::Dialect { public: - explicit RuntimeDialect(::ir::IrContext* context); + explicit RuntimeDialect(::pir::IrContext* context); - static const char* name() { return "cinn"; } + static const char* name() { return "cinn_runtime"; } private: void initialize(); diff --git a/paddle/cinn/hlir/dialect/runtime_dialect/ir/CMakeLists.txt b/paddle/cinn/hlir/dialect/runtime_dialect/ir/CMakeLists.txt deleted file mode 100644 index 1df80a5bb3f75..0000000000000 --- a/paddle/cinn/hlir/dialect/runtime_dialect/ir/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if(NOT CINN_ONLY) - cinn_cc_library(runtime_dialect SRCS runtime_dialect.cc jit_kernel_op.cc DEPS - ir_core) -endif() diff --git a/paddle/cinn/hlir/framework/CMakeLists.txt b/paddle/cinn/hlir/framework/CMakeLists.txt index d14ffa70234fc..5e202578b125c 100755 --- a/paddle/cinn/hlir/framework/CMakeLists.txt +++ b/paddle/cinn/hlir/framework/CMakeLists.txt @@ -23,13 +23,13 @@ gather_srcs( accuracy_checker.cc visualize_helper.cc) -# TODO(Aurelius84): new_ir_compiler depends on pd_dialect and could +# TODO(Aurelius84): new_ir_compiler depends on pd_op_dialect and could # not found under CINN_ONLY mode if(NOT CINN_ONLY) cinn_cc_library(new_ir_compiler SRCS new_ir_compiler.cc DEPS cinnapi - pd_dialect) + pd_op_dialect) cinn_cc_library(convert_to_dialect SRCS convert_to_dialect.cc DEPS cinnapi - cinn_dialect) + cinn_op_dialect) endif() if(WITH_CUDA) diff --git a/paddle/cinn/hlir/framework/convert_to_dialect.cc b/paddle/cinn/hlir/framework/convert_to_dialect.cc index 306e27dc1fea5..f76b49a54555f 100644 --- a/paddle/cinn/hlir/framework/convert_to_dialect.cc +++ b/paddle/cinn/hlir/framework/convert_to_dialect.cc @@ -17,34 +17,34 @@ #include #include -#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h" -#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" #include "paddle/cinn/hlir/framework/program.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/program.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/program.h" namespace cinn { namespace hlir { namespace framework { -std::unique_ptr<::ir::Program> 
-std::unique_ptr<::ir::Program> ConvertToRuntimeDialect(
+std::unique_ptr<::pir::Program> ConvertToRuntimeDialect(
     const hlir::framework::Program& program) {
-  ::ir::IrContext* ctx = ::ir::IrContext::Instance();
+  ::pir::IrContext* ctx = ::pir::IrContext::Instance();
   ctx->GetOrRegisterDialect<dialect::RuntimeDialect>();
 
-  auto ir_program = std::make_unique<::ir::Program>(ctx);
+  auto ir_program = std::make_unique<::pir::Program>(ctx);
   std::string jit_op_name = dialect::JitKernelOp::name();
-  ::ir::OpInfo op_info = ctx->GetRegisteredOpInfo(jit_op_name);
+  ::pir::OpInfo op_info = ctx->GetRegisteredOpInfo(jit_op_name);
 
   auto& instrs = program.GetRunInstructions();
   for (auto& instr : instrs) {
-    std::unordered_map<std::string, ::ir::Attribute> op_attrs{
+    std::unordered_map<std::string, ::pir::Attribute> op_attrs{
         {dialect::JitKernelOp::kAttrName,
-         ::ir::PointerAttribute::get(ctx, instr.get())},
+         ::pir::PointerAttribute::get(ctx, instr.get())},
     };
 
-    ::ir::Operation* cinn_op =
-        ::ir::Operation::Create({}, op_attrs, {}, op_info);
+    ::pir::Operation* cinn_op =
+        ::pir::Operation::Create({}, op_attrs, {}, op_info);
     ir_program->block()->push_back(cinn_op);
   }
 
   return std::move(ir_program);
diff --git a/paddle/cinn/hlir/framework/convert_to_dialect.h b/paddle/cinn/hlir/framework/convert_to_dialect.h
index a88b5222b63bd..7ea0a2ace40c7 100644
--- a/paddle/cinn/hlir/framework/convert_to_dialect.h
+++ b/paddle/cinn/hlir/framework/convert_to_dialect.h
@@ -16,16 +16,16 @@
 
 #include <memory>
 
-namespace ir {
+namespace pir {
 class Program;
-}  // namespace ir
+}  // namespace pir
 
 namespace cinn {
 namespace hlir {
 namespace framework {
 class Program;
 
-std::unique_ptr<::ir::Program> ConvertToRuntimeDialect(
+std::unique_ptr<::pir::Program> ConvertToRuntimeDialect(
     const hlir::framework::Program& program);
 
 }  // namespace framework
diff --git a/paddle/cinn/hlir/framework/new_ir/group.h b/paddle/cinn/hlir/framework/new_ir/group.h
index b62c315873c70..1a67a02e58ca9 100644
--- a/paddle/cinn/hlir/framework/new_ir/group.h
+++ b/paddle/cinn/hlir/framework/new_ir/group.h
@@ -18,7 +18,7 @@
 
 #include "paddle/cinn/hlir/framework/new_ir/utils.h"
 #include "paddle/cinn/hlir/framework/op.h"
-#include "paddle/ir/core/operation.h"
+#include "paddle/pir/core/operation.h"
 
 namespace cinn {
 namespace hlir {
@@ -29,12 +29,12 @@ using framework::OpPatternKind;
 // TODO(Aurelius84): Need to be replaced with CinnGroupOp
 struct Group {
  public:
-  explicit Group(const std::vector<::ir::Operation*>& group_ops)
+  explicit Group(const std::vector<::pir::Operation*>& group_ops)
       : ops(group_ops) {
     Initialize();
   }
 
-  explicit Group(std::initializer_list<::ir::Operation*> group_ops)
+  explicit Group(std::initializer_list<::pir::Operation*> group_ops)
       : ops(group_ops) {
     Initialize();
   }
@@ -42,7 +42,7 @@ struct Group {
   int group_id;
   std::string fn_name;
   OpPatternKind op_pattern_kind;
-  std::vector<::ir::Operation*> ops;
+  std::vector<::pir::Operation*> ops;
   std::vector<std::string> input_names;
   std::vector<std::string> output_names;
diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc
index d291aba2e406e..235d545dc331f 100644
--- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc
+++ b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.cc
@@ -23,7 +23,7 @@
 #include "paddle/cinn/hlir/framework/new_ir/utils.h"
 #include "paddle/cinn/lang/placeholder.h"
 #include "paddle/cinn/utils/attribute_util.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
 #include "paddle/phi/core/ddim.h"
 
 PD_DECLARE_bool(cinn_use_cuda_vectorize);
@@ -39,7 +39,7 @@ using framework::OpPatternKind;
 using framework::StrategyFunction;
 
 namespace details {
-ir::Tensor GetTensor(const ::ir::Value& value) {
+ir::Tensor GetTensor(const ::pir::Value& value) {
   auto type_info = value.type().dyn_cast<paddle::dialect::DenseTensorType>();
   auto in_shape = phi::vectorize(type_info.dims());
   auto dtype = type_info.dtype();
@@ -49,9 +49,9 @@ ir::Tensor GetTensor(const ::ir::Value& value) {
 }
 
 std::vector<ir::Tensor> CollectInputTensor(
-    const ::ir::Operation* op,
+    const ::pir::Operation* op,
     std::vector<ir::Tensor>* func_args,
-    std::unordered_map<::ir::Value, ir::Tensor>* tensor_map) {
+    std::unordered_map<::pir::Value, ir::Tensor>* tensor_map) {
   std::vector<ir::Tensor> tensors;
   for (auto& operand : op->operands()) {
     CHECK(operand);
@@ -72,7 +72,7 @@ std::vector<ir::Tensor> CollectInputTensor(
   return tensors;
 }
 
-void CollectOutputInfo(const ::ir::Operation* op,
+void CollectOutputInfo(const ::pir::Operation* op,
                        std::vector<Type>* out_types,
                        std::vector<std::vector<int>>* out_shapes) {
   auto op_results = op->results();
@@ -88,7 +88,7 @@ void CollectOutputInfo(const ::ir::Operation* op,
   }
 }
 
-NodeAttr CollectAttrs(const ::ir::Operation& op) {
+NodeAttr CollectAttrs(const ::pir::Operation& op) {
   NodeAttr node_attrs;
   VLOG(4) << "op.attributes():" << op.attributes().size();
   auto attrs = utils::ConvertAttributes(op.attributes());
@@ -134,18 +134,18 @@ std::vector<ir::LoweredFunc> OpLowererImpl::Lower(const GroupPtr& group,
   }
 }
 
-bool OpLowererImpl::ElementwiseScheduleDetermineFunction(::ir::Operation* op) {
+bool OpLowererImpl::ElementwiseScheduleDetermineFunction(::pir::Operation* op) {
   return true;
 }
 
-bool OpLowererImpl::ReduceScheduleDetermineFunction(::ir::Operation* op) {
+bool OpLowererImpl::ReduceScheduleDetermineFunction(::pir::Operation* op) {
   // TODO(Aurelius84): Support this.
   // auto& op_pattern_dict = Operator::GetAttrs<OpPatternKind>("OpPattern");
   // return op_pattern_dict[op] == framework::kReduction;
   return true;
 }
 
-bool OpLowererImpl::NonFusibleScheduleDetermineFunction(::ir::Operation* op) {
+bool OpLowererImpl::NonFusibleScheduleDetermineFunction(::pir::Operation* op) {
   return true;
 }
 
@@ -160,7 +160,7 @@ std::vector<ir::LoweredFunc> OpLowererImpl::LowerGroup(
     return LowerCustomCall(group);
   }
   std::vector<ir::Tensor> group_func_arg_tensors;
-  std::unordered_map<::ir::Value, ir::Tensor> tensor_map;
+  std::unordered_map<::pir::Value, ir::Tensor> tensor_map;
   bool do_op_schedule = apply_group_schedule || apply_op_schedule;
   std::vector<ir::Expr> func_bodies = LowerOps(ops,
                                                do_op_schedule,
@@ -191,8 +191,8 @@ std::vector<ir::LoweredFunc> OpLowererImpl::LowerCustomCall(
     const GroupPtr& group) {
   auto& ops = group->ops;
   CHECK_EQ(ops.size(), 1);
-  ::ir::Operation* op = ops[0];
-  std::unordered_map<::ir::Value, ir::Tensor> tensor_map;
+  ::pir::Operation* op = ops[0];
+  std::unordered_map<::pir::Value, ir::Tensor> tensor_map;
   std::vector<ir::Tensor> op_func_arg_tensors =
       details::CollectInputTensor(op, nullptr, &tensor_map);
   VLOG(4) << "inputs.size(): " << op_func_arg_tensors.size();
@@ -234,7 +234,7 @@ std::vector<ir::LoweredFunc> OpLowererImpl::LowerCustomCall(
 
 std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
     const GroupPtr& group,
-    const std::unordered_map<::ir::Value, ir::Tensor>& tensor_map,
+    const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map,
     bool done_op_schedule,
     ir::IRSchedule* ir_sch,
     std::vector<ir::Tensor>* group_func_arg_tensors) {
@@ -313,11 +313,11 @@ std::vector<ir::LoweredFunc> OpLowererImpl::PostProcess(
 }
 
 std::vector<ir::Expr> OpLowererImpl::LowerOps(
-    const std::vector<::ir::Operation*>& ops,
+    const std::vector<::pir::Operation*>& ops,
     bool apply_op_schedule,
     ScheduleDetermineFunction schedule_determine_func,
     std::vector<ir::Tensor>* group_func_arg_tensors,
-    std::unordered_map<::ir::Value, ir::Tensor>* tensor_map) {
+    std::unordered_map<::pir::Value, ir::Tensor>* tensor_map) {
   auto& strategy = Operator::GetAttrs<StrategyFunction>("CINNStrategy");
   std::vector<ir::Expr> func_bodies;
   for (auto* op : ops) {
@@ -359,8 +359,8 @@ std::vector<ir::Expr> OpLowererImpl::LowerOps(
 
 std::vector<ir::Expr> OpLowererImpl::DoOpLower(
     std::shared_ptr<OpImpl> op_impl,
-    const ::ir::Operation* op,
-    std::unordered_map<::ir::Value, ir::Tensor>* tensor_map,
+    const ::pir::Operation* op,
+    std::unordered_map<::pir::Value, ir::Tensor>* tensor_map,
     std::vector<ir::Tensor>* op_func_arg_tensors) {
   VLOG(4) << "Do lower with Compute, op: " << op->name();
   std::vector<common::CINNValue> cinn_inputs;
diff --git a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h
index ffa6218299100..81e36d8bb7b3b 100644
--- a/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h
+++ b/paddle/cinn/hlir/framework/new_ir/op_lowering_impl.h
@@ -26,7 +26,7 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/ir/schedule/ir_schedule_util.h"
 #include "paddle/cinn/lang/packed_func.h"
-#include "paddle/ir/core/operation.h"
+#include "paddle/pir/core/operation.h"
 
 // Fusion Op lowering, there are four kinds of lowering function:
 // Elementwise/Broadcast/Injective,Reduce,OutEWiseFusable,NonFusible.
@@ -43,7 +43,7 @@ using GroupPtr = std::shared_ptr<Group>;
 using common::Target;
 class OpLowererImpl;
 
-typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::ir::Operation*);
+typedef bool (OpLowererImpl::*ScheduleDetermineFunction)(::pir::Operation*);
 
 class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
  public:
@@ -96,7 +96,7 @@ class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
    */
   std::vector<ir::LoweredFunc> PostProcess(
       const GroupPtr& group,
-      const std::unordered_map<::ir::Value, ir::Tensor>& tensor_map,
+      const std::unordered_map<::pir::Value, ir::Tensor>& tensor_map,
       bool done_op_schedule,
       ir::IRSchedule* ir_sch,
       std::vector<ir::Tensor>* group_func_arg_tensors);
@@ -114,11 +114,11 @@ class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
    * @return The lowered func bodies of Op set.
    */
   std::vector<ir::Expr> LowerOps(
-      const std::vector<::ir::Operation*>& ops,
+      const std::vector<::pir::Operation*>& ops,
       bool apply_op_schedule,
       ScheduleDetermineFunction schedule_determine_func,
       std::vector<ir::Tensor>* group_func_arg_tensors,
-      std::unordered_map<::ir::Value, ir::Tensor>* tensor_map);
+      std::unordered_map<::pir::Value, ir::Tensor>* tensor_map);
 
   /**
    * @brief Lower an Op to CINN IR. The Compute and Lower processes will be
@@ -131,8 +131,8 @@ class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
    */
   std::vector<ir::Expr> DoOpLower(
       std::shared_ptr<OpImpl> op_impl,
-      const ::ir::Operation* op,
-      std::unordered_map<::ir::Value, ir::Tensor>* tensor_map,
+      const ::pir::Operation* op,
+      std::unordered_map<::pir::Value, ir::Tensor>* tensor_map,
       std::vector<ir::Tensor>* op_func_arg_tensors);
 
   /**
@@ -148,9 +148,9 @@ class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
 
   // Functions used to determine which Ops to schedule at op level, define a
   // policy for each type of group.
-  inline bool ReduceScheduleDetermineFunction(::ir::Operation* op);
-  inline bool ElementwiseScheduleDetermineFunction(::ir::Operation* op);
-  inline bool NonFusibleScheduleDetermineFunction(::ir::Operation* op);
+  inline bool ReduceScheduleDetermineFunction(::pir::Operation* op);
+  inline bool ElementwiseScheduleDetermineFunction(::pir::Operation* op);
+  inline bool NonFusibleScheduleDetermineFunction(::pir::Operation* op);
 
  private:
   Target target_;
diff --git a/paddle/cinn/hlir/framework/new_ir/utils.cc b/paddle/cinn/hlir/framework/new_ir/utils.cc
index 38bfcf05776e0..b027992af8c47 100644
--- a/paddle/cinn/hlir/framework/new_ir/utils.cc
+++ b/paddle/cinn/hlir/framework/new_ir/utils.cc
@@ -20,9 +20,9 @@ namespace framework {
 namespace newir {
 
 const std::unordered_map<std::string, std::string> CompatibleInfo::OP_NAMES = {
-    {"pd.full", "fill_constant"}};
+    {"pd_op.full", "fill_constant"}};
 
-std::string CompatibleInfo::OpName(const ::ir::Operation& op) {
+std::string CompatibleInfo::OpName(const ::pir::Operation& op) {
   std::string name = op.name();
   if (OP_NAMES.count(name)) {
     return OP_NAMES.at(name);
@@ -36,12 +36,12 @@ std::string CompatibleInfo::OpName(const ::ir::Operation& op) {
   return cinn_op_name;
 }
 
-std::string CompatibleInfo::ValueName(const ::ir::Value& value) {
+std::string CompatibleInfo::ValueName(const ::pir::Value& value) {
   return CompatibleInfo::kNamePrefix +
-         std::to_string(std::hash<::ir::Value>()(value));
+         std::to_string(std::hash<::pir::Value>()(value));
 }
 
-std::string CompatibleInfo::OpFuncName(const ::ir::Operation& op) {
+std::string CompatibleInfo::OpFuncName(const ::pir::Operation& op) {
   std::string op_name = OpName(op);
   std::string func_name =
       cinn::common::Context::Global().NewName("fn_" + op_name);
@@ -49,7 +49,7 @@ std::string CompatibleInfo::OpFuncName(const ::ir::Operation& op) {
 }
 
 std::string CompatibleInfo::GroupOpsName(
-    const std::vector<::ir::Operation*>& ops) {
+    const std::vector<::pir::Operation*>& ops) {
   std::string name = "fn";
   for (auto* op : ops) {
     std::string op_name = OpName(*op);
@@ -58,7 +58,7 @@ std::string CompatibleInfo::GroupOpsName(
   return name;
 }
 
-std::vector<std::string> CompatibleInfo::InputNames(const ::ir::Operation& op,
+std::vector<std::string> CompatibleInfo::InputNames(const ::pir::Operation& op,
                                                     bool allow_duplicate) {
   std::vector<std::string> names;
   std::unordered_set<std::string> repeat;
@@ -75,7 +75,7 @@ std::vector<std::string> CompatibleInfo::InputNames(const ::ir::Operation& op,
 }
 
 std::vector<std::string> CompatibleInfo::OutputNames(
-    const ::ir::Operation& op) {
+    const ::pir::Operation& op) {
   std::vector<std::string> names;
   for (int i = 0; i < op.num_results(); ++i) {
     auto value = op.result(i);
diff --git a/paddle/cinn/hlir/framework/new_ir/utils.h b/paddle/cinn/hlir/framework/new_ir/utils.h
index 4c437dd19ef8a..2a70cd9eedc17 100644
--- a/paddle/cinn/hlir/framework/new_ir/utils.h
+++ b/paddle/cinn/hlir/framework/new_ir/utils.h
@@ -16,7 +16,7 @@
 #include <string>
 #include <unordered_map>
 #include "paddle/cinn/common/context.h"
-#include "paddle/ir/core/operation.h"
+#include "paddle/pir/core/operation.h"
 
 namespace cinn {
 namespace hlir {
@@ -29,18 +29,18 @@ struct CompatibleInfo {
   // macros or attempt to unify Op name with Paddle and CINN.
   static const std::unordered_map<std::string, std::string> OP_NAMES;
 
-  static std::string OpName(const ::ir::Operation& op);
+  static std::string OpName(const ::pir::Operation& op);
 
-  static std::string ValueName(const ::ir::Value& value);
+  static std::string ValueName(const ::pir::Value& value);
 
-  static std::string OpFuncName(const ::ir::Operation& op);
+  static std::string OpFuncName(const ::pir::Operation& op);
 
-  static std::string GroupOpsName(const std::vector<::ir::Operation*>& ops);
+  static std::string GroupOpsName(const std::vector<::pir::Operation*>& ops);
 
-  static std::vector<std::string> InputNames(const ::ir::Operation& op,
+  static std::vector<std::string> InputNames(const ::pir::Operation& op,
                                              bool allow_duplicate = false);
 
-  static std::vector<std::string> OutputNames(const ::ir::Operation& op);
+  static std::vector<std::string> OutputNames(const ::pir::Operation& op);
 };
 
 }  // namespace newir
diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.cc b/paddle/cinn/hlir/framework/new_ir_compiler.cc
index bcc7c0f1c2a05..9172a1d8b052f 100644
--- a/paddle/cinn/hlir/framework/new_ir_compiler.cc
+++ b/paddle/cinn/hlir/framework/new_ir_compiler.cc
@@ -17,8 +17,8 @@
 #include <unordered_map>
 #include "paddle/cinn/hlir/framework/new_ir/utils.h"
 #include "paddle/cinn/utils/attribute_util.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
-#include "paddle/ir/core/builtin_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
+#include "paddle/pir/core/builtin_type.h"
 
 namespace cinn {
 namespace hlir {
@@ -33,7 +33,7 @@ std::unique_ptr<Program> NewIRCompiler::Build() {
   std::vector<GroupPtr> groups;
   for (auto it = program_.block()->begin(); it != program_.block()->end();
        ++it) {
-    std::vector<::ir::Operation*> ops = {*it};
+    std::vector<::pir::Operation*> ops = {*it};
     groups.push_back(std::make_shared<Group>(ops));
   }
   VLOG(4) << "Groups size: " << groups.size();
@@ -123,11 +123,11 @@ std::vector<std::unique_ptr<Instruction>> NewIRCompiler::BuildInstructions(
 }
 
 std::shared_ptr<Scope> BuildScope(const Target& target,
-                                  const ::ir::Program& program) {
-  std::unordered_set<::ir::Value> visited;
+                                  const ::pir::Program& program) {
+  std::unordered_set<::pir::Value> visited;
   auto scope = std::make_shared<Scope>();
 
-  auto create_var = [&](::ir::Value value) {
+  auto create_var = [&](::pir::Value value) {
     if (visited.count(value) > 0) return;
     visited.emplace(value);
diff --git a/paddle/cinn/hlir/framework/new_ir_compiler.h b/paddle/cinn/hlir/framework/new_ir_compiler.h
index bb18da54bc4f3..62c3d97a21a41 100644
--- a/paddle/cinn/hlir/framework/new_ir_compiler.h
+++ b/paddle/cinn/hlir/framework/new_ir_compiler.h
@@ -17,7 +17,7 @@
 #include <memory>
 #include <unordered_map>
 #include "paddle/cinn/common/macros.h"
-#include "paddle/ir/core/program.h"
+#include "paddle/pir/core/program.h"
 
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/hlir/framework/op_lowering.h"
@@ -30,7 +30,7 @@ namespace framework {
 // the co-existence with GraphCompiler.
 class NewIRCompiler final {
  public:
-  NewIRCompiler(const ::ir::Program& prog,
+  NewIRCompiler(const ::pir::Program& prog,
                 const Target& target,
                 const std::shared_ptr<Scope>& scope)
       : program_(prog),
@@ -45,14 +45,14 @@ class NewIRCompiler final {
  private:
   CINN_DISALLOW_COPY_AND_ASSIGN(NewIRCompiler);
 
-  std::vector<ir::LoweredFunc> GetOpFunc(const ::ir::Operation& op, int idx);
+  std::vector<ir::LoweredFunc> GetOpFunc(const ::pir::Operation& op, int idx);
 
   void ProcessFunction(const std::vector<ir::LoweredFunc>& lowered_funcs);
 
   std::vector<std::unique_ptr<Instruction>> BuildInstructions(
       const std::vector<GroupPtr>& groups);
 
-  const ::ir::Program& program_;
+  const ::pir::Program& program_;
   ir::Module::Builder m_builder_;
   std::unique_ptr<backends::Compiler> compiler_{nullptr};
   Target target_;
@@ -60,7 +60,7 @@ class NewIRCompiler final {
   std::unordered_map<std::string, std::string> func_names_;
 };
 
-std::shared_ptr<Scope> BuildScope(const Target&, const ::ir::Program&);
+std::shared_ptr<Scope> BuildScope(const Target&, const ::pir::Program&);
 
 }  // namespace framework
 }  // namespace hlir
diff --git a/paddle/cinn/ir/lowered_func.cc b/paddle/cinn/ir/lowered_func.cc
index 84e8fb3e974e7..5a897e7c334a5 100644
--- a/paddle/cinn/ir/lowered_func.cc
+++ b/paddle/cinn/ir/lowered_func.cc
@@ -27,7 +27,6 @@
 #include "paddle/cinn/ir/buffer.h"
 #include "paddle/cinn/ir/utils/ir_printer.h"
 #include "paddle/cinn/ir/utils/ir_visitor.h"
-#include "paddle/cinn/optim/tensor_write_tell.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 #include "paddle/cinn/utils/string.h"
 
@@ -209,8 +208,7 @@ void _LoweredFunc_::AllocTempBuffer() {}
 void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) {
   buffer_data_cast_exprs.clear();
   // collect write.
-  optim::TensorWriteTeller write_teller;
-  write_teller.Collect(&body);
+  auto write_teller = ir::CollectTensorNeedsWrite(&body);
 
   auto tensors = CollectAllTensorReference(with_expr_gen_tensor);
   std::sort(tensors.begin(),
@@ -224,7 +222,7 @@ void _LoweredFunc_::PrepareBufferCastExprs(bool with_expr_gen_tensor) {
     if (!tensor->buffer.defined()) continue;
 
     Type value_type = tensor->type().ElementOf();
-    bool is_const = !write_teller.IsWrite(tensor->name);
+    bool is_const = !write_teller.count(tensor->name);
     value_type.set_cpp_handle();
     value_type.set_cpp_const(is_const);
     Var variable = _Var_::Make(tensor->name, value_type);
@@ -250,8 +248,7 @@ std::vector<Expr> _LoweredFunc_::CudaAliasVarExprs() const {
   }
   // collect write.
std::vector res; - optim::TensorWriteTeller write_teller; - write_teller.Collect(&body); + auto write_teller = ir::CollectTensorNeedsWrite(&body); auto tensors = CollectAllTensorReference(); std::sort(tensors.begin(), @@ -269,7 +266,7 @@ std::vector _LoweredFunc_::CudaAliasVarExprs() const { continue; } Type value_type = tensor->type().ElementOf(); - bool is_const = !write_teller.IsWrite(tensor->name); + bool is_const = !write_teller.count(tensor->name); value_type.set_cpp_handle(); value_type.set_cpp_const(is_const); Var variable = _Var_::Make(tensor->name, value_type); diff --git a/paddle/cinn/ir/operation.cc b/paddle/cinn/ir/operation.cc index 44b1af64fe6b0..9dff3b5e0a5f9 100644 --- a/paddle/cinn/ir/operation.cc +++ b/paddle/cinn/ir/operation.cc @@ -49,10 +49,12 @@ Operation ComputeOp::Make(const std::string &name, n->reduce_axis = reduce_axis; n->tag = tag; n->attrs = attrs; - auto axis = common::GenDefaultAxis(domain.size()); - std::vector _axis; - for (auto &x : axis) _axis.push_back(x); - n->body = {handle(_axis)}; + n->axis = common::GenDefaultAxis(domain.size()); + std::vector tmp_axis; + for (auto &x : n->axis) { + tmp_axis.push_back(x); + } + n->body = {handle(tmp_axis)}; n->reduce_axis = reduce_axis; return Operation(n); } diff --git a/paddle/cinn/ir/operation.h b/paddle/cinn/ir/operation.h index 651c2a9a9dc5c..cdc5175830e38 100644 --- a/paddle/cinn/ir/operation.h +++ b/paddle/cinn/ir/operation.h @@ -105,6 +105,8 @@ struct BufferShareOp : public _Operation_ { */ struct ComputeOp : public _Operation_ { using handle_t = std::function &)>; + //! Var on each dimension + std::vector axis; //! Var on each reduction axis, if the body is a Reduction. std::vector reduce_axis; //! Shape of the output. diff --git a/paddle/cinn/ir/tensor.cc b/paddle/cinn/ir/tensor.cc index 2bfa6ee7737ef..7631141d115cd 100644 --- a/paddle/cinn/ir/tensor.cc +++ b/paddle/cinn/ir/tensor.cc @@ -16,6 +16,7 @@ #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/common/arithmatic.h" #include "paddle/cinn/common/axis.h" @@ -250,6 +251,11 @@ Expr *_Tensor_::mutable_body() { CINN_NOT_IMPLEMENTED } +ir::Tensor _Tensor_::InitReduction( + ast_gen_ius::TensorGroup *tensor_group) const { + return tensor_group->MarkReduceInit(this->name); +} + ir::Tensor _Tensor_::InitReduction(poly::StageMap stages, const Target &target) const { CHECK(contains_reduce_axis()) diff --git a/paddle/cinn/ir/tensor.h b/paddle/cinn/ir/tensor.h index 8879e35afa98d..fd8e79f73ffdd 100644 --- a/paddle/cinn/ir/tensor.h +++ b/paddle/cinn/ir/tensor.h @@ -25,6 +25,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/common/graph_utils.h" #include "paddle/cinn/ir/buffer.h" #include "paddle/cinn/ir/function_base.h" @@ -33,28 +34,13 @@ namespace cinn { -namespace ir { -class Tensor; -} // namespace ir - -namespace lang { -template -struct Placeholder; - -void InitReduceTensor(poly::StageMap stages, - const ir::Tensor& tensor, - const Target& target = common::DefaultHostTarget()); -} // namespace lang +namespace ast_gen_ius { +class TensorGroup; +} // namespace ast_gen_ius namespace ir { -namespace detail { -constexpr bool LE(int a, int b) { return a <= b; } -constexpr bool GE(int a, int b) { return a >= b; } - -} // namespace detail class _Tensor_; -class Tensor; class Tensor : public ir::IrNodeRef { public: @@ -84,8 +70,8 @@ class Tensor : public ir::IrNodeRef { return operator()(std::vector({a})); } template - inline typename std::enable_if::type - 
operator()(Args&&... args) const { + inline typename std::enable_if= 2, Expr>::type operator()( + Args&&... args) const { return operator()({std::forward(args)...}); } // @} @@ -288,11 +274,7 @@ class _Tensor_ : public ExprNode<_Tensor_> { poly::StageMap stages, const Target& target = common::DefaultHostTarget()) const; - private: - //! Initialize the axis field after the shape field is assigned. - void InitAxis() const; - - isl::set GenerateIslDomain() const; + ir::Tensor InitReduction(ast_gen_ius::TensorGroup* tensor_group) const; /** * Create the initialization tensor. @@ -304,15 +286,17 @@ class _Tensor_ : public ExprNode<_Tensor_> { poly::StageMap stages, const Target& target = common::DefaultHostTarget()) const; + private: + //! Initialize the axis field after the shape field is assigned. + void InitAxis() const; + + isl::set GenerateIslDomain() const; + //! The names of the tensors depend the same buffer and should schedule before //! this. std::set buffer_depended_tensor_names_; friend Shared CreateStage(Tensor tensor); - - friend void lang::InitReduceTensor(poly::StageMap stages, - const ir::Tensor& tensor, - const Target& target); }; Shared CreateStage(Tensor tensor); diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index e99da88a1dd35..d44c3701b5ac2 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -207,5 +207,116 @@ std::set CollectReferencedTensors( return ts0; } +std::vector CollectUndefinedVars(const Expr* e) { + struct Mutator : public ir::IRMutator { + using ir::IRMutator::Visit; + std::vector undefined_vars; + std::set defined_vars; + std::set used_vars; + + void CollectVarDef(const std::string& var) { + CHECK(!defined_vars.count(var)) + << "var " << var << " has been defined, please check"; + CHECK(!used_vars.count(var)) + << "var " << var << " is wrongly used before definition"; + defined_vars.insert(var); + } + + void ClearVar(const std::string& var) { + defined_vars.erase(var); + used_vars.erase(var); + } + + void CollectVarUse(const std::string& var) { + used_vars.insert(var); + if (defined_vars.count(var) == 0) { + undefined_vars.push_back(var); + } + } + + void Visit(const ir::Let* op, const Expr* expr) override { + Expr symbol = op->symbol; + auto var = symbol.as_var_ref(); + CHECK(var.defined()); + CollectVarDef(var->name); + auto* node = expr->As(); + Visit(&node->body, &node->body); + } + + void Visit(const ir::For* op, const Expr* expr) override { + CollectVarDef(op->loop_var->name); + auto* node = expr->As(); + Visit(&node->min, &node->min); + Visit(&node->extent, &node->extent); + Visit(&node->body, &node->body); + ClearVar(op->loop_var->name); + } + + void Visit(const ir::Load* op, const Expr* expr) override { + auto tensor = op->tensor.as_tensor_ref(); + CollectVarUse(tensor->name); + auto* node = expr->As(); + for (auto& idx : node->indices) Visit(&idx, &idx); + } + + void Visit(const ir::Store* op, const Expr* expr) override { + auto tensor = op->tensor.as_tensor_ref(); + CollectVarUse(tensor->name); + auto* node = expr->As(); + for (auto& idx : node->indices) Visit(&idx, &idx); + Visit(&node->value, &node->value); + } + + void Visit(const ir::_Var_* op, const Expr* expr) override { + CollectVarUse(op->name); + auto* node = expr->As(); + if (node->lower_bound.defined()) { + Visit(&node->lower_bound, &node->lower_bound); + } + if (node->upper_bound.defined()) { + Visit(&node->upper_bound, &node->upper_bound); + } + } + + void Visit(const 
ir::Reduce* op, const Expr* expr) override { + for (auto& axis : op->reduce_axis) { + CollectVarDef(axis->name); + } + auto* node = expr->As(); + if (node->init.defined()) Visit(&node->init, &node->init); + Visit(&node->body, &node->body); + } + }; + + Mutator mutator; + mutator.Visit(e, e); + return mutator.undefined_vars; +} + +std::set CollectTensorNeedsWrite(const Expr* e) { + std::set tensor_written; + IrNodesCollector::handler_t handler = [&](const Expr* x) { + if (x->As()) { + tensor_written.insert( + x->As()->tensor.As()->name); + } + if (x->As()) { + tensor_written.insert(x->As()->name); + } + }; + IrNodesCollector::teller_t teller = [](const Expr* x) { + if (x->As() && x->As()->tensor.As()) { + return true; + } + if (x->As() && x->As()->is_call_node()) { + return true; + } + return false; + }; + IrNodesCollector collector(std::move(teller), std::move(handler), false); + collector.Visit(e); + return tensor_written; +} + } // namespace ir } // namespace cinn diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.h b/paddle/cinn/ir/utils/ir_nodes_collector.h old mode 100755 new mode 100644 index 75ed3fa9e64f4..0f8a390e1ade7 --- a/paddle/cinn/ir/utils/ir_nodes_collector.h +++ b/paddle/cinn/ir/utils/ir_nodes_collector.h @@ -65,5 +65,24 @@ std::map CollectTensorMap( return true; }); + +/** + * Collect undefined vars in the scope. + * + * e.g. + * + * The expression: + * for i + * for j + * a[i, j] = b[i, j] + * + * here a, b are vars without definition + */ +std::vector CollectUndefinedVars(const Expr* e); + +/** + * Collect the tensor nodes that will be written by Store or Call nodes. + */ +std::set CollectTensorNeedsWrite(const Expr* e); + } // namespace ir } // namespace cinn diff --git a/paddle/cinn/lang/CMakeLists.txt b/paddle/cinn/lang/CMakeLists.txt index f4ef9e6d7b103..62d91e8103c7a 100644 --- a/paddle/cinn/lang/CMakeLists.txt +++ b/paddle/cinn/lang/CMakeLists.txt @@ -7,6 +7,8 @@ gather_srcs( compute.cc placeholder.cc lower.cc + lower_impl.cc + lower_tensor_group.cc builtin.cc lower_impl.cc packed_func.cc) diff --git a/paddle/cinn/lang/lower.cc b/paddle/cinn/lang/lower.cc old mode 100755 new mode 100644 index 1661f65975c8f..667c0646c43cd --- a/paddle/cinn/lang/lower.cc +++ b/paddle/cinn/lang/lower.cc @@ -24,12 +24,14 @@ #include "paddle/cinn/ir/buffer.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/lang/lower_impl.h" +#include "paddle/cinn/lang/lower_tensor_group.h" #include "paddle/cinn/optim/optimize.h" #include "paddle/cinn/utils/string.h" namespace cinn { namespace lang { +using ast_gen_ius::TensorGroup; using ir::Tensor; using poly::Stage; @@ -84,6 +86,49 @@ std::vector GetArgs( return res; } +//! Collect the temporary tensors from a computational graph. +std::vector GetTempBuffers(const std::vector& tensor_args, + const TensorGroup& tensor_group, + Expr body) { + std::unordered_set tensor_arg_names; + std::unordered_set buffer_arg_names; + for (auto& tensor : tensor_args) { + tensor_arg_names.insert(tensor->name); + if (tensor->buffer.defined()) { + buffer_arg_names.insert(tensor->buffer->name); + } + } + std::map + name_to_buffer; // used to avoid duplication.
+ + auto all_temp_tensors = + ir::CollectIRNodesWithoutTensor(body, [&](const Expr* x) { + return x->as_tensor() && x->as_tensor()->buffer.defined() && + (!tensor_group.Contain(x->as_tensor()->name) && + ((!buffer_arg_names.count(x->as_tensor()->buffer->name) && + !tensor_arg_names.count(x->as_tensor()->name)) || + utils::Endswith(x->as_tensor()->buffer->name, "temp_buffer"))); + }); + for (auto& e : all_temp_tensors) { + auto buffer_name = e.as_tensor()->buffer->name; + if (!name_to_buffer.count(buffer_name)) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } else { + // Behavior kept from the old implementation: when two tensors share a + // buffer name, keep the smaller buffer. + if (e.as_tensor()->buffer->numel() < + name_to_buffer[buffer_name]->numel()) { + name_to_buffer[buffer_name] = e.as_tensor()->buffer; + } + } + } + + std::vector temp_buffers; + for (auto& i : name_to_buffer) { + temp_buffers.push_back(i.second); + } + return temp_buffers; +} + //! Collect the temporary tensors from a computational graph. std::vector GetTempBuffers(const std::vector& tensor_args, const poly::StageMap& stage_map, @@ -198,6 +243,25 @@ std::set CollectTempTensorsFromCtrlDepends( return res; } +void InitReduceTensor(TensorGroup* tensor_group, + const Tensor& tensor, + const Target& target) { + if (tensor->is_reduce_tensor()) { + tensor_group->MarkReduceInit(tensor->name); + } + auto uninited_reduce_tensors = + ir::CollectIRNodes(tensor->body(), [&](const Expr* x) { + return x && x->defined() && x->as_tensor() && + x->as_tensor()->is_reduce_tensor() && + !tensor_group->HasMarkedReduceInit(x->as_tensor()->name); + }); + for (auto& t : uninited_reduce_tensors) { + std::string reduce_name = t.as_tensor()->name; + VLOG(3) << "Init reduce tensor: " << reduce_name; + tensor_group->MarkReduceInit(reduce_name); + } +} + void InitReduceTensor(StageMap stages, const Tensor& tensor, const Target& target) { @@ -216,6 +280,63 @@ void InitReduceTensor(StageMap stages, } } +std::set CollectTempTensorsFromCtrlDepends( + ast_gen_ius::TensorGroup* tensor_group, + const std::vector& tensor_args) { + std::set res; + for (const ir::Tensor& a : tensor_group->GetAllTensors()) { + for (const ir::Tensor& t : tensor_group->GetCrtlDepTensors(a->name)) { + res.emplace(t); + } + } + for (const ir::Tensor& t : tensor_args) { + if (res.count(t)) { + res.erase(t); + } + } + return res; +} + +ir::LoweredFunc LowerToAst(const std::string& name, + const std::vector& tensor_args, + ast_gen_ius::TensorGroup* tensor_group, + const Target& target) { + // Init the reduce tensors first before any process.
+ for (auto& t : tensor_args) { + InitReduceTensor(tensor_group, t, target); + } + // Merge the ctrl_deps with the given temp_tensors and get the new temp_tensors + std::set ctrl_deps = + CollectTempTensorsFromCtrlDepends(tensor_group, tensor_args); + std::vector group_vec = {tensor_group}; + auto lower_instance = detail::LowerTensorGroup( + name, + tensor_args, + {}, + group_vec, + std::vector(ctrl_deps.begin(), ctrl_deps.end()), + target); + std::vector result = lower_instance(); + for (auto& res : result) { + if (target == common::DefaultNVGPUTarget()) { + res->device_api = ir::DeviceAPI::GPU; + } + } + return result[0]; +} + +std::vector LowerToAstVec( + const std::string& name, + const std::vector& tensor_args, + std::vector tensor_groups, + const Target& target) { + std::vector ret; + for (ast_gen_ius::TensorGroup* tg : tensor_groups) { + ret.push_back(LowerToAst(name, tensor_args, tg, target)); + } + return ret; +} + ir::LoweredFunc Lower(const std::string& name, StageMap stages, const std::vector& tensor_args, diff --git a/paddle/cinn/lang/lower.h b/paddle/cinn/lang/lower.h index af8a186583a69..c80d4bc769cdf 100644 --- a/paddle/cinn/lang/lower.h +++ b/paddle/cinn/lang/lower.h @@ -20,6 +20,7 @@ #include #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/lowered_func.h" #include "paddle/cinn/ir/module.h" @@ -73,6 +74,22 @@ std::vector LowerVec( const Target &target = common::DefaultHostTarget(), bool support_ir_schedule = false); +ir::LoweredFunc LowerToAst(const std::string &name, + const std::vector &tensor_args, + ast_gen_ius::TensorGroup *tensor_group, + const Target &target = common::DefaultHostTarget()); + +std::vector LowerToAstVec( + const std::string &name, + const std::vector &tensor_args, + std::vector tensor_groups, + const Target &target = common::DefaultHostTarget()); + +std::vector GetTempBuffers( + const std::vector &tensor_args, + const ast_gen_ius::TensorGroup &tensor_group, + Expr body); + std::vector GetArgs( const Expr &func_body, const std::vector &input_output_nodes); diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc index f313d52938a93..629b405dcd2f0 100644 --- a/paddle/cinn/lang/lower_impl.cc +++ b/paddle/cinn/lang/lower_impl.cc @@ -25,7 +25,7 @@ #include "paddle/cinn/ir/ir_base.h" #include "paddle/cinn/ir/tensor.h" #include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/optim/remove_nested_block.h" +#include "paddle/cinn/optim/ir_simplify.h" #include "paddle/cinn/optim/replace_var_with_expr.h" #include "paddle/cinn/optim/transform_polyfor_to_for.h" #include "paddle/cinn/poly/stage.h" @@ -342,8 +342,7 @@ std::vector LowerImpl::GenerateFunctionArgumentList( CheckArgsUnique(); std::vector args; - optim::TensorWriteTeller teller; - teller.Collect(&fn_body); + auto teller = ir::CollectTensorNeedsWrite(&fn_body); std::set arg_names; @@ -358,7 +357,7 @@ std::vector LowerImpl::GenerateFunctionArgumentList( for (auto& tensor : tensor_args_) { auto* tensor_node = tensor.As(); - bool is_output = teller.IsWrite(tensor->name); + bool is_output = teller.count(tensor->name); VLOG(1) << "tensor argument " << tensor->name << " buffer " << tensor->buffer->name; @@ -396,8 +395,7 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( std::vector in_args; std::vector out_args; - optim::TensorWriteTeller teller; - teller.Collect(&func_iterator); + auto teller = ir::CollectTensorNeedsWrite(&func_iterator); std::set arg_names; std::set all_tensor_names; @@ -448,7 +446,7 @@ std::vector
LowerImpl::GenFuncArgForSplitKernel( VLOG(3) << "In tensor_args_, it has : " << tensor->name; if (temp_tensor_names.count(tensor->name) > 0) continue; if (all_tensor_names.count(tensor->name) == 0) continue; - bool is_output = teller.IsWrite(tensor->name); + bool is_output = teller.count(tensor->name); VLOG(3) << "tensor argument " << tensor->name << " buffer " << tensor->buffer->name; @@ -485,7 +483,7 @@ std::vector LowerImpl::GenFuncArgForSplitKernel( VLOG(3) << "Tensor " << tensor->name; if (tensor->buffer.defined() && !arg_names.count(tensor->buffer->name)) { bool is_output = - teller.IsWrite(tensor->name) && teller.IsWrite(tensor->name); + teller.count(tensor->name) && teller.count(tensor->name); if (is_output) out_args.emplace_back(tensor->buffer, ir::Argument::IO::kOutput); } @@ -655,7 +653,7 @@ std::vector LowerImpl::operator()() { if (support_ir_schedule_) { optim::TransformPolyForToFor(&func->body); - optim::RemoveNestedBlock(&func->body); + optim::SimplifyBlocks(&func->body); func->body = ir::Block::Make({func->body}); result.push_back(ir::LoweredFunc(func.get())); num_func++; diff --git a/paddle/cinn/lang/lower_impl.h b/paddle/cinn/lang/lower_impl.h index 3e52279b19566..c5bfdfb1fb74d 100644 --- a/paddle/cinn/lang/lower_impl.h +++ b/paddle/cinn/lang/lower_impl.h @@ -27,14 +27,13 @@ #include "paddle/cinn/common/graph_utils.h" #include "paddle/cinn/ir/buffer.h" +#include "paddle/cinn/ir/utils/ir_mutator.h" #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/optim/buffer_assign.h" #include "paddle/cinn/optim/compute_inline_expand.h" #include "paddle/cinn/optim/fold_cinn_call_arguments.h" #include "paddle/cinn/optim/optimize.h" -#include "paddle/cinn/optim/remove_nested_block.h" #include "paddle/cinn/optim/replace_call_with_expr.h" -#include "paddle/cinn/optim/tensor_write_tell.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" #include "paddle/cinn/optim/transform_polyfor_to_for.h" #include "paddle/cinn/poly/ast_gen.h" diff --git a/paddle/cinn/lang/lower_tensor_group.cc b/paddle/cinn/lang/lower_tensor_group.cc new file mode 100644 index 0000000000000..6fb8e72f43c68 --- /dev/null +++ b/paddle/cinn/lang/lower_tensor_group.cc @@ -0,0 +1,215 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/cinn/lang/lower_tensor_group.h" + +#include +#include +#include +#include + +#include "paddle/cinn/ast_gen_ius/ast_gen.h" +#include "paddle/cinn/ast_gen_ius/tensor_group.h" +#include "paddle/cinn/common/common.h" +#include "paddle/cinn/common/context.h" +#include "paddle/cinn/common/ir_util.h" +#include "paddle/cinn/ir/ir_base.h" +#include "paddle/cinn/ir/tensor.h" +#include "paddle/cinn/ir/utils/ir_mutator.h" +#include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/optim/ir_simplify.h" +#include "paddle/cinn/optim/replace_var_with_expr.h" +#include "paddle/cinn/optim/transform_polyfor_to_for.h" +#include "paddle/cinn/poly/stage.h" + +namespace cinn { +namespace lang { +namespace detail { + +LowerTensorGroup::LowerTensorGroup( + const std::string& fn_name, + const std::vector& tensor_args, + const std::vector& scalar_args, + const std::vector& tensor_groups, + const std::vector& temp_tensor_args, + const Target& target) + : fn_name_(fn_name), + tensor_args_(tensor_args), + scalar_args_(scalar_args), + tensor_groups_(tensor_groups), + temp_tensor_args_(temp_tensor_args), + target_(target) {} + +std::vector LowerTensorGroup::operator()() { + std::vector result; + int num_func = 0; + for (ast_gen_ius::TensorGroup* tensor_group : tensor_groups_) { + // 1. Generate function body + ir::Expr func_body = GenerateFunctionBody(tensor_group); + // 2. Assign buffer to tensors + auto tensor_map = tensor_group->AllocateBuffers(); + // Copy the tensor (with buffer assigned) back to the function's args. + for (auto& arg : tensor_args_) { + if (arg->is_placeholder_node() || arg->buffer.defined()) { + continue; + } + if (arg->body().As() && arg->body().type().is_void()) { + continue; // extern call + } + + if (tensor_map.find(arg->name) == tensor_map.end()) { + LOG(INFO) << "Didn't find arg tensor " << arg->name + << " in tensor_map.\n" + << "The function is " << fn_name_ + << "\nAnd all the arg tensors are:\n"; + for (auto& i : tensor_args_) { + LOG(INFO) << i->name; + } + LOG(FATAL) << "Fatal Error!"; + } + Reference(&arg)->buffer = tensor_map.at(arg->name)->buffer; + } + + // 3. Collect temp tensor buffers + std::set temp_tensor_names; + for (auto& t : temp_tensor_args_) { + temp_tensor_names.insert(t->name); + } + + // Some store tensors are also temp tensors. + auto store_exprs = ir::CollectIRNodes( + func_body, [](const Expr* x) { return x->As(); }); + for (auto& expr : store_exprs) { + auto* store_node = expr.As(); + CHECK(store_node); + auto* tensor = store_node->tensor.As(); + CHECK(tensor); + VLOG(3) << "In store_exprs, its name is : " << tensor->name; + CHECK(tensor->buffer.defined()); + if (tensor->buffer->memory_type != ir::MemoryType::Heap) { + temp_tensor_names.insert(store_node->tensor.as_tensor_ref()->name); + } + } + + std::vector temp_buffers; + std::unordered_set buffer_name_set; + for (const std::string& name : temp_tensor_names) { + if (!tensor_map.count(name)) { + continue; + } + ir::Tensor& t = tensor_map[name]; + if (t->buffer.defined() && !buffer_name_set.count(t->buffer->name)) { + temp_buffers.push_back(t->buffer); + buffer_name_set.insert(t->buffer->name); + } + } + + // 4. Handle function args + std::vector func_args = + GenerateFunctionArgumentList(func_body); + + // 5. Actual function make + std::string actual_fn_name = fn_name_; + if (num_func > 0) { + actual_fn_name += "_" + std::to_string(num_func); + VLOG(3) << "Making func :" << actual_fn_name; + } + for (auto& i : func_args) { + VLOG(3) << "func_args is : " << i.name(); + } + for (auto& i : temp_buffers) { + VLOG(3) << "temp_buffers is : " << i->name; + } + ir::LoweredFunc func = ir::_LoweredFunc_::Make( + actual_fn_name, func_args, func_body, temp_buffers); + + // 6. Final clean up + optim::SimplifyBlocks(&func->body); + func->body = ir::Block::Make({func->body}); + result.push_back(ir::LoweredFunc(func.get())); + num_func++; + } + return result; +} + +std::vector LowerTensorGroup::GenerateFunctionArgumentList( + Expr fn_body) { + std::vector args; + auto teller = ir::CollectTensorNeedsWrite(&fn_body); + + std::set arg_names; + + for (auto& scalar : scalar_args_) { + CHECK(!arg_names.count(scalar->name)); + auto* scalar_node = scalar.As(); + CHECK(scalar_node->type().valid()); + arg_names.insert(scalar->name); + + args.emplace_back(scalar, ir::Argument::IO::kInput); + } + + for (auto& tensor : tensor_args_) { + auto* tensor_node = tensor.As(); + bool is_output = teller.count(tensor->name); + VLOG(6) << "tensor argument " << tensor->name << ", buffer " + << tensor->buffer->name << ", is output: " << is_output; + + // avoid duplicate + if (!tensor_node->buffer.defined()) { + continue; + } + // If an argument is already marked as kInput, mark it as kOutput and move + // it to the back. + if (arg_names.count(tensor_node->buffer->name)) { + auto it = + std::find_if(args.begin(), args.end(), [&](const ir::Argument& x) { + return x.name() == tensor_node->buffer->name; + }); + CHECK(it != args.end()); + if (it->is_input()) { + args.erase(it); + } else if (it->is_output()) { + continue; + } + } + + arg_names.insert(tensor_node->buffer->name); + + auto io = is_output ? ir::Argument::IO::kOutput : ir::Argument::IO::kInput; + VLOG(6) << "Collect " << (is_output ? "W" : "R") << " argument " + << tensor->buffer->name; + args.emplace_back(tensor_node->buffer, io); + } + + return args; +} + +ir::Expr LowerTensorGroup::GenerateFunctionBody( + ast_gen_ius::TensorGroup* tensor_group) { + std::vector ordered_tensors = + tensor_group->GetGenFuncTopoOrder(tensor_args_); + std::vector bodies; + for (const ir::Tensor& tensor : ordered_tensors) { + bodies.emplace_back(ast_gen_ius::AstGen::Build(tensor)); + } + if (bodies.size() == 1) { + return bodies[0]; + } + + return ir::Block::Make(bodies); +} + +} // namespace detail +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/lower_tensor_group.h b/paddle/cinn/lang/lower_tensor_group.h new file mode 100644 index 0000000000000..ce7f1f7c7cdc9 --- /dev/null +++ b/paddle/cinn/lang/lower_tensor_group.h @@ -0,0 +1,74 @@ +// Copyright (c) 2023 CINN Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/cinn/ast_gen_ius/tensor_group.h" +#include "paddle/cinn/common/graph_utils.h" +#include "paddle/cinn/ir/buffer.h" +#include "paddle/cinn/ir/utils/ir_printer.h" +#include "paddle/cinn/optim/buffer_assign.h" +#include "paddle/cinn/optim/compute_inline_expand.h" +#include "paddle/cinn/optim/fold_cinn_call_arguments.h" +#include "paddle/cinn/optim/optimize.h" +#include "paddle/cinn/optim/replace_call_with_expr.h" +#include "paddle/cinn/optim/transform_gpu_forloop.h" +#include "paddle/cinn/optim/transform_polyfor_to_for.h" +#include "paddle/cinn/poly/ast_gen.h" + +namespace cinn { +namespace lang { +namespace detail { + +class LowerTensorGroup { + public: + LowerTensorGroup(const std::string& fn_name, + const std::vector& tensor_args, + const std::vector& scalar_args, + const std::vector& tensor_groups, + const std::vector& temp_tensor_args = {}, + const Target& target = common::DefaultHostTarget()); + + std::vector operator()(); + + ir::Expr GenerateFunctionBody(ast_gen_ius::TensorGroup* tensor_group); + + std::vector GenerateFunctionArgumentList(ir::Expr fn_body); + + private: + const std::string& fn_name_; + const std::vector& tensor_args_; + const std::vector& scalar_args_; + std::vector temp_tensor_args_; + std::vector tensor_groups_; + Target target_; + + //! CUDA axis info for this function. + std::vector cuda_axis_info_; +}; + +} // namespace detail +} // namespace lang +} // namespace cinn diff --git a/paddle/cinn/lang/lower_test.cc b/paddle/cinn/lang/lower_test.cc index 14f81090e30cb..431d73d075be6 100755 --- a/paddle/cinn/lang/lower_test.cc +++ b/paddle/cinn/lang/lower_test.cc @@ -18,6 +18,7 @@ #include +#include "paddle/cinn/ast_gen_ius/tensor_group.h" #include "paddle/cinn/cinn.h" #include "paddle/cinn/lang/buffer.h" #include "paddle/cinn/lang/compute.h" @@ -27,6 +28,10 @@ namespace cinn { namespace lang { +#define TEST_SOUTPUT(x, out) \ + LOG(INFO) << "\n" << x << std::endl; \ + EXPECT_EQ(utils::GetStreamCnt(x), utils::Trim(out)); + TEST(lower, basic) { auto M = Expr(100); auto N = Expr(15); @@ -42,10 +47,6 @@ TEST(lower, basic) { LOG(INFO) << "lower_size " << lower_funcs; -#define TEST_SOUTPUT(x, out) \ - std::cout << "\n" << x << std::endl; \ - EXPECT_EQ(utils::GetStreamCnt(x), utils::Trim(out)); - auto out = R"ROC( { serial for (i, 0, 100) @@ -77,7 +78,7 @@ TEST(lower, more_complex) { auto lower_funcs = Lower("cal_C", stages, {A, B, C}); - std::cout << "func:\n" << Expr(lower_funcs->self()) << std::endl; + LOG(INFO) << "func:\n" << Expr(lower_funcs->self()) << std::endl; } //! To support training, the dynamic shape support is vital. 
We test the @@ -157,5 +158,34 @@ TEST(lower, temp_buffer_collects) { } } +TEST(lower_to_ast, basic) { + auto M = Expr(100); + auto N = Expr(15); + + Placeholder A("A", {Expr(M), Expr(N)}); + + ir::Tensor B = Compute( + {M, N}, [=](Var i, Var j) -> Expr { return A(i, j) + 1.f; }, "B"); + + ast_gen_ius::TensorGroup tensor_group({B}); + + auto lower_funcs = LowerToAst("cal_B", {A, B}, &tensor_group); + + LOG(INFO) << "lower_func " << lower_funcs; + + auto out = R"ROC( +{ + serial for (i, 0, 100) + { + serial for (j, 0, 15) + { + B[i, j] = (A[i, j] + 1.00000000f) + } + } +} +)ROC"; + TEST_SOUTPUT(lower_funcs->body, out); +} + } // namespace lang } // namespace cinn diff --git a/paddle/cinn/optim/CMakeLists.txt b/paddle/cinn/optim/CMakeLists.txt index 99ae9cf3bd3d6..1b4a55479ef0b 100755 --- a/paddle/cinn/optim/CMakeLists.txt +++ b/paddle/cinn/optim/CMakeLists.txt @@ -3,11 +3,9 @@ core_gather_headers() gather_srcs( cinnapi_src SRCS - remove_nested_block.cc replace_call_with_expr.cc ir_replace.cc replace_var_with_expr.cc - tensor_write_tell.cc ir_simplify.cc optimize.cc vectorize_loops.cc @@ -25,7 +23,6 @@ gather_srcs( replace_const_param_to_integer.cc lower_intrin.cc cast_bool_to_int8.cc - collect_undefined_vars.cc var_mod_simplify.cc remove_schedule_block.cc) @@ -33,8 +30,6 @@ if(WITH_CUDA) gather_srcs(cinnapi_src SRCS transform_gpu_forloop.cc) endif() -cinn_cc_test(test_remove_nested_block SRCS remove_nested_block_test.cc DEPS - cinncore) cinn_cc_test(test_ir_simplify SRCS ir_simplify_test.cc DEPS cinncore) cinn_cc_test(test_replace_call_with_expr SRCS replace_call_with_expr_test.cc DEPS cinncore) diff --git a/paddle/cinn/optim/collect_undefined_vars.cc b/paddle/cinn/optim/collect_undefined_vars.cc deleted file mode 100644 index 2f925d1333f39..0000000000000 --- a/paddle/cinn/optim/collect_undefined_vars.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) 2021 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/optim/collect_undefined_vars.h" - -#include - -#include "paddle/cinn/ir/utils/ir_mutator.h" - -namespace cinn::optim { - -namespace { -struct Mutator : public ir::IRMutator<> { - using ir::IRMutator<>::Visit; - std::vector undefined_vars; - std::set defined_vars; - std::set used_vars; - - void CollectVarDef(const std::string& var) { - CHECK(!defined_vars.count(var)) - << "var " << var << " has been defined, please check"; - CHECK(!used_vars.count(var)) - << "var " << var << " is wrongly used before definition"; - defined_vars.insert(var); - } - - void ClearVar(const std::string& var) { - defined_vars.erase(var); - used_vars.erase(var); - } - - void CollectVarUse(const std::string& var) { - used_vars.insert(var); - if (defined_vars.count(var) == 0) { - undefined_vars.push_back(var); - } - } - - void Visit(const ir::Let* op, Expr* expr) final { - Expr symbol = op->symbol; - auto var = symbol.as_var_ref(); - CHECK(var.defined()); - CollectVarDef(var->name); - auto* node = expr->As(); - Visit(&node->body, &node->body); - } - - void Visit(const ir::For* op, Expr* expr) final { - CollectVarDef(op->loop_var->name); - auto* node = expr->As(); - Visit(&node->min, &node->min); - Visit(&node->extent, &node->extent); - Visit(&node->body, &node->body); - ClearVar(op->loop_var->name); - } - - void Visit(const ir::Load* op, Expr* expr) final { - auto tensor = op->tensor.as_tensor_ref(); - CollectVarUse(tensor->name); - auto* node = expr->As(); - for (auto& idx : node->indices) Visit(&idx, &idx); - } - - void Visit(const ir::Store* op, Expr* expr) final { - auto tensor = op->tensor.as_tensor_ref(); - CollectVarUse(tensor->name); - auto* node = expr->As(); - for (auto& idx : node->indices) Visit(&idx, &idx); - Visit(&node->value, &node->value); - } - - void Visit(const ir::_Var_* op, Expr* expr) final { - CollectVarUse(op->name); - auto* node = expr->As(); - if (node->lower_bound.defined()) { - Visit(&node->lower_bound, &node->lower_bound); - } - if (node->upper_bound.defined()) { - Visit(&node->upper_bound, &node->upper_bound); - } - } - - void Visit(const ir::Reduce* op, Expr* expr) final { - for (auto& axis : op->reduce_axis) { - CollectVarDef(axis->name); - } - auto* node = expr->As(); - if (node->init.defined()) Visit(&node->init, &node->init); - Visit(&node->body, &node->body); - } -}; -} // namespace - -std::vector CollectUndefinedVars(Expr* e) { - Mutator mutator; - mutator.Visit(e, e); - return mutator.undefined_vars; -} - -} // namespace cinn::optim diff --git a/paddle/cinn/optim/collect_undefined_vars.h b/paddle/cinn/optim/collect_undefined_vars.h deleted file mode 100644 index b83620fcc1cb0..0000000000000 --- a/paddle/cinn/optim/collect_undefined_vars.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2021 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include - -#include "paddle/cinn/ir/ir.h" -namespace cinn::optim { - -/** - * Collect undefined vars in the scope. - * - * e.g. 
- * - * The expression: - * for i - * for j - * a[i, j] = b[i, j] - * - * here a, b are vars without definition - */ -std::vector CollectUndefinedVars(Expr* e); - -} // namespace cinn::optim diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc index bfed498da521d..6cf3fcf4b7be8 100644 --- a/paddle/cinn/optim/ir_simplify.cc +++ b/paddle/cinn/optim/ir_simplify.cc @@ -305,6 +305,50 @@ struct SimplifyBlocksMutator : public ir::IRMutator<> { expr->As()->stmts = stmts; } } + + void Visit(const IfThenElse* op, Expr* expr) override { + auto* node = expr->As(); + Visit(&node->condition, &node->condition); + if (node->true_case.As() && + (node->true_case.As()->stmts.size() == 1)) { + node->true_case = node->true_case.As()->stmts[0]; + } + Visit(&node->true_case, &node->true_case); + if (node->false_case.defined()) { + if (node->false_case.As() && + (node->false_case.As()->stmts.size() == 1)) { + node->false_case = node->false_case.As()->stmts[0]; + } + Visit(&node->false_case, &node->false_case); + } + } + + void Visit(const ScheduleBlock* op, Expr* expr) override { + auto* node = expr->As(); + CHECK(node); + for (auto& var : node->iter_vars) { + if (var->lower_bound.defined()) { + Visit(&var->lower_bound, &var->lower_bound); + } + if (var->upper_bound.defined()) { + Visit(&var->upper_bound, &var->upper_bound); + } + } + for (auto& buffer_region : node->read_buffers) { + Visit(&buffer_region, &buffer_region); + } + for (auto& buffer_region : node->write_buffers) { + Visit(&buffer_region, &buffer_region); + } + + if (node->body.As()) { + if (node->body.As()->stmts.size() == 1) { + node->body = node->body.As()->stmts[0]; + } + } + + Visit(&(node->body), &(node->body)); + } }; struct SimplifyForLoopsMutator : public ir::IRMutator<> { diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc index b1e73e3c58a9b..3764e1bd616e2 100644 --- a/paddle/cinn/optim/optimize.cc +++ b/paddle/cinn/optim/optimize.cc @@ -27,7 +27,6 @@ #include "paddle/cinn/optim/lower_function_call_bind_vars.h" #include "paddle/cinn/optim/lower_intrin.h" #include "paddle/cinn/optim/map_extern_call.h" -#include "paddle/cinn/optim/remove_nested_block.h" #include "paddle/cinn/optim/remove_schedule_block.h" #include "paddle/cinn/optim/replace_const_param_to_integer.h" #include "paddle/cinn/optim/transform_gpu_forloop.h" @@ -65,8 +64,8 @@ Expr Optimize(Expr e, CudaSyncThreadsDropIfThenElse(&copied); #endif - RemoveNestedBlock(&copied); - VLOG(4) << "After Optimize RemoveNestedBlock:" << copied; + SimplifyBlocks(&copied); + VLOG(4) << "After SimplifyBlocks:" << copied; MapExternCall(&copied, target); VLOG(10) << "After Optimize MapExternCall:" << copied; diff --git a/paddle/cinn/optim/remove_nested_block.cc b/paddle/cinn/optim/remove_nested_block.cc deleted file mode 100644 index 06050ec5b123c..0000000000000 --- a/paddle/cinn/optim/remove_nested_block.cc +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2021 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/cinn/optim/remove_nested_block.h" - -#include "paddle/cinn/ir/utils/ir_mutator.h" -#include "paddle/cinn/ir/utils/ir_printer.h" - -namespace cinn { -namespace optim { - -Expr GetExprInsideBlock(Expr op) { - Expr node = op; - while (node.As()) { - auto& stmts = node.As()->stmts; - if (stmts.size() == 1) { - node = stmts.front(); - } else { - break; - } - } - return node; -} - -// This will remove the nested blocks, but it will also remove the block outside -// the forloop's body. -struct NestedBlockSimplifer : public ir::IRMutator { - void operator()(ir::Expr* expr) { Visit(expr); } - - private: - void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - void Visit(const ir::Block* expr, Expr* op) override { - auto* node = op->As(); - if (node->stmts.size() == 1) { - *op = GetExprInsideBlock(*op); - IRMutator::Visit(op, op); - } else { - IRMutator::Visit(expr, op); - } - } -}; - -struct NestedBlockRemover : public ir::IRMutator { - void operator()(ir::Expr* expr) { Visit(expr); } - - private: - void Visit(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - void Visit(const ir::Block* expr, Expr* op) override { - auto* node = op->As(); - - std::vector new_exprs; - - bool detect_nested = false; - for (auto it = node->stmts.begin(); it != node->stmts.end(); it++) { - auto* block = it->As(); - if (block) { - detect_nested = true; - new_exprs.insert( - std::end(new_exprs), block->stmts.begin(), block->stmts.end()); - } else { - new_exprs.push_back(*it); - } - } - - node->stmts = new_exprs; - - IRMutator::Visit(expr, op); - } -}; - -// add block outside forloop's body. -struct AddBlockToForloop : public ir::IRMutator<> { - void operator()(ir::Expr* expr) { ir::IRMutator<>::Visit(expr, expr); } - - void Visit(const ir::For* expr, Expr* op) override { - auto* node = op->As(); - if (!node->body.As()) { - node->body = ir::Block::Make({node->body}); - } - - ir::IRMutator<>::Visit(expr, op); - } - - void Visit(const ir::PolyFor* expr, Expr* op) override { - auto* node = op->As(); - if (!node->body.As()) { - node->body = ir::Block::Make({node->body}); - } - - ir::IRMutator<>::Visit(expr, op); - } - - void Visit(const ir::_LoweredFunc_* expr, Expr* op) override { - auto* node = op->As(); - if (!node->body.As()) { - node->body = ir::Block::Make({node->body}); - } - - ir::IRMutator<>::Visit(expr, op); - } -}; - -void RemoveNestedBlock(Expr* e) { - NestedBlockRemover()(e); - NestedBlockSimplifer()(e); - AddBlockToForloop()(e); -} - -} // namespace optim -} // namespace cinn diff --git a/paddle/cinn/optim/remove_nested_block_test.cc b/paddle/cinn/optim/remove_nested_block_test.cc deleted file mode 100644 index 27238329dfbd7..0000000000000 --- a/paddle/cinn/optim/remove_nested_block_test.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2021 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/cinn/optim/remove_nested_block.h" - -#include - -#include -#include - -#include "paddle/cinn/ir/utils/ir_printer.h" -#include "paddle/cinn/utils/string.h" - -namespace cinn { -namespace optim { - -TEST(RemoveNestedBlock, basic) { - auto block0 = ir::Block::Make({Expr(1.f), Expr(1.f)}); - auto block1 = ir::Block::Make({block0}); - auto e = Expr(block1); - - std::string origin = utils::GetStreamCnt(e); - EXPECT_EQ(origin, utils::Trim(R"ROC( -{ - { - 1.00000000f - 1.00000000f - } -} - )ROC")); - - std::cout << "origin:\n" << e << std::endl; - - RemoveNestedBlock(&e); - - std::cout << "e:\n" << e << std::endl; - - EXPECT_EQ(utils::GetStreamCnt(e), utils::Trim(R"ROC( -{ - 1.00000000f - 1.00000000f -} - )ROC")); -} - -} // namespace optim -} // namespace cinn diff --git a/paddle/cinn/optim/tensor_write_tell.h b/paddle/cinn/optim/tensor_write_tell.h deleted file mode 100644 index f8ee114561a30..0000000000000 --- a/paddle/cinn/optim/tensor_write_tell.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2021 CINN Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include - -#include "paddle/cinn/ir/ir.h" -#include "paddle/cinn/ir/utils/ir_mutator.h" - -namespace cinn { -namespace optim { - -struct TensorWriteTeller : public ir::IRMutator { - //! Collect the write info in \p op. 
- void Collect(const Expr* op) { Visit(op, op); } - - bool IsWrite(const std::string& tensor_name) const { - return tensor_written.count(tensor_name); - } - - private: - std::set tensor_written; - - void Visit(const Expr* expr, const Expr* op) override { - IRMutator::Visit(expr, op); - } - - void Visit(const ir::Store* expr, const Expr* op) override { - auto* node = op->As(); - CHECK(node); - auto* tensor = node->tensor.As(); - CHECK(tensor); - tensor_written.insert(tensor->name); - IRMutator::Visit(expr, op); - } - - void Visit(const ir::_Tensor_* op, const Expr* expr) override { - auto* node = expr->As(); - if (node->is_call_node()) { - tensor_written.insert(node->name); - } - } -}; - -} // namespace optim -} // namespace cinn diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc index 745bec47b4507..2f3a9b29a3567 100644 --- a/paddle/cinn/optim/vectorize_loops.cc +++ b/paddle/cinn/optim/vectorize_loops.cc @@ -31,7 +31,6 @@ #include "paddle/cinn/ir/utils/ir_printer.h" #include "paddle/cinn/optim/ir_replace.h" #include "paddle/cinn/optim/ir_simplify.h" -#include "paddle/cinn/optim/tensor_write_tell.h" #include "paddle/cinn/optim/unroll_loops.h" #include "paddle/cinn/utils/functional.h" @@ -185,7 +184,7 @@ class CudaVectorizer : public IRMutator { const Var iter_var_; // the loop var of the vectorized loop const int factor_; // the vectorization factor - TensorWriteTeller write_teller_; + std::set write_teller_; TensorVectorizeTeller vectorized_teller_; absl::flat_hash_map tensor2vectorized_vars_; @@ -215,7 +214,7 @@ class CudaVectorizer : public IRMutator { } void Visit(Expr *expr) { - write_teller_.Collect(expr); + write_teller_ = ir::CollectTensorNeedsWrite(expr); vectorized_teller_.Collect(expr); IRMutator::Visit(expr, expr); } @@ -289,7 +288,7 @@ class CudaVectorizer : public IRMutator { const std::vector &indices, bool is_store) { auto *node = tensor.As(); - bool is_const = !write_teller_.IsWrite(node->name); + bool is_const = !write_teller_.count(node->name); // generate the corresponding vector type Type scalar_type = tensor->type().ElementOf(); diff --git a/paddle/cinn/runtime/cuda/float16.h b/paddle/cinn/runtime/cuda/float16.h index cae59186dc832..d64731387d596 100644 --- a/paddle/cinn/runtime/cuda/float16.h +++ b/paddle/cinn/runtime/cuda/float16.h @@ -597,9 +597,9 @@ __host__ __device__ inline bool(isfinite)(const float16& a) { __host__ __device__ inline float16(abs)(const float16& a) { #if defined(CINN_CUDA_FP16) && (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) - return float16(__habs(a.to_half())); + return static_cast(__habs(a.to_half())); #else - return float16(fabsf(static_cast(a))); + return static_cast(fabsf(static_cast(a))); #endif } diff --git a/paddle/cinn/utils/attribute_util.h b/paddle/cinn/utils/attribute_util.h index aaffed7085c7b..17c1471c38c2d 100644 --- a/paddle/cinn/utils/attribute_util.h +++ b/paddle/cinn/utils/attribute_util.h @@ -18,29 +18,29 @@ #include "paddle/cinn/common/type.h" #include "paddle/cinn/utils/type_defs.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/ir/core/builtin_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/phi/common/data_type.h" +#include "paddle/pir/core/builtin_type.h" namespace cinn { namespace utils { -using NewIR_AttributeMap = std::unordered_map; +using NewIR_AttributeMap = std::unordered_map; -Attribute ConvertAttribute(const ::ir::Attribute& src_attr) { +Attribute ConvertAttribute(const ::pir::Attribute&
src_attr) { Attribute dst_attr; - if (src_attr.isa<::ir::BoolAttribute>()) { - dst_attr = src_attr.dyn_cast<::ir::BoolAttribute>().data(); - } else if (src_attr.isa<::ir::FloatAttribute>()) { - dst_attr = src_attr.dyn_cast<::ir::FloatAttribute>().data(); - } else if (src_attr.isa<::ir::Int32Attribute>()) { - dst_attr = src_attr.dyn_cast<::ir::Int32Attribute>().data(); - } else if (src_attr.isa<::ir::StrAttribute>()) { - dst_attr = src_attr.dyn_cast<::ir::StrAttribute>().AsString(); - } else if (src_attr.isa<::ir::Int64Attribute>()) { - dst_attr = src_attr.dyn_cast<::ir::Int64Attribute>().data(); - } else if (src_attr.isa<::ir::DoubleAttribute>()) { - dst_attr = src_attr.dyn_cast<::ir::DoubleAttribute>().data(); + if (src_attr.isa<::pir::BoolAttribute>()) { + dst_attr = src_attr.dyn_cast<::pir::BoolAttribute>().data(); + } else if (src_attr.isa<::pir::FloatAttribute>()) { + dst_attr = src_attr.dyn_cast<::pir::FloatAttribute>().data(); + } else if (src_attr.isa<::pir::Int32Attribute>()) { + dst_attr = src_attr.dyn_cast<::pir::Int32Attribute>().data(); + } else if (src_attr.isa<::pir::StrAttribute>()) { + dst_attr = src_attr.dyn_cast<::pir::StrAttribute>().AsString(); + } else if (src_attr.isa<::pir::Int64Attribute>()) { + dst_attr = src_attr.dyn_cast<::pir::Int64Attribute>().data(); + } else if (src_attr.isa<::pir::DoubleAttribute>()) { + dst_attr = src_attr.dyn_cast<::pir::DoubleAttribute>().data(); } else if (src_attr.isa()) { auto& arr = src_attr.dyn_cast() .data() @@ -75,10 +75,10 @@ AttributeMap ConvertAttributes(const NewIR_AttributeMap& src_attrs) { } #define CASE_TYPE(src, dst) \ - else if (type.isa<::ir::src>()) return common::dst(); + else if (type.isa<::pir::src>()) return common::dst(); -common::Type ConvertIRType(::ir::Type type) { - if (type.isa<::ir::BFloat16Type>()) return common::BF16(); +common::Type ConvertIRType(::pir::Type type) { + if (type.isa<::pir::BFloat16Type>()) return common::BF16(); CASE_TYPE(Float16Type, F16) CASE_TYPE(Float32Type, F32) CASE_TYPE(Float64Type, F64) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 628bf6d00c11c..c8e35ad43a36b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -8,7 +8,7 @@ add_subdirectory(pybind) add_subdirectory(eager) add_subdirectory(prim) add_subdirectory(jit) -add_subdirectory(ir) +add_subdirectory(pir) add_subdirectory(ir_adaptor) add_subdirectory(primitive) # NOTE: please add subdirectory inference at last. 
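For orientation, a minimal sketch of how the converters in paddle/cinn/utils/attribute_util.h above might be exercised after the rename. The IrContext handle and the sample values are assumptions for illustration; only ConvertAttribute and ConvertIRType come from this header, and the pir builtin getters are assumed to follow their usual get(ctx, ...) form:

  // Hypothetical usage sketch (values assumed, not from this patch).
  ::pir::IrContext* ctx = ::pir::IrContext::Instance();
  ::pir::Attribute src = ::pir::Int32Attribute::get(ctx, 42);
  cinn::utils::Attribute dst = cinn::utils::ConvertAttribute(src);  // variant holding int32_t 42
  common::Type ty = cinn::utils::ConvertIRType(::pir::Float32Type::get(ctx));  // common::F32()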
diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc index 61b2c6bb91c46..81f25a8d6ed88 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.cc @@ -63,7 +63,6 @@ LayerNormSPMDRule::InferForward(const std::vector& input_specs, int begin_norm_axis = ExtractAttr("begin_norm_axis", attrs); - // Step2.3.2 handle input tensor partial (TODO) VLOG(4) << "LayerNormSPMDRule InferForward Inputs: " << "x shape: [" << str_join(x_shape) << "], x_dims_mapping: [" << str_join(x_dims_mapping) << "]; scale shape: [" @@ -74,9 +73,9 @@ LayerNormSPMDRule::InferForward(const std::vector& input_specs, << begin_norm_axis << "]; "; // step1: build Einsum Notation - // ijk,k,k->ijk,x,x (x,scale,bias->out,mean,variance, begin_norm_axis=2, x=ij) - // ijkl,y(kl),y(kl)->ijkl,x(ij),x(ij) (x,scale,bias->out,mean,variance, - // begin_norm_axis=2, x=ij, y=kl) + // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij) + // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance, + // begin_norm_axis=2, z=ij, y=kl) std::string x_axes = ""; for (auto i = 0; i < x_ndim; ++i) { x_axes += static_cast(static_cast('k') - begin_norm_axis + i); @@ -124,15 +123,20 @@ LayerNormSPMDRule::InferForward(const std::vector& input_specs, out_dims_mapping.reserve(out_axes.size()); int64_t mean_shard_dim = -1; - for (size_t i = 0; i < out_axes.size(); ++i) { - if (i < static_cast(begin_norm_axis)) { - out_dims_mapping.push_back(x_dims_mapping[i]); - // if ijk,k,k->ijk,x,x (x,scale,bias->out,mean,variance, - // begin_norm_axis=2, x=ij), and the dims_mapping of input is (0,1,-1), + // As the mean and variance in outputs are `flattened` from + // x[0:begin_norm_axis], only the first axis can be sharded, + // the axes 1 to begin_norm_axis-1 are set to be replicated. + std::vector x_dims_mapping_dst(x_ndim, -1); + x_dims_mapping_dst[0] = x_dims_mapping[0]; + for (int i = 0; i < x_ndim; ++i) { + if (i < begin_norm_axis) { + out_dims_mapping.push_back(x_dims_mapping_dst[i]); + // if ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, + // begin_norm_axis=2, z=ij), and the dims_mapping of input is (0,1,-1), // the mean and variance are sharded by dims 0 and 1, // which is not supported currently. - mean_shard_dim = - ShardingMergeForAxis(mean_axes, mean_shard_dim, x_dims_mapping[i]); + mean_shard_dim = ShardingMergeForAxis( + mean_axes, mean_shard_dim, x_dims_mapping_dst[i]); } else { out_dims_mapping.push_back(-1); } @@ -142,7 +146,7 @@ LayerNormSPMDRule::InferForward(const std::vector& input_specs, varience_dist_attr_dst.set_dims_mapping({mean_shard_dim}); // step2.3: Merge and get Inputs' New Dims Mapping. - x_dist_attr_dst.set_dims_mapping(out_dims_mapping); + x_dist_attr_dst.set_dims_mapping(x_dims_mapping_dst); input_dist_attrs.emplace_back(x_dist_attr_dst); // TODO(zhiqiu): support sharding on scale and bias // Now, apply replicating.
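To make the changed forward rule concrete, a worked example with assumed values (not taken from this patch): for x of shape [B, S, H] with begin_norm_axis = 2 and x dims_mapping (0, 1, -1), the destination mapping keeps only the first axis, so x_dims_mapping_dst = (0, -1, -1); out then gets (0, -1, -1), and mean and variance get dims_mapping (0), since both are flattened from x[0:begin_norm_axis] and can only follow the sharding of the first axis.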
@@ -173,12 +177,102 @@ LayerNormSPMDRule::InferForward(const std::vector& input_specs, std::pair, std::vector> LayerNormSPMDRule::InferBackward( + const std::vector& input_specs, const std::vector& output_specs, const paddle::framework::AttributeMap& attrs) { - PADDLE_THROW(phi::errors::Unimplemented( - "InferBackward of LayerNormSPMDRule is NOT implemented yet.")); + // step0: verify input args based on layer_norm logic + int64_t ninputs = input_specs.size(); + int64_t noutputs = output_specs.size(); + PADDLE_ENFORCE_EQ( + ninputs, + 3, + phi::errors::InvalidArgument( + "The size of InputSpec of layer_norm should be 3, but got [%d].", + ninputs)); + PADDLE_ENFORCE_EQ( + noutputs, + 3, + phi::errors::InvalidArgument( + "The size of OutputSpec of layer_norm should be 3, but got [%d].", + noutputs)); + VerifySpecs(output_specs, "layer_norm_backward"); + + // step1: build Einsum Notation + // ijk,k,k->ijk,z,z (x,scale,bias->out,mean,variance, begin_norm_axis=2, z=ij) + // ijkl,y(kl),y(kl)->ijkl,z(ij),z(ij) (x,scale,bias->out,mean,variance, + // begin_norm_axis=2, z=ij, y=kl) + int begin_norm_axis = ExtractAttr("begin_norm_axis", attrs); + std::string alphabet = "ijklmnopqrstuvwxyz"; + int x_ndim = input_specs[0].shape().size(); + std::string x_axes = alphabet.substr(0, x_ndim); + // only the first axis of x may keep its sharding, + // so set the notation of all other axes to '1' (replicated). + for (int i = 1; i < x_ndim; i++) { + x_axes[i] = '1'; + } + std::string out_axes = x_axes; + std::string mean_axes(1, '1'), varience_axes(1, '1'); + if (begin_norm_axis > 0) { + mean_axes[0] = out_axes[0]; + varience_axes[0] = out_axes[0]; + } + std::vector output_axes_vec; + output_axes_vec.emplace_back(out_axes); + output_axes_vec.emplace_back(mean_axes); + output_axes_vec.emplace_back(varience_axes); + + // step2: Sharding Propagation + // For the axes after norm_axis in both input and output tensors, + // set their dims mappings to -1. For the other axes, set input + // tensor's dims mapping the same as output tensor's dims mapping. + // step2.1 merge dims mappings of output, mean, variance.
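// Illustrative walk-through with assumed mappings (not from this patch):
// with begin_norm_axis = 2 and x_ndim = 3, x_axes and out_axes become "i11"
// and mean_axes = varience_axes = "i"; if out, mean and variance carry
// dims_mappings (0, -1, -1), (0) and (0), the merge below binds i -> 0, so
// x is inferred as (0, -1, -1) while scale and bias stay replicated.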
+ std::vector>> axes_sharding_info; + axes_sharding_info = GetAxesDimsMappingPair(output_axes_vec, output_specs); + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors(axes_sharding_info); + + // step2.2 infer input dims mapping + std::vector input_dims_mapping = + GetDimsMappingForAxes(x_axes, axis_to_dim_map); + std::vector input_dist_attrs; + for (int64_t i = 0; i < ninputs; i++) { + input_dist_attrs.emplace_back(input_specs[i].dist_attr()); + } + input_dist_attrs[0].set_dims_mapping(input_dims_mapping); + // set bias and scale to be replicated + input_dist_attrs[1].set_dims_mapping({-1}); + input_dist_attrs[2].set_dims_mapping({-1}); + + // step2.3 update output dims mappings with merged one + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + output_dist_attrs.emplace_back(output_specs[i].dist_attr()); + output_dist_attrs[i].set_dims_mapping( + GetDimsMappingForAxes(output_axes_vec[i], axis_to_dim_map)); + } + + VLOG(4) << "LayerNormSPMDRule InferBackward:"; + VLOG(4) << "begin_norm_axis: " << begin_norm_axis; + for (int64_t i = 0; i < noutputs; i++) { + VLOG(4) << "Output" << std::to_string(i) << " shape: [" + << str_join(output_specs[i].shape()) << "] " + << "Einsum Notation: " << output_axes_vec[i] + << " src_dims_mapping: [" + << str_join(output_specs[i].dims_mapping()) << "] " + << "dst_dims_mapping: [" + << str_join(output_dist_attrs[i].dims_mapping()) << "]"; + } + + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(input_specs[i].shape()) << "] " + << "Einsum Notation: " << std::string(i == 0 ? x_axes : "1") + << " dims_mapping: [" + << str_join(input_dist_attrs[i].dims_mapping()) << "]"; + } + VLOG(4) << std::endl; - return {}; + return {input_dist_attrs, output_dist_attrs}; } } // namespace auto_parallel diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h index b3bd6b6b18faf..da40f3da5653f 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h @@ -32,7 +32,8 @@ class LayerNormSPMDRule : public SPMDRuleBase { const paddle::framework::AttributeMap& attrs) override; std::pair, std::vector> - InferBackward(const std::vector& output_specs, + InferBackward(const std::vector& input_specs, + const std::vector& output_specs, const paddle::framework::AttributeMap& attrs) override; }; } // namespace auto_parallel diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc index 6f50c17fc5c2b..51b4f4b10c675 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.cc @@ -73,7 +73,7 @@ SplitSPMDRule::InferForward(const std::vector& input_specs, std::unordered_map axis_to_dim_map = ShardingMergeForTensors(axes_sharding_info); - // step2.2: infer output dimsmapping from merged input dimsmapping + // step2.2: infer output dims mapping from merged input dims mapping std::vector output_dims_mapping = GetDimsMappingForAxes(output_axes, axis_to_dim_map); @@ -94,7 +94,7 @@ SplitSPMDRule::InferForward(const std::vector& input_specs, new_input_dims_mapping[axis] = -1; new_input_dist_attrs[0].set_dims_mapping(new_input_dims_mapping); - // Step2.4 handle input tensor partial (TODO) + // Step3 Handle 
 
   VLOG(4) << "SplitSPMDRule InferForward: ";
   for (int64_t i = 0; i < ninputs; i++) {
     VLOG(4) << "Input" << std::to_string(i) << " shape: ["
@@ -113,12 +113,104 @@ SplitSPMDRule::InferForward(const std::vector<DistTensorSpec>& input_specs,
 }
 
 std::pair<std::vector<TensorDistAttr>, std::vector<TensorDistAttr>>
-SplitSPMDRule::InferBackward(const std::vector<DistTensorSpec>& output_specs,
+SplitSPMDRule::InferBackward(const std::vector<DistTensorSpec>& input_specs,
+                             const std::vector<DistTensorSpec>& output_specs,
                              const paddle::framework::AttributeMap& attrs) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "InferBackward of SplitPMDRule is NOT implemented yet."));
+  // step0: Verify Input Args Based on Elementwise Logic
+  int64_t ninputs = input_specs.size();
+  int64_t noutputs = output_specs.size();
+  PADDLE_ENFORCE_EQ(
+      ninputs,
+      1,
+      phi::errors::InvalidArgument("The size of InputSpec in split must "
+                                   "be equal to 1, but got [%d].",
+                                   ninputs));
+  VerifySpecs(output_specs, "split");
+
+  // check whether the size of output_specs equals
+  // the specified split num in op attributes
+  int64_t specified_split_num = -1;
+  // split api uses num or sections as attribute
+  if (attrs.find("num") != attrs.end()) {
+    specified_split_num = ExtractAttr<int>("num", attrs);
+  } else if (attrs.find("sections") != attrs.end()) {
+    std::vector<int> sections =
+        ExtractAttr<std::vector<int>>("sections", attrs);
+    specified_split_num = sections.size();
+  }
+  PADDLE_ENFORCE_EQ(
+      noutputs,
+      specified_split_num,
+      phi::errors::InvalidArgument("The size of OutputSpec [%d] is not equal "
+                                   "to the specified split number [%d].",
+                                   noutputs,
+                                   specified_split_num));
+
+  // step1: Build Einsum Notation
+  int64_t ndim = input_specs[0].shape().size();
+  int64_t axis = ExtractAttr<int>("axis", attrs);
+  if (axis < 0) {
+    axis += ndim;
+  }
+  // 'k' is reserved as the split-axis marker below,
+  // so it is deliberately omitted from the alphabet.
+  std::string alphabet = "abcdefghijlmnopqrstuvwxyz";
+
+  // get einsum notation for input, use a special
+  // notation 'k' to mark the split axis in input
+  std::string input_axes = alphabet.substr(0, ndim);
+  input_axes[axis] = 'k';
+
+  // get einsum notation for output
+  std::string output_axes(input_axes);
+  output_axes[axis] = 'k';
+
+  // step2: Sharding Propagation
+  // step2.1: merge output shardings
+  std::vector<std::string> output_axes_vec;
+  for (int64_t i = 0; i < noutputs; i++) {
+    output_axes_vec.emplace_back(output_axes);
+  }
+  std::vector<std::pair<std::string, std::vector<int64_t>>> axes_sharding_info;
+  axes_sharding_info = GetAxesDimsMappingPair(output_axes_vec, output_specs);
+  std::unordered_map<std::string, int64_t> axis_to_dim_map =
+      ShardingMergeForTensors(axes_sharding_info);
+
+  // step2.2: infer input dims mapping from output dims mapping;
+  // the split axis in input is set to -1.
+  std::vector<int64_t> input_dims_mapping =
+      GetDimsMappingForAxes(input_axes, axis_to_dim_map, true);
+  input_dims_mapping[axis] = -1;
+  TensorDistAttr input_dist_attr(input_specs[0].dist_attr());
+  input_dist_attr.set_dims_mapping(input_dims_mapping);
+
+  // step2.3 get new dist attributes for the outputs. the split axis
+  // cannot be sharded; if it is sharded, set it to replicated.
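Aside: the ShardingMergeForTensors calls above combine per-axis mesh-dim assignments reported by several tensors. A simplified standalone sketch of that merge, with conflict handling reduced to "replicate on disagreement" (the real routine is more elaborate):

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Each pair is (axes string, dims mapping) for one tensor; matching sizes
// are assumed. Agreeing assignments are kept, disagreements fall back to
// replicated (-1).
std::unordered_map<std::string, int64_t> MergeAxes(
    const std::vector<std::pair<std::string, std::vector<int64_t>>>& infos) {
  std::unordered_map<std::string, int64_t> merged;
  for (const auto& [axes, dims_mapping] : infos) {
    for (size_t i = 0; i < axes.size(); ++i) {
      std::string axis(1, axes[i]);
      auto it = merged.find(axis);
      if (it == merged.end()) {
        merged[axis] = dims_mapping[i];
      } else if (it->second != dims_mapping[i]) {
        it->second = -1;  // conflicting shardings -> replicate
      }
    }
  }
  return merged;
}

int main() {
  // Two outputs agree that axis 'a' lives on mesh dim 0, a third wants
  // mesh dim 1, so 'a' ends up replicated; 'k' was already replicated.
  auto merged = MergeAxes({{"ak", {0, -1}}, {"a", {0}}, {"a", {1}}});
  std::cout << merged["a"] << ' ' << merged["k"];  // prints: -1 -1
}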
+ std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + output_dist_attrs.emplace_back(output_specs[i].dist_attr()); + std::vector out_dims_mapping = + GetDimsMappingForAxes(output_axes, axis_to_dim_map, true); + out_dims_mapping[axis] = -1; + output_dist_attrs[i].set_dims_mapping(out_dims_mapping); + } + + // step3 Handle input tensor partial (TODO) + + VLOG(4) << "SplitSPMDRule InferBackward: "; + for (int64_t i = 0; i < noutputs; i++) { + VLOG(4) << "Output" << std::to_string(i) << " shape: [" + << str_join(output_specs[i].shape()) << "] " + << "einsum_notation: " << output_axes << " dims_mapping: [" + << str_join(output_specs[i].dims_mapping()) << "]"; + } + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(input_specs[i].shape()) << "] " + << "einsum_notation: " << input_axes << " dims_mapping: [" + << str_join(input_dims_mapping) << "]"; + } + VLOG(4) << std::endl; - return {}; + return {{input_dist_attr}, output_dist_attrs}; } } // namespace auto_parallel diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h index f974e4cccce05..f8a1300e62409 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/split_spmd_rule.h @@ -32,7 +32,8 @@ class SplitSPMDRule : public SPMDRuleBase { const paddle::framework::AttributeMap& attrs) override; std::pair, std::vector> - InferBackward(const std::vector& output_specs, + InferBackward(const std::vector& input_specs, + const std::vector& output_specs, const paddle::framework::AttributeMap& attrs) override; }; } // namespace auto_parallel diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 70ab3b94de3c5..6dc25faa80b4b 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -284,6 +284,14 @@ static std::shared_ptr GetGC( max_memory_size)); } } +#endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (platform::is_custom_place(place)) { + if (framework::IsFastEagerDeletionModeEnabled()) { + gc.reset(new framework::CustomDeviceUnsafeFastGarbageCollector( + place, max_memory_size)); + } + } #endif } // max_memory_size >= 0 diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index 7c9cf9c8112ef..7645abf24cfd3 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -29,8 +29,8 @@ #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index e27310dea5629..25d2f4dacfd16 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -65,7 +65,7 @@ if(WIN32) add_custom_command( OUTPUT ${eager_generator_path}/ir.dll COMMAND ${CMAKE_COMMAND} -E copy ${IR_LIB} ${eager_generator_path} - DEPENDS ir) + DEPENDS pir) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/ir.dll) endif() diff --git 
a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index da4a9aab53870..519e50b9175cc 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -81,6 +81,30 @@ "matmul_grad": {"x": "grad_y", "y": "grad_x"}, } +strided_op_list = { + "as_complex", + "as_real", + "as_strided", + "real", + "imag", + "diagonal", + "flatten", + "flatten_infer", + "reshape", + "slice", + "squeeze_infer", + "squeeze", + "strided_slice", + "strided_slice_raw", + "tensor_unfold", + "transpose", + "unbind", + "unsqueeze_infer", + "unsqueeze", + "view_shape", + "view_dtype", +} + ######### # Utils # @@ -234,6 +258,9 @@ class {} : public egr::GradNodeBase {{ // Node Declaration std::shared_ptr<{}> grad_node; + // Pre contiguous tensor in not strided op, if 1)require_any_grad=true; 2) need wrapper to backward; 3) not contiguous +{} + // Set grad_node before API Call {} @@ -380,6 +407,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/prim/api/all.h" #include "paddle/fluid/prim/utils/utils.h" #include "paddle/phi/core/flags.h" +#include "paddle/phi/api/lib/data_transform.h" PHI_DECLARE_bool(check_nan_inf); {} """ @@ -408,6 +436,7 @@ class {} : public egr::GradNodeBase {{ #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h" #include "paddle/phi/core/flags.h" +#include "paddle/phi/api/lib/data_transform.h" PHI_DECLARE_bool(check_nan_inf); PHI_DECLARE_string(tensor_operants_mode); @@ -505,6 +534,12 @@ class {} : public egr::GradNodeBase {{ if ({}.initialized()) {{ VLOG(10) << {}.name() << "({}) use_count: " << {}.impl().use_count(); if ({}.impl().use_count() == 1 || ({}.impl().use_count() == 2 && {}.impl().get() == {}.impl().get())) {{ + if ({}.is_dense_tensor() && !std::dynamic_pointer_cast({}.impl())->meta().is_contiguous()) {{ + auto tmp = paddle::experimental::Trans2Contiguous(*(std::dynamic_pointer_cast({}.impl()))); + auto holder = tmp.MoveMemoryHolder(); + std::dynamic_pointer_cast({}.impl())->ResetHolder(holder); + std::dynamic_pointer_cast({}.impl())->set_meta(tmp.meta()); + }} can_be_inplaced = true; }} }}""" @@ -977,6 +1012,7 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): set_attributes_list.append(set_attributes) set_attributes_str = "\n".join(set_attributes_list) + need_pre_contiguous_set = set() # SetTensorWrappers set_input_tensor_wrappers_list = [] set_output_tensor_wrappers_list = [] @@ -1000,12 +1036,30 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): {"indent": indent, "name": name} ) else: - set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" + if ( + (forward_api_name in strided_op_list) + or for_backward + or IsVectorTensorType(atype) + or (name in self.optional_inputs) + ): + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name});" + else: + need_pre_contiguous_set.add(name) + set_tensor_wrappers = f"{indent}if({name}) grad_node->SetTensorWrapper{name}(*{name}_tmp);" else: if is_inplace_input: set_tensor_wrappers = f"{indent}auto {name}_clone = paddle::experimental::assign({name});\n{indent}grad_node->SetTensorWrapper{name}({name}_clone);" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" + if ( + (forward_api_name in strided_op_list) + or for_backward + or IsVectorTensorType(atype) + or (name in 
self.optional_inputs) + ): + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" + else: + need_pre_contiguous_set.add(name) + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}_tmp);" set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: # Forwad's output as backward's input if num_fwd_outputs > 1: @@ -1025,6 +1079,24 @@ def GenerateNodeCreationCodes(self, for_backward=False, is_inplaced=False): set_output_tensor_wrappers_list ) + if (forward_api_name in strided_op_list) or for_backward: + self.inputs_call_list_tmp = None + self.node_creation_pre_contiguous_str = "" + else: + self.inputs_call_list_tmp = self.inputs_call_list + pre_contiguous_list = [] + for name, (ttype, pos) in forward_inputs_position_map.items(): + if name in need_pre_contiguous_set: + pre_contiguous_list.append( + f"{indent}const auto& {name}_tmp = (require_any_grad && {name}.is_dense_tensor() && !std::dynamic_pointer_cast({name}.impl())->meta().is_contiguous()) ? paddle::Tensor(std::make_shared(std::move(paddle::experimental::Trans2Contiguous(*(std::dynamic_pointer_cast({name}.impl())))))) : {name};" + ) + self.inputs_call_list_tmp[pos] = ( + self.inputs_call_list_tmp[pos] + '_tmp' + ) + self.node_creation_pre_contiguous_str = "\n".join( + pre_contiguous_list + ) + # SetGradOutMeta & SetEdges grad_node_out_list = [] set_grad_out_meta_list = [] @@ -1463,6 +1535,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): inputs_args_declaration_str = ", ".join(inputs_args_declaration_list) inputs_args_definition_str = ", ".join(inputs_args_definition_list) inputs_call_args_str = ", ".join(inputs_call_list) + self.inputs_call_list = inputs_call_list # Forward Full Logic function_name = forward_api_name @@ -1649,6 +1722,12 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): node_creation_str = self.node_creation_str node_creation_before_call_str = self.node_creation_before_call_str node_creation_after_call_str = self.node_creation_after_call_str + node_creation_pre_contiguous_str = ( + self.node_creation_pre_contiguous_str + ) + if self.inputs_call_list_tmp is not None: + inputs_call_args_str_tmp = ", ".join(self.inputs_call_list_tmp) + forward_call_str = f"{indent}{api_out_type} api_result = paddle::experimental::{namespace}{function_name}({inputs_call_args_str_tmp});" dygraph_event_str = f"{indent}paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);\n" forward_ad_function_name = GetDygraphForwardFunctionName( @@ -1760,6 +1839,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): before_log_str, compute_require_grad_args_str, self.grad_node_name, + node_creation_pre_contiguous_str, node_creation_before_call_str, forward_call_str, check_nan_inf_str, @@ -2160,6 +2240,11 @@ def GenerateNodeDefinition( transformed_tensor_name, transformed_tensor_name, tensor_wrapper_intermidiate_tensor_str, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, ) inplace_grad_input_str = transformed_tensor_name if is_optional: @@ -2229,6 +2314,11 @@ def GenerateNodeDefinition( transformed_tensor_name, transformed_tensor_name, grads_tensor_str, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, + transformed_tensor_name, ) inplace_grad_input_str = transformed_tensor_name diff --git 
a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 9532d3181ac73..cdd1f7bfbe945 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/custom_operator_utils.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/core/dense_tensor.h" namespace egr { @@ -201,7 +202,18 @@ RunCustomOpNode::operator()(paddle::small_vector, } VLOG(6) << "Prepare Grad inputs"; - for (const auto& in : tmp_ins) { + for (auto& in : tmp_ins) { + for (auto& tensor : in) { + if (tensor.initialized() && tensor.is_dense_tensor() && + !std::dynamic_pointer_cast(tensor.impl()) + ->meta() + .is_contiguous()) { + tensor.set_impl(std::make_shared( + std::move(paddle::experimental::Trans2Contiguous(*( + std::dynamic_pointer_cast(tensor.impl())))))); + } + } + ctx.EmplaceBackInputs(in); } VLOG(6) << "Prepare Grad attrs"; diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 8f6e6f4028c1d..2a3304fffe63c 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -98,6 +98,7 @@ inline void run_program_ad_func( std::vector& dout, // NOLINT const paddle::framework::AttributeMap& attrs) { // Prepare Autograd Meta + VLOG(2) << "start run run_program ad function."; auto deref_out = details::DereferenceTensors(out); std::vector p_autograd_x = egr::EagerUtils::nullable_autograd_meta(x); @@ -174,3 +175,107 @@ inline void run_program_ad_func( egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); } } + +inline void newir_run_program_ad_func( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + // Prepare Autograd Meta + VLOG(2) << "start run newir run_program ad function."; + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + // Create Middle Output for GradNode. 
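Aside: extraction stripped the angle-bracket template arguments from the generator's f-strings above. Restored by hand, the pre-contiguous guard emitted for an input named x reads roughly as follows; treat this as a sketch of the generated code, not a verbatim quote of the patch:

// Sketch of the generated guard (template arguments restored by hand):
const auto& x_tmp =
    (require_any_grad && x.is_dense_tensor() &&
     !std::dynamic_pointer_cast<phi::DenseTensor>(x.impl())
          ->meta()
          .is_contiguous())
        ? paddle::Tensor(std::make_shared<phi::DenseTensor>(
              std::move(paddle::experimental::Trans2Contiguous(
                  *(std::dynamic_pointer_cast<phi::DenseTensor>(x.impl()))))))
        : x;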
+ auto middle_size = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm")).size(); + auto output_size = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")).size(); + auto middles = std::vector(); + std::shared_ptr grad_node; + VLOG(2) << "start run run_program with require_any_grad = " + << require_any_grad; + + if (require_any_grad) { + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + grad_node = std::make_shared(1, 2); + grad_node->GetMiddle().resize(middle_size); + grad_node->GetOutputs().resize(output_size); + for (size_t i = 0; i < middle_size; ++i) { + grad_node->GetMiddle()[i] = + paddle::Tensor(std::make_shared()); + middles.push_back(&grad_node->GetMiddle()[i]); + } + for (size_t i = 0; i < output_size; ++i) { + grad_node->GetOutputs()[i] = *out[i]; + } + } + + // Call forward function + // if require_any_grad is False, don't save any middle vars. + NewIRRunProgramAPI( + x, params, out, middles, step_scope, dout, require_any_grad, attrs); + if (require_any_grad) { + // auto x_names = + // PADDLE_GET_CONST(std::vector, attrs.at("x_names")); + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + + // Set Attributes + grad_node->SetAttrMap(attrs); + + // auto* forward_global_block = PADDLE_GET_CONST( + // paddle::framework::BlockDesc*, attrs.at("forward_global_block")); + // auto* backward_global_block = PADDLE_GET_CONST( + // paddle::framework::BlockDesc*, attrs.at("backward_global_block")); + // Clear unused x vars + // auto filter_x = + // filter_unused_input_var_in_backward(x, x_names, backward_global_block); + // Set TensorWrappers + grad_node->SetFwdX(x); + // Clear unused out vars + // clear_unused_out_var_in_backward(out, backward_global_block, + // step_scope[0]); + + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); // just for set useable. + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + // NOTE(@xiongkun): Not every tensor in x(list of tensor) is required + // gradient. for example: x[1] is not used for output, the x[1] is ignored. + + // TODO(@xiongkun): rewrite by new ir representation. 
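Aside: note the order of operations above: GetMiddle() is resized once before any element addresses are taken. A standalone illustration of why that matters, with int standing in for paddle::Tensor:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> storage;
  storage.resize(3);  // fix element addresses up front
  std::vector<int*> views;
  for (std::size_t i = 0; i < storage.size(); ++i) {
    storage[i] = static_cast<int>(i);
    // Safe only because storage will not reallocate anymore; pushing into
    // storage itself inside this loop would invalidate earlier pointers.
    views.push_back(&storage[i]);
  }
  for (int* p : views) std::cout << *p << ' ';  // prints: 0 1 2
}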
+ std::vector x_require_grad; + for (size_t i = 0; i < x.size(); ++i) { + x_require_grad.push_back(&x[i]); + } + + grad_node->SetGradOutMeta(x_require_grad, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); + + // VLOG(2) << "clear_no_grad_edges."; + // clear_no_grad_edges_with_partial_block(params, + // forward_global_block, + // backward_global_block, + // grad_node.get(), + // [>slot id<] 1); + + grad_node->SetGradInMeta(deref_out, 0); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 72c61c1723a3b..8f6f8cbbc22fc 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -19,13 +19,16 @@ #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" PHI_DECLARE_bool(enable_new_ir_in_executor); @@ -175,6 +178,33 @@ static void ShareTensorsIntoScopeWithName( } } +static auto GetNameFromValue(const ::pir::Block *block, + const std::vector<::pir::Value> &values) { + // we use name here, later value is used directly. + std::unordered_map<::pir::Value, std::string> value2name; + for (auto *op : *block) { + std::string name; + if (op->name() == "pd_op.data") { + name = + op->attributes().at("name").dyn_cast().AsString(); + value2name[op->results()[0].Value::impl()] = name; + } else if (op->name() == "builtin.set_parameter") { + name = op->attributes() + .at("parameter_name") + .dyn_cast() + .AsString(); + value2name[op->operand(0).source()] = name; + } + } + std::vector names; + std::transform( + values.begin(), + values.end(), + std::back_inserter(names), + [&value2name](const ::pir::Value &v) { return value2name[v]; }); + return names; +} + static void ShareTensorsFromScope( const std::vector &tensors, const paddle::framework::BlockDesc &global_block, @@ -216,6 +246,52 @@ static void ShareTensorsFromScope( } } +static void ShareTensorsIntoScopeByValue( + const ::pir::Block *block, + const std::vector &tensors, + const std::vector<::pir::Value> &values, + paddle::framework::Scope *scope) { + auto names = GetNameFromValue(block, values); + ShareTensorsIntoScopeWithName(tensors, names, scope); +} + +static void ShareTensorsFromScopeByValue( + const ::pir::Block *block, + const std::vector &tensors, + const std::vector<::pir::Value> &values, + paddle::framework::Scope *scope) { + auto names = GetNameFromValue(block, values); + for (size_t i = 0; i < tensors.size(); ++i) { + auto &name = names[i]; + auto &value = values[i]; + if (value.impl() == nullptr) { + // skip stop_gradient. 
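Aside: GetNameFromValue above follows a common two-phase shape: build a value-to-name map in one pass over the block, then project an arbitrary value list onto names. A standalone sketch with int standing in for ::pir::Value:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  // Phase 1: one pass over the "block" fills the lookup table.
  std::unordered_map<int, std::string> value2name{
      {101, "x"}, {102, "scale"}, {103, "bias"}};
  // Phase 2: project any value list onto names, preserving order.
  std::vector<int> values{103, 101};
  std::vector<std::string> names;
  std::transform(values.begin(), values.end(), std::back_inserter(names),
                 [&value2name](int v) { return value2name[v]; });
  for (const auto& n : names) std::cout << n << ' ';  // prints: bias x
}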
+ continue; + } + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL( + var, + paddle::platform::errors::NotFound("The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + static void ShareTensorsFromScopeWithPartialBlock( const std::vector &tensors, const paddle::framework::BlockDesc &forward_global_block, @@ -309,8 +385,194 @@ static void GcScope(paddle::framework::Scope *scope) { delete garbages; // free mem } +template +void print_collection(const T &t) { + VLOG(5) << "Print collection start :"; + for (auto s : t) { + VLOG(5) << s; + } + VLOG(5) << "Print collection end."; +} + } // namespace details +inline void NewIRRunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &middles, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + bool require_any_grad, + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + // In the original run_program OP, the default value of the is_test + // attribute is false, we should check if there is is_test parameter + // in attrs + auto is_test = false; + if (attrs.count("is_test")) { + is_test = PADDLE_GET_CONST(bool, attrs.at("is_test")); + } + int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id")); + auto place = egr::Controller::Instance().GetExpectedPlace(); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. Originally, here can use scope directly. 
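Aside: the interpreter cores created below are cached per (program_id, scope, is_grad), so forward and backward of the same program each get a dedicated core while repeated calls reuse it. A simplified stand-in for InterpreterCoreInfoCache; Core and Scope here are placeholders, not Paddle types:

#include <cstdint>
#include <map>
#include <memory>
#include <tuple>

struct Core {};
struct Scope {};

class CoreCache {
 public:
  // Key mirrors the real cache lookups: program id, scope, is_grad.
  using Key = std::tuple<int64_t, const Scope*, bool>;
  bool Has(const Key& k) const { return cores_.count(k) != 0; }
  std::shared_ptr<Core>& GetMutable(const Key& k) { return cores_[k]; }

 private:
  std::map<Key, std::shared_ptr<Core>> cores_;
};

int main() {
  CoreCache cache;
  Scope scope;
  CoreCache::Key fwd{42, &scope, false};
  if (!cache.Has(fwd)) cache.GetMutable(fwd) = std::make_shared<Core>();
  // A later backward run keyed {42, &scope, true} misses this entry and
  // builds its own core.
}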
+ auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), + 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + VLOG(2) << "RunProgramOp use interpretercore to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + + VLOG(4) << "global_inner_scope:" << global_inner_scope; + + auto input_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fx")); + auto output_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")); + auto middle_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm")); + auto param_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp")); + // auto dout_names = + // PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp")); + + auto *forward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("forward_global_block")); + auto *backward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block")); + + auto *forward_program = + forward_global_block->GetParentOp()->GetParentProgram(); + auto *backward_program = + backward_global_block->GetParentOp()->GetParentProgram(); + + if (VLOG_IS_ON(4)) { + std::ostringstream print_stream; + forward_program->Print(print_stream); + print_stream << "\n"; + backward_program->Print(print_stream); + VLOG(4) << print_stream.str(); + } + + VLOG(10) << is_test << program_id; + + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has( + program_id, global_inner_scope, /*is_grad=*/false)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cache, so create a new interpretercore " + "for program: " + << program_id; + // Step 1. share input_vars & parameters into scope + details::ShareTensorsIntoScopeByValue( + forward_global_block, x, input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue( + forward_global_block, params, param_values, global_inner_scope); + // Step 2. create new interpretercore + auto kernel_forward_program = + paddle::dialect::PdOpLowerToKernelPass(forward_program, place); + interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(kernel_forward_program), + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + // Step 3. 
get all eager gc vars + // std::set skip_eager_delete_vars = + // paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( + // *backward_program); + + // update interpretercore skip_gc_var + auto skip_names = + details::GetNameFromValue(forward_global_block, middle_values); + auto skip_names_set = + std::set(skip_names.begin(), skip_names.end()); + skip_names = details::GetNameFromValue(forward_global_block, output_values); + skip_names_set.insert(skip_names.begin(), skip_names.end()); + details::print_collection(skip_names_set); + interpreter_core->SetSkipGcVars(skip_names_set); + + // std::set input_vars; + // input_vars.insert(input_names.begin(), input_names.end()); + // interpreter_core->SetJitInputVars(input_vars); + + // interpretercore_info_cache.UpdateSkipEagerDeleteVars( + // program_id, global_inner_scope, false, skip_eager_delete_vars); + } else { + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cache by program:" << program_id; + // Step 1. get cache interpretercore + auto &cached_value = interpretercore_info_cache.GetMutable( + program_id, global_inner_scope, /*is_grad=*/false); + interpreter_core = cached_value.core_; + // Step 2. update scope for cache interpretercore + details::ShareTensorsIntoScopeByValue( + forward_global_block, x, input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue( + forward_global_block, params, param_values, global_inner_scope); + // TODO(xiongkun): new ir how to build scope. + // if (interpreter_core->GetVariableScope()->GetMutableScope() != + // global_inner_scope) { + // details::BuildScopeByBlock( + // *interpreter_core.get(), *forward_global_block, global_inner_scope); + // interpreter_core->reset_scope(global_inner_scope); + //} + } + + // interpretercore run + if (!forward_global_block->empty()) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + interpreter_core->Run({}); + } + + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Get Output, and Middle Outputs + details::ShareTensorsFromScopeByValue( + forward_global_block, out, output_values, global_inner_scope); + details::ShareTensorsFromScopeByValue( + forward_global_block, middles, middle_values, global_inner_scope); + + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + if (is_test || !require_any_grad) { + VLOG(4) << "don't require any grad, set this scope can reused"; + VLOG(4) << "is_test: " << is_test + << ", require_any_grad: " << require_any_grad; + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + } else { + VLOG(4) << "not test, set this scope can not reused"; + global_inner_scope->SetCanReused(false); + details::GcScope(global_inner_scope); // we can gc all the time, because + // we save the middles. 
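Aside: the skip-GC set above is just the union of the names behind the middle values and the output values, so the interpreter's garbage collector keeps alive exactly what the backward program will read later. Standalone illustration:

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> middle_names{"tmp_0", "tmp_1"};
  std::vector<std::string> output_names{"out", "tmp_1"};  // overlap is fine
  std::set<std::string> skip(middle_names.begin(), middle_names.end());
  skip.insert(output_names.begin(), output_names.end());
  for (const auto& n : skip) std::cout << n << ' ';  // prints: out tmp_0 tmp_1
}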
+ } + } + +#ifdef PADDLE_WITH_DNNL + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif +} + inline void RunProgramAPI( const std::vector &x, const std::vector ¶ms, @@ -403,8 +665,13 @@ inline void RunProgramAPI( if (FLAGS_enable_new_ir_in_executor) { // build new ir program - auto ir_program = paddle::framework::ConstructFowardIrProgram( - forward_global_block, backward_global_block, output_names, x, params); + auto ir_program = + paddle::framework::ConstructFowardIrProgram(forward_global_block, + backward_global_block, + output_names, + x, + params, + place); interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache( std::move(ir_program), @@ -660,12 +927,164 @@ inline void RunProgramGradAPI( } } +inline void NewIRRunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &middles, + const std::vector &out, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT +) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), + 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + + int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id")); + + auto place = egr::Controller::Instance().GetExpectedPlace(); + VLOG(2) << "RunProgramGradOp use interpretercore to execute program."; + + VLOG(4) << "global_inner_scope:" << global_inner_scope; + + auto *backward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block")); + auto *backward_program = + backward_global_block->GetParentOp()->GetParentProgram(); + + auto output_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo_g")); + auto forward_input_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx")); + auto forward_middle_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bm")); + auto forward_output_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo")); + auto x_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx_g")); + auto p_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bp_g")); + + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has( + program_id, global_inner_scope, /*is_grad=*/true)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; + // Step 1. 
share input_vars & parameters into scope + // x, param, middles, output_grads + details::ShareTensorsIntoScopeByValue(backward_global_block, + out_grad, + output_grad_values, + global_inner_scope); + details::ShareTensorsIntoScopeByValue( + backward_global_block, x, forward_input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue(backward_global_block, + middles, + forward_middle_values, + global_inner_scope); + details::ShareTensorsIntoScopeByValue( + backward_global_block, out, forward_output_values, global_inner_scope); + auto kernel_backward_program = + paddle::dialect::PdOpLowerToKernelPass(backward_program, place); + interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(kernel_backward_program), + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + // share threadpool + // NOTE(zhiqiu): this only works interpreter_core is executed strictly + // after the related fwd_interpreter_core. + if (interpretercore_info_cache.Has(program_id, global_inner_scope, false)) { + auto fwd_interpreter_core = + interpretercore_info_cache + .GetMutable(program_id, global_inner_scope, /*is_grad=*/false) + .core_; + interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core); + VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() << " to " + << interpreter_core.get(); + } + + // get all eager gc vars + std::set skip_eager_delete_vars; + auto skip_names = + details::GetNameFromValue(backward_global_block, x_grad_values); + skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); + skip_names = + details::GetNameFromValue(backward_global_block, p_grad_values); + skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); + interpreter_core->SetSkipGcVars(skip_eager_delete_vars); + interpretercore_info_cache.UpdateSkipEagerDeleteVars( + program_id, + global_inner_scope, + /*is_grad=*/true, + skip_eager_delete_vars); + VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); + details::print_collection(skip_eager_delete_vars); + } else { + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cahce by program:" << program_id; + auto &cached_value = interpretercore_info_cache.GetMutable( + program_id, global_inner_scope, /*is_grad=*/true); + interpreter_core = cached_value.core_; + + // update scope (TODO: why share again) + // details::ShareTensorsIntoScope(out_grad, global_inner_scope); + // if (interpreter_core->GetVariableScope()->GetMutableScope() != + // global_inner_scope) { + // details::BuildScopeByBlock( + // *interpreter_core.get(), *backward_global_block, global_inner_scope); + // interpreter_core->reset_scope(global_inner_scope); + //} + } + + if (!backward_global_block->empty()) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + interpreter_core->Run({}); + } + + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Step 4. 
get outputs + details::ShareTensorsFromScopeByValue( + backward_global_block, x_grad, x_grad_values, global_inner_scope); + details::ShareTensorsFromScopeByValue( + backward_global_block, params_grad, p_grad_values, global_inner_scope); + VLOG(4) << "after backward gc all vars"; + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + } +} + class GradNodeRunProgram : public egr::GradNodeBase { public: GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} - ~GradNodeRunProgram() { + ~GradNodeRunProgram() override { if (!executed_) { auto *out_scope_vec = &step_scope_; VLOG(4) << "~GradNodeRunProgram"; @@ -828,3 +1247,187 @@ class GradNodeRunProgram : public egr::GradNodeBase { bool executed_{false}; }; + +class NewIRGradNodeRunProgram : public egr::GradNodeBase { + public: + NewIRGradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~NewIRGradNodeRunProgram() override { + if (!executed_) { + auto *out_scope_vec = &step_scope_; + VLOG(4) << "~GradNodeRunProgram"; + // Normally out_scope_vec.size() == 1. for safty, we add for-loop here. + for (size_t i = 0; i < out_scope_vec->size(); ++i) { + paddle::framework::Scope *global_inner_scope = out_scope_vec->at(i); + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + VLOG(4) << "global_inner_scope SetCanReused"; + } + middles_.clear(); + outputs_.clear(); + } + } + // Functor: perform backward computations + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize> &grads, // NOLINT + bool create_graph UNUSED, + bool is_new_grad UNUSED) override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + paddle::small_vector, egr::kSlotSmallVectorSize> + hooked_grads = NewIRGradNodeRunProgram::ApplyGradientHooks(grads); + PADDLE_ENFORCE_EQ(hooked_grads.size(), + 1, + paddle::platform::errors::InvalidArgument( + "The hooked_grads.size() of RunProgramGradOp should " + "be equal to 1.")); + + std::vector x_grad; + std::vector params_grad; + std::vector x_grad_ptr; + std::vector params_grad_ptr; + { + paddle::platform::RecordEvent record_event( + "construct_grad_tensor", + paddle::platform::TracerEventType::UserDefined, + 1); + + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0], + this->InputMeta()[0]); + VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size(); + ConstructXGradTensors(x_, &x_grad); + ConstructParamGradTensors(params_, ¶ms_grad); + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + if (i.defined()) { + params_grad_ptr.emplace_back(&i); + } + } + } + + auto out_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bo_g")); + PADDLE_ENFORCE_EQ(hooked_grads[0].size(), + out_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The hooked_grads[0].size() and " + "out_grad_values.size() should be equal.")); + + VLOG(1) << "Run Program Grad API start."; + NewIRRunProgramGradAPI(x_, + params_, + hooked_grads[0], + middles_, + outputs_, + step_scope_, + attrs_, + x_grad_ptr, + params_grad_ptr); + VLOG(1) << "Run Program Grad API end."; + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + + executed_ = true; + return {x_grad, params_grad}; + } + + void ClearTensorWrappers() override { + x_.clear(); + params_.clear(); + middles_.clear(); + outputs_.clear(); + 
SetIsTensorWrappersCleared(true); + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { x_ = tensors; } + + std::vector &GetMiddle() { return middles_; } + + std::vector &GetOutputs() { return outputs_; } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + protected: + void ConstructXGradTensors(const std::vector &x, + std::vector *x_grad) { + auto x_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bx_g")); + PADDLE_ENFORCE_EQ( + x.size(), + x_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The x.size() and x_grad_names.size() should be equal. " + "But received x.size() = %d, x_grad_names.size() = %d", + x.size(), + x_grad_values.size())); + + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). + for (size_t i = 0; i < x.size(); i++) { + if (x[i].is_dense_tensor()) { + x_grad->emplace_back(std::make_shared()); + } else if (x[i].is_selected_rows()) { + x_grad->emplace_back(std::make_shared()); + } + } + } + + void ConstructParamGradTensors(const std::vector ¶ms, + std::vector *param_grads) { + auto p_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bp_g")); + PADDLE_ENFORCE_EQ(params.size(), + p_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The param.size() and " + "param_grad_names.size() should be equal.")); + + for (size_t i = 0; i < params.size(); ++i) { + auto &p = params[i]; + auto &p_grad = egr::EagerUtils::unsafe_autograd_meta(p)->Grad(); + // In eager mode, the number of param_grad should be the same as + // param, so here an empty Tensor is added for the param with + // stop_gradient=True + if (!p_grad.defined()) { + param_grads->emplace_back(); + } else if (p_grad.is_dense_tensor()) { + param_grads->emplace_back(std::make_shared()); + } else if (p_grad.is_selected_rows()) { + param_grads->emplace_back(std::make_shared()); + } + } + } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new NewIRGradNodeRunProgram(*this)); + return copied_node; + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector middles_; + std::vector outputs_; + std::vector step_scope_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; + + bool executed_{false}; +}; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 3befea7d0fd2b..f72d4ad182ddd 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -696,6 +696,7 @@ cc_library( DEPS while_op_helper recurrent_op_helper conditional_block_op_helper + pylayer_op_helper scope proto_desc operator @@ -1014,8 +1015,9 @@ else() monitor) endif() -target_link_libraries(executor while_op_helper executor_gc_helper - recurrent_op_helper conditional_block_op_helper) +target_link_libraries( + executor while_op_helper executor_gc_helper recurrent_op_helper + conditional_block_op_helper pylayer_op_helper) cc_library( parallel_executor @@ -1035,7 +1037,7 @@ cc_library( executor_cache SRCS executor_cache.cc DEPS parallel_executor standalone_executor phi_kernel_adaptor pd_inplace_pass - pd_op_to_kernel_pass ir) + pd_op_to_kernel_pass pir) if(WITH_PSCORE) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) if(WITH_HETERPS) diff --git 
a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index c2dd1bf37dd19..8814935e3fceb 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -947,12 +947,10 @@ static void RegisterOperatorKernel( #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { - for (auto& dev_id : phi::DeviceManager::GetSelectedDeviceList(dev_type)) { - RegisterOperatorKernelWithPlace(name, - op_kernel_func, - proto::VarType::RAW, - platform::CustomPlace(dev_type, dev_id)); - } + RegisterOperatorKernelWithPlace(name, + op_kernel_func, + proto::VarType::RAW, + platform::CustomPlace(dev_type)); } #endif } diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index f5c4c745cfd51..c03c8542b49c6 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -16,14 +16,14 @@ #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/ir/transforms/inplace_pass.h" -#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" -#include "paddle/ir/pass/pass.h" -#include "paddle/ir/pass/pass_manager.h" +#include "paddle/fluid/pir/transforms/inplace_pass.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/phi/core/flags.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_manager.h" PHI_DECLARE_bool(new_ir_apply_inplace_pass); @@ -304,7 +304,7 @@ std::shared_ptr CreateProgramInterpreterCoreInfoToCache( framework::Scope *scope) { auto &interpretercore_info_cache = framework::InterpreterCoreInfoCache::Instance(); - if (interpretercore_info_cache.Size() > 10u /* max_cached_size*/) { + if (interpretercore_info_cache.Size() > 256u /* max_cached_size*/) { VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " "all cache!"; interpretercore_info_cache.Finalize(); @@ -325,14 +325,14 @@ std::shared_ptr CreateProgramInterpreterCoreInfoToCache( } std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( - std::unique_ptr<::ir::Program> ir_program, + std::unique_ptr<::pir::Program> ir_program, const platform::Place &place, bool is_grad, int64_t program_id, framework::Scope *scope) { auto &interpretercore_info_cache = framework::InterpreterCoreInfoCache::Instance(); - if (interpretercore_info_cache.Size() > 10u /* max_cached_size*/) { + if (interpretercore_info_cache.Size() > 256u /* max_cached_size*/) { VLOG(2) << "The cached info size has exceeded max_cached_size: 4, clear " "all cache!"; interpretercore_info_cache.Finalize(); @@ -352,14 +352,15 @@ std::shared_ptr CreateNewIRInterpreterCoreInfoToCache( return core; } -std::unique_ptr<::ir::Program> ConstructFowardIrProgram( +std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc *forward_global_block, const paddle::framework::BlockDesc *backward_global_block, const std::vector output_names, const std::vector &x, - const std::vector ¶ms) { - auto ir_ctx = ::ir::IrContext::Instance(); - auto program = std::make_unique<::ir::Program>(ir_ctx); + const std::vector ¶ms, + const phi::Place &place) { + auto ir_ctx = ::pir::IrContext::Instance(); + auto program = 
std::make_unique<::pir::Program>(ir_ctx); std::set set_output_names; auto local_program = @@ -381,14 +382,14 @@ std::unique_ptr<::ir::Program> ConstructFowardIrProgram( if (block->FindVarRecursive(name) == nullptr) { continue; } - auto place = in_t.place().GetType(); + auto p = in_t.place().GetType(); auto op_desc = block->PrependOp(); op_desc->SetType("data"); op_desc->SetAttr("shape", std::vector()); // TODO(phlrain) : using tensor dtype op_desc->SetAttr("dtype", 0); - op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("place", static_cast(p)); op_desc->SetAttr("name", name); op_desc->SetOutput("out", {name}); } @@ -396,14 +397,14 @@ std::unique_ptr<::ir::Program> ConstructFowardIrProgram( std::set input_param_names; for (auto ¶m : params) { auto &name = param.name(); - auto place = param.place().GetType(); + auto p = param.place().GetType(); auto op_desc = local_program.MutableBlock(0)->PrependOp(); op_desc->SetType("data"); op_desc->SetAttr("shape", std::vector()); // TODO(phlrain) : using tensor dtype op_desc->SetAttr("dtype", 0); - op_desc->SetAttr("place", static_cast(place)); + op_desc->SetAttr("place", static_cast(p)); op_desc->SetAttr("name", name); op_desc->SetOutput("out", {name}); @@ -445,25 +446,25 @@ std::unique_ptr<::ir::Program> ConstructFowardIrProgram( program_translator.Translate(); - auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get()); + auto ir_res = paddle::dialect::PdOpLowerToKernelPass(program.get(), place); if (FLAGS_new_ir_apply_inplace_pass) { - ::ir::PassManager pm(::ir::IrContext::Instance(), 3); - pm.AddPass(::ir::CreateInplacePass()); + ::pir::PassManager pm(::pir::IrContext::Instance(), 3); + pm.AddPass(::pir::CreateInplacePass()); pm.Run(ir_res.get()); } return ir_res; } -std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( +std::unique_ptr<::pir::Program> ConstructBackwardIrProgram( const paddle::framework::BlockDesc *backward_global_block, const std::vector &out_grad, const std::vector &x_grad, const std::vector ¶ms_grad, const paddle::framework::Scope *scope) { - auto ir_ctx = ::ir::IrContext::Instance(); - auto program = std::make_unique<::ir::Program>(ir_ctx); + auto ir_ctx = ::pir::IrContext::Instance(); + auto program = std::make_unique<::pir::Program>(ir_ctx); auto local_program = paddle::framework::ProgramDesc(*(backward_global_block->Program())); @@ -527,8 +528,8 @@ std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( auto res = paddle::dialect::PdOpLowerToKernelPass(program.get()); if (FLAGS_new_ir_apply_inplace_pass) { - ::ir::PassManager pm(::ir::IrContext::Instance(), 3); - pm.AddPass(::ir::CreateInplacePass()); + ::pir::PassManager pm(::pir::IrContext::Instance(), 3); + pm.AddPass(::pir::CreateInplacePass()); pm.Run(res.get()); } diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index edbbc0e9420af..1c5602a31f872 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -30,9 +30,9 @@ #include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" -#include "paddle/ir/core/dialect.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/program.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" PHI_DECLARE_bool(enable_new_ir_in_executor); namespace paddle { @@ -243,20 +243,21 @@ std::shared_ptr CreateProgramInterpreterCoreInfoToCache( framework::Scope* scope); std::shared_ptr 
CreateNewIRInterpreterCoreInfoToCache( - std::unique_ptr<::ir::Program> ir_prog, + std::unique_ptr<::pir::Program> ir_prog, const platform::Place& place, bool is_grad, int64_t program_id, framework::Scope* scope); -std::unique_ptr<::ir::Program> ConstructFowardIrProgram( +std::unique_ptr<::pir::Program> ConstructFowardIrProgram( const paddle::framework::BlockDesc* forward_global_block, const paddle::framework::BlockDesc* backward_global_block, const std::vector output_names, const std::vector& x, - const std::vector& params); + const std::vector& params, + const phi::Place& place); -std::unique_ptr<::ir::Program> ConstructBackwardIrProgram( +std::unique_ptr<::pir::Program> ConstructBackwardIrProgram( const paddle::framework::BlockDesc* backward_global_block, const std::vector& out_grad, const std::vector& x_grad, diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index fa63b4bca16ea..27342b123d6e9 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" +#include "paddle/fluid/operators/controlflow/pylayer_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/platform/enforce.h" @@ -226,6 +227,8 @@ GetEagerDeletionCleanVarsForPartial(const ProgramDesc &origin_program, auto global_block_ops = CreateOpsFromBlock(program.Block(0)); operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( program, 0, global_block_ops); + operators::PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + program, 0, global_block_ops); operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( program, 0, global_block_ops); operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( @@ -240,35 +243,54 @@ GetEagerDeletionCleanVarsForPartial(const ProgramDesc &origin_program, const char *kSubBlock = "sub_block"; const char *kSkipEagerDeletionVars = "skip_eager_deletion_vars"; + // NOTE: pylayer op contains may contain two blocks: forward block and + // backward block + const char *kBlocks = "blocks"; for (size_t i = 0; i < block_num; ++i) { const auto &block = program.Block(i); size_t op_num = block.OpSize(); for (size_t j = 0; j < op_num; ++j) { auto *op = block.Op(static_cast(j)); - if (!op->HasAttr(kSubBlock) || !op->HasAttr(kSkipEagerDeletionVars)) { + if ((!op->HasAttr(kSubBlock) && !op->HasAttr(kBlocks)) || + !op->HasAttr(kSkipEagerDeletionVars)) { continue; } - auto sub_block_id = op->GetAttrIfExists(kSubBlock)->ID(); - PADDLE_ENFORCE_GE(sub_block_id, - 0, - platform::errors::PermissionDenied( - "sub_block id must be non-negative number")); - PADDLE_ENFORCE_LT(sub_block_id, - block_num, - platform::errors::PermissionDenied( - "sub_block id exceeds max block num")); - PADDLE_ENFORCE_EQ( - found_skip_vars[sub_block_id], - false, - platform::errors::PermissionDenied( - "there are 2 ops which refer to the same sub_block %d", - sub_block_id)); - - found_skip_vars[sub_block_id] = true; - auto sub_block_skip_vars = - op->GetAttrIfExists>(kSkipEagerDeletionVars); - skip_vars_on_each_block[sub_block_id] = std::move(sub_block_skip_vars); + + std::vector sub_block_ids; + if (op->HasAttr(kSubBlock)) { + sub_block_ids.push_back( + op->GetAttrIfExists(kSubBlock)->ID()); + } else if 
(op->HasAttr(kBlocks)) { + const auto &blocks = + op->GetAttrIfExists>(kBlocks); + for (const auto &block : blocks) { + sub_block_ids.push_back(block->ID()); + } + } + + for (auto sub_block_id : sub_block_ids) { + PADDLE_ENFORCE_GE(sub_block_id, + 0, + platform::errors::PermissionDenied( + "sub_block id must be non-negative number")); + PADDLE_ENFORCE_LT(sub_block_id, + block_num, + platform::errors::PermissionDenied( + "sub_block id exceeds max block num")); + PADDLE_ENFORCE_EQ( + found_skip_vars[sub_block_id], + false, + platform::errors::PermissionDenied( + "there are 2 ops which refer to the same sub_block %d", + sub_block_id)); + + found_skip_vars[sub_block_id] = true; + auto sub_block_skip_vars = + op->GetAttrIfExists>( + kSkipEagerDeletionVars); + skip_vars_on_each_block[sub_block_id] = std::move(sub_block_skip_vars); + } } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b6143f335d163..526847bb32de5 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,7 +46,7 @@ cc_library( cc_library( op_compat_sensible_pass SRCS op_compat_sensible_pass.cc - DEPS graph_pattern_detector op_def_api pass) + DEPS graph_pattern_detector op_def_api pass pir_core) cc_library( subgraph_detector SRCS subgraph_detector.cc @@ -156,6 +156,8 @@ if(WITH_TENSORRT) pass_library(preln_elementwise_groupnorm_act_pass inference) pass_library(groupnorm_act_pass inference) pass_library(trans_layernorm_fuse_pass inference) + pass_library(trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass + inference) pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) pass_library(split_layernorm_to_math_ops_pass inference) @@ -241,8 +243,9 @@ if(WITH_XPU) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(cast_embedding_trans_ids_to_int32_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) - pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + # pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 17d2bdda56cb9..e0ab584ee3225 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" namespace paddle { @@ -47,6 +49,12 @@ class element_visitor { int index_; }; +template <> +Attribute element_visitor::operator()( + const std::vector<::pir::Value>& attr UNUSED) const { + PADDLE_THROW(platform::errors::Unimplemented("Unimplemented operand.")); +} + class operation_visitor { public: explicit operation_visitor(const proto::PassDesc::OperationType& type) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 3e744e18bf6c8..b322e3f8bce28 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ 
b/paddle/fluid/framework/ir/graph_helper.cc @@ -704,10 +704,18 @@ static void GetGraphOpDesc(const std::vector<Node *> &nodes, ops->emplace_back(depend_desc); VLOG(4) << "add depend op"; } - if (n->Name() == "while" || n->Name() == "while_grad" || - n->Name() == "conditional_block" || - n->Name() == "conditional_block_grad" || n->Name() == "recurrent" || - n->Name() == "recurrent_grad") { + + const std::unordered_set<std::string> control_flow_ops = { + "while", + "while_grad", + "conditional_block", + "conditional_block_grad", + "recurrent", + "recurrent_grad", + "pylayer", + "pylayer_grad"}; + + if (control_flow_ops.count(n->Name())) { VLOG(1) << "Update control op attr: skip_eager_deletion_vars"; UpdateControlOpSkipEagerDeletionVars(*n, graph, graph_idx, n->Name()); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index ffb1606b95ccd..1e634343c7fc1 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -6,6 +6,10 @@ cc_library( conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle) +cc_library( + pylayer_op_eager_deletion_pass + SRCS pylayer_op_eager_deletion_pass.cc + DEPS pylayer_op_helper graph_helper pass computation_op_handle) cc_library( while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc @@ -31,6 +35,7 @@ set(EAGER_DELETETION_PASS_DEPS graph_helper pass conditional_block_op_eager_deletion_pass + pylayer_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc index b0729abfcf883..40525a14141a6 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc @@ -294,6 +294,10 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { "conditional_block_op_eager_deletion_pass"); conditional_block_op_eager_deletion_pass->Apply(graph); + auto pylayer_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("pylayer_op_eager_deletion_pass"); + pylayer_op_eager_deletion_pass->Apply(graph); + auto while_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); while_op_eager_deletion_pass->Apply(graph); @@ -321,6 +325,7 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass) .RequirePassAttr(paddle::framework::ir::kGarbageCollector); USE_PASS(conditional_block_op_eager_deletion_pass); +USE_PASS(pylayer_op_eager_deletion_pass); USE_PASS(while_op_eager_deletion_pass); USE_PASS(recurrent_op_eager_deletion_pass); #ifdef PADDLE_WITH_CINN diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/pylayer_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/pylayer_op_eager_deletion_pass.cc new file mode 100644 index 0000000000000..6d2fe78ea1d12 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/pylayer_op_eager_deletion_pass.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" +#include "paddle/fluid/operators/controlflow/pylayer_op_helper.h" +namespace paddle { +namespace framework { +namespace ir { +using OpVariant = operators::OpVariant; +class PyLayerOpEagerDeletionPass : public Pass { + protected: + void ApplyImpl(Graph *graph) const override { + auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph); + + // Find all pylayer_op and pylayer_grad_op + std::unordered_map< + size_t, + std::pair<std::vector<OpVariant>, std::vector<OpVariant>>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "pylayer") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "pylayer_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + // NOTE(Aurelius84): In case of @to_static, after we finish executing the + // forward graph, some necessary variables in the step_scope of pylayer_op + // should be kept for the backward graph. + if (graph->IsConstructedByPartialProgram()) { + PADDLE_ENFORCE_LE(target_ops.size(), + 1, + platform::errors::InvalidArgument( + "Unsupported multi devices if graph is constructed " + "with partial program.")); + size_t scope_idx = 0; + auto &pylayer_ops = target_ops[scope_idx].first; + auto &pylayer_grad_ops = target_ops[scope_idx].second; + + auto all_ops = graph->OriginProgram().Block(0).AllOps(); + if (pylayer_ops.empty()) { + operators::AppendOpVariantByOpName( + all_ops, std::string("pylayer"), &pylayer_ops); + } else if (pylayer_grad_ops.empty()) { + operators::AppendOpVariantByOpName( + all_ops, std::string("pylayer_grad"), &pylayer_grad_ops); + } else { + PADDLE_THROW("One of pylayer_ops or pylayer_grad_ops should be empty."); + } + } + + for (auto &ops_pair : target_ops) { + auto &pylayer_ops = ops_pair.second.first; + auto &pylayer_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + graph->OriginProgram(), pylayer_ops, pylayer_grad_ops); + } + + for (auto op_handler : all_ops) { + auto *compute_op = + dynamic_cast<details::ComputationOpHandle *>(op_handler); + if (compute_op == nullptr) continue; + if (compute_op->Name() == "pylayer" || + compute_op->Name() == "pylayer_grad") { + ir::Node *op_node = op_handler->Node(); + auto *op_base = compute_op->GetOp(); + if (op_base->Attrs().count("skip_eager_deletion_vars")) { + op_node->Op()->SetAttr( + "skip_eager_deletion_vars", + op_base->Attrs().at("skip_eager_deletion_vars")); + } + } + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(pylayer_op_eager_deletion_pass, + paddle::framework::ir::PyLayerOpEagerDeletionPass); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc index b404d023d487b..704f59bbace67 100644 ---
a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -23,15 +23,18 @@ namespace framework { namespace ir { namespace patterns { void EmbEltwiseLayernorm::operator()() { - // Create nodes for fused_embedding_eltwise_layernorm. - auto* emb_elt_layernorm_op = - pattern->NewNode(emb_elt_layernorm_op_repr()) - ->assert_is_op("fused_embedding_eltwise_layernorm"); + // Create nodes for fused_embedding_eltwise_layernorm or + // prompt_tuning_emb_eltwise_layernorm. + std::unordered_set<std::string> embedding_ops{ + "fused_embedding_eltwise_layernorm", + "prompt_tuning_emb_eltwise_layernorm"}; + auto* emb_elt_layernorm_op = pattern->NewNode(emb_elt_layernorm_op_repr()) + ->assert_is_ops(embedding_ops); auto* emb_elt_layernorm_out = pattern->NewNode(emb_elt_layernorm_out_repr()) - ->assert_is_op_output("fused_embedding_eltwise_layernorm", "Out"); + ->assert_is_ops_output(embedding_ops, "Out"); - // Add links for fused_embedding_eltwise_layernorm op. + // Add links for embedding_ops. emb_elt_layernorm_op->LinksTo({emb_elt_layernorm_out}); } diff --git a/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc new file mode 100644 index 0000000000000..6bdd56dff2087 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.cc @@ -0,0 +1,596 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
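+// This new pass stitches together three sub-patterns: (1) two embedding
+// lookups summed by one elementwise_add, (2) further lookups chained onto
+// that running sum, and (3) a two-layer FC branch (matrix_multiply ->
+// elementwise_add -> relu -> matrix_multiply -> elementwise_add) whose
+// output is concatenated with the embedding sum and layer-normalized.
+// The matched subgraph is replaced by one fused op; a minimal sketch of
+// the OpDesc built in BuildFusion below (names here are illustrative):
+//   OpDesc fused(block);
+//   fused.SetType("prompt_tuning_emb_eltwise_layernorm");
+//   fused.SetInput("Ids", ids);              // lookup_table Ids, in chain order
+//   fused.SetInput("Embs", embs);            // matching embedding weights
+//   fused.SetInput("DenseVector", {dense});  // output of the FC branch
+//   fused.SetOutput("Out", {layer_norm_out});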
+ +#include "paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +static PDNode* create_emb_vars(PDPattern* pattern, + const std::string& name, + const std::string& arg, + bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); + if (is_persist) return node->assert_is_persistable_var(); + return node; +} +static PDNode* create_emb_out_vars(PDPattern* pattern, + const std::string& name, + const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + PDNode* node = pattern->NewNode(name) + ->assert_is_only_output_of_ops(embedding_ops) + ->assert_is_op_input("elementwise_add", arg) + ->AsIntermediate(); + return node; +} +void TrtPromptTuningEmbedding2Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table2_x = + create_emb_vars(pattern, lookup_table2_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + auto* lookup_table2_w = + create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table2 = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); + auto* lookup_table2_out = + create_emb_out_vars(pattern, lookup_table2_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + lookup_table2->LinksFrom({lookup_table2_x, lookup_table2_w}) + .LinksTo({lookup_table2_out}); + eltwise_add->LinksFrom({lookup_table1_out, lookup_table2_out}) + .LinksTo({eltwise_add_out}); +} +void TrtPromptTuningEmbedding1Eltwise1Pattern::operator()() { + auto* lookup_table1_x = + create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); + auto* lookup_table1_w = + create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; + + auto* lookup_table1 = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); + auto* lookup_table1_out = + create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_in = pattern->NewNode(eltwise_add_in_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_is_op_output("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add"); + lookup_table1->LinksFrom({lookup_table1_x, lookup_table1_w}) + .LinksTo({lookup_table1_out}); + eltwise_add->LinksFrom({lookup_table1_out, eltwise_add_in}) + .LinksTo({eltwise_add_out}); +} +void 
TrtPromptTuningSkipLayerNorm::operator()() { + auto* eltwise_add = + pattern->NewNode(eltwise_add_repr())->assert_is_op("elementwise_add"); + auto* eltwise_add_out = pattern->NewNode(eltwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto* mul0_x = pattern->NewNode(mul0_x_repr()) + ->assert_is_op_input("matrix_multiply", "X"); + + auto* mul0_y = pattern->NewNode(mul0_y_repr()) + ->assert_is_op_input("matrix_multiply", "Y"); + + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matrix_multiply"); + + auto* mul0_out = pattern->NewNode(mul0_out_repr()) + ->assert_is_op_output("matrix_multiply") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* eltadd0_b = pattern->NewNode(eltadd0_b_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + + auto* eltadd0_out = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("relu") + ->AsIntermediate(); + + auto* relu = pattern->NewNode(relu_repr())->assert_is_op("relu"); + auto* relu_out = pattern->NewNode(relu_out_repr()) + ->assert_is_op_output("relu") + ->assert_is_op_input("matrix_multiply", "X") + ->AsIntermediate(); + + auto* mul1_y = pattern->NewNode(mul1_y_repr()) + ->assert_is_op_input("matrix_multiply", "Y"); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matrix_multiply"); + + auto* mul1_out = pattern->NewNode(mul1_out_repr()) + ->assert_is_op_output("matrix_multiply") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto* eltadd1_b = pattern->NewNode(eltadd1_b_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto* eltadd1 = + pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + + auto* eltadd1_out = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + + auto* concat = pattern->NewNode(concat_repr())->assert_is_op("concat"); + + auto* concat_out = pattern->NewNode(concat_out_repr()) + ->assert_is_op_output("concat") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr()) + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + + eltwise_add->LinksTo({eltwise_add_out}); + + mul0->LinksFrom({mul0_x, mul0_y}).LinksTo({mul0_out}); + + eltadd0->LinksFrom({mul0_out, eltadd0_b}).LinksTo({eltadd0_out}); + + relu->LinksFrom({eltadd0_out}).LinksTo({relu_out}); + + mul1->LinksFrom({relu_out, mul1_y}).LinksTo({mul1_out}); + + eltadd1->LinksFrom({mul1_out, eltadd1_b}).LinksTo({eltadd1_out}); + + concat->LinksFrom({eltadd1_out, eltwise_add_out}).LinksTo({concat_out}); + + layer_norm->LinksFrom({concat_out, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo({layer_norm_out}); +} + +} // namespace patterns + +int TrtPromptTuningEmbeddingEltwiseLayerNormFusePass::BuildFusion( + Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + std::string pos_id = 
Get("tensorrt_transformer_posid"); + std::string mask_id = Get("tensorrt_transformer_maskid"); + std::vector>> start_pattern_in_nodes; + std::vector start_pattern_out_node; + std::vector> start_pattern_remove_nodes; + + // Create pattern. + patterns::TrtPromptTuningEmbedding2Eltwise1Pattern start_pattern( + pattern, name_scope + "/start"); + start_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_w, lookup_table2_w, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table2, lookup_table2, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + lookup_table1_out, lookup_table1_out, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + lookup_table2_out, lookup_table2_out, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern); + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtPromptTuningEmbedding2Eltwise1Pattern) in op " + "compat failed."; + return; + } + std::vector> ins; + ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w)); + ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w)); + start_pattern_in_nodes.push_back(ins); + start_pattern_out_node.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert({lookup_table1, + lookup_table2, + lookup_table1_out, + lookup_table2_out, + eltwise_add, + eltwise_add_out}); + start_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd(graph, handler); + + std::vector> inner_pattern_ins; + std::vector inner_pattern_tmp_in; + std::vector inner_pattern_out; + std::vector> inner_pattern_remove_nodes; + + GraphPatternDetector gpd2; + auto* pattern2 = gpd2.mutable_pattern(); + patterns::TrtPromptTuningEmbedding1Eltwise1Pattern second_pattern( + pattern2, name_scope + "/second"); + second_pattern(); + auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_x, lookup_table1_x, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1_w, lookup_table1_w, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table1, lookup_table1, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + lookup_table1_out, lookup_table1_out, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtPromptTuningEmbedding1Eltwise1Pattern) in op " + "compat failed."; + return; + } + auto in = std::make_pair(lookup_table1_x, lookup_table1_w); + inner_pattern_ins.push_back(in); + inner_pattern_tmp_in.push_back(eltwise_add_in); + inner_pattern_out.push_back(eltwise_add_out); + + std::unordered_set rm_nodes; + rm_nodes.insert( + {lookup_table1, lookup_table1_out, eltwise_add, eltwise_add_out}); + inner_pattern_remove_nodes.push_back(rm_nodes); + }; + gpd2(graph, handler2); + + std::vector end_pattern_elt_out; + std::vector end_pattern_eltadd1; + std::vector end_pattern_eltadd1_out; + std::vector end_pattern_concat; + std::vector 
end_pattern_concat_out; + std::vector end_pattern_scales; + std::vector end_pattern_biases; + std::vector end_pattern_out; + std::vector end_patter_layernorms; + std::vector> end_pattern_remove_nodes; + GraphPatternDetector gpd3; + auto* pattern3 = gpd3.mutable_pattern(); + patterns::TrtPromptTuningSkipLayerNorm skip_layernorm_pattern( + pattern3, name_scope + "/third"); + skip_layernorm_pattern(); + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltwise_add_out, eltwise_add_out, skip_layernorm_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(concat, concat, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_out, layer_norm_out, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_bias, layer_norm_bias, skip_layernorm_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_scale, layer_norm_scale, skip_layernorm_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(TrtPromptTuningSkipLayerNorm) in op compat failed."; + return; + } + end_pattern_elt_out.push_back(eltwise_add_out); + std::unordered_set rm_nodes; + rm_nodes.insert({concat}); + rm_nodes.insert({concat_out}); + rm_nodes.insert({layer_norm}); + end_pattern_remove_nodes.push_back(rm_nodes); + + end_pattern_eltadd1.push_back(eltadd1); + end_pattern_eltadd1_out.push_back(eltadd1_out); + end_pattern_concat.push_back(concat); + end_pattern_concat_out.push_back(concat_out); + end_pattern_biases.push_back(layer_norm_bias); + end_pattern_scales.push_back(layer_norm_scale); + end_pattern_out.push_back(layer_norm_out); + end_patter_layernorms.push_back(layer_norm); + }; + gpd3(graph, handler3); + + if (start_pattern_in_nodes.empty() || end_pattern_elt_out.empty()) { + return 0; + } + // only reserve the subgraphs that in connected domains. + int fusion_count = 0; + // fusion_id for (i, k, js) + std::vector>>> + fusion_ids; + for (size_t i = 0; i < start_pattern_in_nodes.size(); ++i) { + Node* tmp = start_pattern_out_node[i]; + Node* old_tmp = nullptr; + // get correct inner pattern node order. 
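+    // Starting from each start-pattern output, follow the inner pattern
+    // whose chained input matches the current node until none matches;
+    // `js` records the visited inner-pattern indices in chain order, so
+    // the walk ends at a node that should feed one of the end patterns.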
+ std::vector js; + while (tmp != old_tmp) { + old_tmp = tmp; + for (size_t j = 0; j < inner_pattern_tmp_in.size(); ++j) { + if (inner_pattern_tmp_in[j] == tmp) { + tmp = inner_pattern_out[j]; + js.push_back(j); + break; + } + } + } + + for (size_t k = 0; k < end_pattern_elt_out.size(); ++k) { + if (tmp == end_pattern_elt_out[k]) { + fusion_ids.push_back(std::make_pair(i, std::make_pair(k, js))); + break; + } + } + } + + for (auto& fusion_id : fusion_ids) { + int i = fusion_id.first; + int k = fusion_id.second.first; + std::vector js = fusion_id.second.second; + + std::vector ids; + std::vector embs; + + auto ids0_shape = start_pattern_in_nodes[i][0].first->Var()->GetShape(); + bool flag = true; + for (auto& item : start_pattern_in_nodes[i]) { + auto ids_shape = item.first->Var()->GetShape(); + if (ids_shape.size() != ids0_shape.size()) { + VLOG(3) << "Shape check failed, ids'rank are not all equal, stop " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass."; + flag = false; + } else { + for (size_t j = 0; j < ids_shape.size(); ++j) { + if (ids_shape[j] != ids0_shape[j]) { + VLOG(3) + << "Shape check failed, ids.shape[i] are not all equal, stop " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass."; + flag = false; + } + } + } + ids.push_back(item.first->Name()); + embs.push_back(item.second->Name()); + } + for (auto item : js) { + auto ids_shape = inner_pattern_ins[item].first->Var()->GetShape(); + if (ids_shape.size() != ids0_shape.size()) { + VLOG(3) << "Shape check failed, ids'rank are not all equal, stop " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass."; + flag = false; + } else { + for (size_t j = 0; j < ids_shape.size(); ++j) { + if (ids_shape[j] != ids0_shape[j]) { + VLOG(3) + << "Shape check failed, ids.shape[i] are not all equal, stop " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass."; + flag = false; + } + } + } + ids.push_back(inner_pattern_ins[item].first->Name()); + embs.push_back(inner_pattern_ins[item].second->Name()); + } + + if (flag) { + OpDesc new_op_desc(end_patter_layernorms[0]->Op()->Block()); + new_op_desc.SetType("prompt_tuning_emb_eltwise_layernorm"); + new_op_desc.SetInput("Ids", ids); + new_op_desc.SetInput("Embs", embs); + new_op_desc.SetInput("PosId", {pos_id}); + new_op_desc.SetInput("MaskId", {mask_id}); + + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); + new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); + new_op_desc.SetInput("DenseVector", {end_pattern_eltadd1_out[k]->Name()}); + new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); + new_op_desc.SetAttr("epsilon", + end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + new_op_desc.SetAttr( + "out_threshold", + end_patter_layernorms[k]->Op()->GetAttr("out_threshold")); + } + + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); + + for (auto& item : start_pattern_in_nodes[i]) { + IR_NODE_LINK_TO(item.first, embedding_eltwise_layernorm); + IR_NODE_LINK_TO(item.second, embedding_eltwise_layernorm); + } + for (auto item : js) { + IR_NODE_LINK_TO(inner_pattern_ins[item].first, + embedding_eltwise_layernorm); + IR_NODE_LINK_TO(inner_pattern_ins[item].second, + embedding_eltwise_layernorm); + } + IR_NODE_LINK_TO(end_pattern_biases[k], embedding_eltwise_layernorm); + IR_NODE_LINK_TO(end_pattern_scales[k], embedding_eltwise_layernorm); + IR_NODE_LINK_TO(end_pattern_eltadd1_out[k], 
embedding_eltwise_layernorm); + IR_NODE_LINK_TO(embedding_eltwise_layernorm, end_pattern_out[k]); + + // Remove unneeded nodes. + std::unordered_set marked_nodes; + marked_nodes.insert(start_pattern_remove_nodes[i].begin(), + start_pattern_remove_nodes[i].end()); + marked_nodes.insert(end_pattern_remove_nodes[k].begin(), + end_pattern_remove_nodes[k].end()); + for (auto item : js) { + marked_nodes.insert(inner_pattern_remove_nodes[item].begin(), + inner_pattern_remove_nodes[item].end()); + } + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + } else { + VLOG(3) << "Shape check failed, stop " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass."; + } + } + return fusion_count; +} + +TrtPromptTuningEmbeddingEltwiseLayerNormFusePass:: + TrtPromptTuningEmbeddingEltwiseLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); +} + +void TrtPromptTuningEmbeddingEltwiseLayerNormFusePass::ApplyImpl( + Graph* graph) const { + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!with_dynamic_shape) { + VLOG(3) << "Stop this pass, because " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass need: " + "use_varseqlen, " + "with_dynamic_shape." 
+ " Please reconfigure."; + return; + } + FusePassBase::Init(name_scope_, graph); + int fusion_count = + TrtPromptTuningEmbeddingEltwiseLayerNormFusePass::BuildFusion( + graph, name_scope_); + if (fusion_count > 0) { + bool use_varseqlen = Get<bool>("use_varseqlen"); + std::string pos_id = Get<std::string>("tensorrt_transformer_posid"); + std::string mask_id = Get<std::string>("tensorrt_transformer_maskid"); + + if ((use_varseqlen && !pos_id.empty() && !mask_id.empty())) { + VLOG(3) + << "start trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass"; + } else { + VLOG(3) << "Stop this pass, because " + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass only " + "supports use_varseqlen, please reconfigure"; + return; + } + graph->Set(kEmbEltwiseLayernormPass, new bool(true)); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS( + trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass, + paddle::framework::ir::TrtPromptTuningEmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY( + trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("lookup_table", 1) + .LE("lookup_table_v2", 1) + .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.h new file mode 100644 index 0000000000000..16fd38b1abed6 --- /dev/null +++ b/paddle/fluid/framework/ir/trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass.h @@ -0,0 +1,118 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
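+// Each PATTERN_DECL_NODE(name) in this header declares the repr accessor
+// that GET_IR_NODE_FROM_SUBGRAPH uses in the .cc file; roughly (a
+// simplified sketch of the macro from graph_pattern_detector.h):
+//   std::string name_repr() const {
+//     return PDNodeName(name_scope_, repr_, id_, "name");
+//   }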
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct TrtPromptTuningEmbedding2Eltwise1Pattern : public PatternBase { + TrtPromptTuningEmbedding2Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding2_eltwise1") {} + + void operator()(); + PATTERN_DECL_NODE(feed1); + PATTERN_DECL_NODE(feed2); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +struct TrtPromptTuningEmbedding1Eltwise1Pattern : public PatternBase { + TrtPromptTuningEmbedding1Eltwise1Pattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding1_eltwise1") {} + void operator()(); + PATTERN_DECL_NODE(feed1); + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(eltwise_add_in); + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); +}; + +struct TrtPromptTuningSkipLayerNorm : public PatternBase { + TrtPromptTuningSkipLayerNorm(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + void operator()(); + + PATTERN_DECL_NODE(eltwise_add); + PATTERN_DECL_NODE(eltwise_add_out); + PATTERN_DECL_NODE(mul0_x); + PATTERN_DECL_NODE(mul0_y); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(eltadd0_b); + PATTERN_DECL_NODE(eltadd0); + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(relu); + PATTERN_DECL_NODE(relu_out); + PATTERN_DECL_NODE(mul1_y); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(eltadd1_b); + PATTERN_DECL_NODE(eltadd1); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(concat); + PATTERN_DECL_NODE(concat_out); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); +}; +} // namespace patterns + +class TrtPromptTuningEmbeddingEltwiseLayerNormFusePass : public FusePassBase { + public: + TrtPromptTuningEmbeddingEltwiseLayerNormFusePass(); + virtual ~TrtPromptTuningEmbeddingEltwiseLayerNormFusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + int BuildFusion(Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const; + const std::string name_scope_{ + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc new file mode 100644 index 0000000000000..7f53507a85c83 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.cc @@ -0,0 +1,339 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.h" + +#include "glog/logging.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FcBiasPattern : public PatternBase { + FcBiasPattern(PDPattern* pattern, + const std::string& name_scope, + const std::string& mul_type); + + // declare operator node's name + PATTERN_DECL_NODE(ew_bias_add); + // declare variable node's name + PATTERN_DECL_NODE(mul_out); + PATTERN_DECL_NODE(ew_bias_add_x); + PATTERN_DECL_NODE(ew_bias_add_out); + + private: + std::string mul_type_; +}; + +FcBiasPattern::FcBiasPattern(PDPattern* pattern, + const std::string& name_scope, + const std::string& mul_type) + : PatternBase(pattern, name_scope, name_scope), mul_type_(mul_type) { + auto* mul_out = pattern->NewNode(mul_out_repr()) + ->assert_is_op_output(mul_type_, "Out") + ->assert_is_op_input("elementwise_add", "Y") + ->assert_has_n_outputs(1); + auto* ew_bias_add = pattern->NewNode(ew_bias_add_repr()) + ->assert_is_op("elementwise_add") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axis = op_desc->GetAttrIfExists("axis"); + return axis == -1; + }); + auto* ew_bias_add_x = pattern->NewNode(ew_bias_add_x_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_is_persistable_var() + ->assert_has_n_outputs(1); + auto* ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + ew_bias_add->LinksFrom({mul_out, ew_bias_add_x}).LinksTo({ew_bias_add_out}); +} + +struct Conv2dBiasPattern : public PatternBase { + Conv2dBiasPattern(PDPattern* pattern, const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(ew_bias_add); + // declare variable node's name + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(ew_bias_add_y); + PATTERN_DECL_NODE(ew_bias_add_out); +}; + +Conv2dBiasPattern::Conv2dBiasPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* x = pattern->NewNode(x_repr()) + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("elementwise_add", "X") + ->assert_has_n_outputs(1); + auto* ew_bias_add = pattern->NewNode(ew_bias_add_repr()) + ->assert_is_op("elementwise_add") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axis = op_desc->GetAttrIfExists("axis"); + return axis == -1; + }); + auto* ew_bias_add_y = pattern->NewNode(ew_bias_add_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_persistable_var() + ->assert_has_n_outputs(1) + ->assert_more([](Node* node) { + auto y_shape = node->Var()->GetShape(); + size_t y_rank = 
y_shape.size(); + return y_rank == 4 && y_shape[0] == 1 && + y_shape[2] == 1 && y_shape[3] == 1; + }); + auto* ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + ew_bias_add->LinksFrom({x, ew_bias_add_y}).LinksTo({ew_bias_add_out}); +} + +struct ScaleFusePattern : public PatternBase { + ScaleFusePattern(PDPattern* pattern, const std::string& name_scope); + // declare operator node's name + PATTERN_DECL_NODE(ele_mul); + PATTERN_DECL_NODE(ele_add); + // declare variable node's name + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(ele_mul_y); + PATTERN_DECL_NODE(ele_mul_out); + PATTERN_DECL_NODE(ele_add_y); + PATTERN_DECL_NODE(ele_add_out); +}; + +ScaleFusePattern::ScaleFusePattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + // ele_mul op + auto ele_mul = + pattern->NewNode(ele_mul_repr())->assert_is_op("elementwise_mul"); + auto x = pattern->NewNode(x_repr()) + ->assert_is_op_input("elementwise_mul", "X") + ->AsInput(); + auto ele_mul_y = pattern->NewNode(ele_mul_y_repr()) + ->assert_is_op_input("elementwise_mul", "Y") + ->assert_is_persistable_var() + ->assert_has_n_outputs(1) + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 1; + }); + auto ele_mul_out = pattern->NewNode(ele_mul_out_repr()) + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->assert_has_n_outputs(1); + ele_mul->LinksFrom({x, ele_mul_y}).LinksTo({ele_mul_out}); + // ele_add op + auto ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + auto ele_add_y = pattern->NewNode(ele_add_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->assert_is_persistable_var() + ->assert_has_n_outputs(1) + ->assert_more([](Node* node) { + return node->Var()->GetShape().size() == 1; + }); + auto ele_add_out = pattern->NewNode(ele_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + ele_add->LinksFrom({ele_mul_out, ele_add_y}).LinksTo({ele_add_out}); +} + +} // namespace patterns + +void Conv2dBiasFusePass::TransFcBias(ir::Graph* graph, + const std::string& mul_type) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::FcBiasPattern pattern(gpd.mutable_pattern(), name_scope_, mul_type); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle TransFcBias fuse"; + // declare operator node's name + GET_IR_NODE(ew_bias_add); + // declare variable node's name + GET_IR_NODE(mul_out); + GET_IR_NODE(ew_bias_add_x); + GET_IR_NODE(ew_bias_add_out); + + // trans link order of x && y for ew_bias_add op + auto ew_bias_add_desc = ew_bias_add->Op(); + IR_NODE_UNLINK(mul_out, ew_bias_add); + IR_NODE_UNLINK(ew_bias_add_x, ew_bias_add); + ew_bias_add_desc->RemoveInput("X"); + ew_bias_add_desc->RemoveInput("Y"); + ew_bias_add_desc->Flush(); + ew_bias_add_desc->SetInput("X", {mul_out->Name()}); + ew_bias_add_desc->SetInput("Y", {ew_bias_add_x->Name()}); + IR_OP_VAR_LINK(mul_out, ew_bias_add); + IR_OP_VAR_LINK(ew_bias_add_x, ew_bias_add); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void Conv2dBiasFusePass::FoldConv2dBias(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + 
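+  // The handler below folds elementwise_add(conv2d_out, bias) into the
+  // conv-bias form: the persistable 4-D bias of shape [1, C, 1, 1] is
+  // reshaped to a 1-D tensor of length C and the add gets attr axis=1,
+  // e.g. a [1, 64, 1, 1] bias becomes shape [64], which the conv2d_xpu
+  // fusion can then consume directly.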
Init(name_scope_, graph); + GraphPatternDetector gpd; + patterns::Conv2dBiasPattern pattern(gpd.mutable_pattern(), name_scope_); + + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle TransEwBiasAdd fuse"; + // declare operator node's name + GET_IR_NODE(ew_bias_add); + // declare variable node's name + GET_IR_NODE(x); + GET_IR_NODE(ew_bias_add_y); + GET_IR_NODE(ew_bias_add_out); + + auto* scope = param_scope(); + // resize 4D dims of ew_bias_add_y to 1-D dim + auto ew_bias_add_desc = ew_bias_add->Op(); + ew_bias_add_desc->SetAttr("axis", 1); + auto* ew_bias_add_y_desc = ew_bias_add_y->Var(); + auto y_shape = ew_bias_add_y_desc->GetShape(); + ew_bias_add_y_desc->SetShape({y_shape[1]}); + auto* ew_bias_add_y_tensor = + scope->GetVar(ew_bias_add_y->Name())->GetMutable(); + ew_bias_add_y_tensor->Resize(phi::make_ddim({y_shape[1]})); + ew_bias_add_desc->Flush(); + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void Conv2dBiasFusePass::FuseScaleOps(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::ScaleFusePattern pattern(gpd.mutable_pattern(), name_scope_); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle FuseScaleOps"; + /* declare operator node's name */ + GET_IR_NODE(ele_mul); + GET_IR_NODE(ele_add); + // declare variable node's name + GET_IR_NODE(x); + GET_IR_NODE(ele_mul_y); + GET_IR_NODE(ele_mul_out); + GET_IR_NODE(ele_add_y); + GET_IR_NODE(ele_add_out); + + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + // get attrs of scale from ele_mul && ele_add + const auto& ele_mul_y_t = + scope->GetVar(ele_mul_y->Name())->GetMutable(); + auto ele_mul_y_t_len = ele_mul_y_t->numel(); + PADDLE_ENFORCE_EQ( + ele_mul_y_t_len, + 1, + platform::errors::InvalidArgument("the size(%ld) of ele_mul y tensor " + "must equal 1", + ele_mul_y_t_len)); + const auto& ele_add_y_t = + scope->GetVar(ele_add_y->Name())->GetMutable(); + auto ele_add_y_t_len = ele_add_y_t->numel(); + PADDLE_ENFORCE_EQ( + ele_add_y_t_len, + 1, + platform::errors::InvalidArgument("the size(%ld) of ele_add y tensor " + "must equal 1", + ele_mul_y_t_len)); + auto tensor_type = ele_mul_y_t->dtype(); + float scale_val_ = 1.f; + float bias_val_ = 0.f; + if (tensor_type == phi::DataType::FLOAT16) { + CastToFp32(ele_mul_y_t, nullptr); + CastToFp32(ele_add_y_t, nullptr); + } + float* ele_mul_y_ptr = + ele_mul_y_t->mutable_data(paddle::platform::CPUPlace()); + float* ele_add_y_ptr = + ele_add_y_t->mutable_data(paddle::platform::CPUPlace()); + scale_val_ = ele_mul_y_ptr[0]; + bias_val_ = ele_add_y_ptr[0]; + // replace ele_mul+ele_add with scale + OpDesc new_desc; + new_desc.SetType("scale"); + new_desc.SetAttr("bias_after_scale", true); + new_desc.SetAttr("scale", scale_val_); + new_desc.SetAttr("bias", bias_val_); + new_desc.SetInput("X", {x->Name()}); + new_desc.SetOutput("Out", {ele_add_out->Name()}); + new_desc.Flush(); + + auto fused_node = graph->CreateOpNode(&new_desc); + IR_NODE_LINK_TO(x, fused_node); + IR_NODE_LINK_TO(fused_node, ele_add_out); + + std::unordered_set del_node_set = { + ele_mul, ele_mul_y, ele_mul_out, ele_add, ele_add_y}; + GraphSafeRemoveNodes(graph, del_node_set); + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void 
Conv2dBiasFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + // for conv2d + scale fuse + FuseScaleOps(graph); + // for conv2d + ew_bias_add + scale fuse + FoldConv2dBias(graph); + // for matmul + ew_bias_add fuse + for (auto mul_type : {"mul", "matmul", "matmul_v2"}) { + TransFcBias(graph, mul_type); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv2d_bias_fuse_pass, paddle::framework::ir::Conv2dBiasFusePass); + +REGISTER_PASS_CAPABILITY(conv2d_bias_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("mul", 0) + .LE("elementwise_add", 1)); diff --git a/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.h b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.h new file mode 100644 index 0000000000000..7d9d3fbe3154c --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/conv2d_bias_fuse_pass.h @@ -0,0 +1,66 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class Conv2dBiasFusePass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + void TransFcBias(ir::Graph* graph, const std::string& mul_type) const; + + void FoldConv2dBias(ir::Graph* graph) const; + /* + For example: + x + | + elementwise_mul + | + elementwise_add + | + out + ------------------------------------------------------ + After the pass is applied: + x + | + bias --- scale_op --- scale + | + out + */ + void FuseScaleOps(ir::Graph* graph) const; + + const std::string name_scope_{"conv2d_bias_fuse_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 697f90e38b7ce..502c275a419d3 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -46,12 +46,14 @@ struct Conv2dXPUPattern : public PatternBase { const std::string& act_type, bool with_conv_bias, bool with_bn, + bool with_scale, bool with_branch_x, bool with_branch_y); // declare operator node's name PATTERN_DECL_NODE(conv); PATTERN_DECL_NODE(ew_bias_add); PATTERN_DECL_NODE(bn); + PATTERN_DECL_NODE(scale); PATTERN_DECL_NODE(ew_branch_add); PATTERN_DECL_NODE(act); // declare variable node's name @@ -69,6 +71,7 @@ struct Conv2dXPUPattern : public PatternBase { PATTERN_DECL_NODE(bn_mean_out); PATTERN_DECL_NODE(bn_saved_var); 
PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(scale_out); PATTERN_DECL_NODE(ew_branch_add_in); PATTERN_DECL_NODE(ew_branch_add_out); PATTERN_DECL_NODE(act_out); @@ -78,6 +81,7 @@ struct Conv2dXPUPattern : public PatternBase { std::string act_type_; bool with_conv_bias_{false}; bool with_bn_{false}; + bool with_scale_{false}; bool with_branch_{false}; bool with_branch_x_{false}; bool with_branch_y_{false}; @@ -89,6 +93,7 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, const std::string& act_type, bool with_conv_bias, bool with_bn, + bool with_scale, bool with_branch_x, bool with_branch_y) : PatternBase(pattern, name_scope, name_scope), @@ -96,6 +101,7 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, act_type_(act_type), with_conv_bias_(with_conv_bias), with_bn_(with_bn), + with_scale_(with_scale), with_branch_(with_branch_x || with_branch_y), with_branch_x_(with_branch_x), with_branch_y_(with_branch_y) { @@ -130,7 +136,7 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, pattern->NewNode(ew_bias_add_repr())->assert_is_op("elementwise_add"); ew_bias_add_out = pattern->NewNode(ew_bias_add_out_repr()) ->assert_is_op_output("elementwise_add", "Out"); - if (with_bn_ || with_branch_ || !act_type_.empty()) { + if (with_bn_ || with_scale_ || with_branch_ || !act_type_.empty()) { ew_bias_add_out->assert_has_n_outputs(1); } ew_bias_add->LinksFrom({conv_out, ew_bias_add_y}) @@ -151,6 +157,8 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, PDNode* ew_branch_add = nullptr; PDNode* ew_branch_add_in = nullptr; PDNode* ew_branch_add_out = nullptr; + PDNode* scale = nullptr; + PDNode* scale_out = nullptr; PDNode* act = nullptr; PDNode* act_out = nullptr; // batch_norm op @@ -179,7 +187,7 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm"); bn_out = pattern->NewNode(bn_out_repr())->assert_is_op_output("batch_norm", "Y"); - if (with_branch_ || !act_type_.empty()) { + if (with_scale_ || with_branch_ || !act_type_.empty()) { bn_out->assert_has_n_outputs(1); } bn_mean_out = pattern->NewNode(bn_mean_out_repr()) @@ -196,10 +204,23 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, } else { bn_out = ew_bias_add_out; } + // scale op + if (with_scale_) { + bn_out->assert_is_op_input("scale", "X"); + scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); + scale_out = + pattern->NewNode(scale_out_repr())->assert_is_op_output("scale", "Out"); + if (with_bn_ || !act_type_.empty()) { + scale_out->assert_has_n_outputs(1); + } + scale->LinksFrom({bn_out}).LinksTo({scale_out}); + } else { + scale_out = bn_out; + } // ew_branch_add op if (with_branch_) { if (with_branch_x_) { - bn_out->assert_is_op_input("elementwise_add", "Y"); + scale_out->assert_is_op_input("elementwise_add", "Y"); ew_branch_add_in = pattern->NewNode(ew_branch_add_in_repr()) ->assert_is_op_input("elementwise_add", "X") ->AsInput(); @@ -226,7 +247,7 @@ Conv2dXPUPattern::Conv2dXPUPattern(PDPattern* pattern, ew_branch_add->LinksFrom({bn_out, ew_branch_add_in}) .LinksTo({ew_branch_add_out}); } else { - ew_branch_add_out = bn_out; + ew_branch_add_out = scale_out; } // act op if (!act_type_.empty()) { @@ -330,6 +351,7 @@ class Conv2dXPUFusePass : public FusePassBase { const std::string& act_type, bool with_conv_bias, bool with_bn, + bool with_scale, bool with_branch_x, bool with_branch_y) const; @@ -345,28 +367,31 @@ void Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph) const { for (auto conv_type : {"conv2d", "depthwise_conv2d"}) { for 
(auto with_conv_bias : {true, false}) { for (auto with_bn : {true, false}) { - for (auto with_branch_x : {true, false}) { - for (auto with_branch_y : {true, false}) { - for (auto act_type : { - "relu", - "sigmoid", - "tanh", - "gelu", - "leaky_relu", - "hard_swish", - "hard_sigmoid", - "relu6", - "swish", - "", - }) { - if (with_branch_x && with_branch_y) continue; - found_subgraph_count += ApplyImpl(graph, - conv_type, - act_type, - with_conv_bias, - with_bn, - with_branch_x, - with_branch_y); + for (auto with_scale : {true, false}) { + for (auto with_branch_x : {true, false}) { + for (auto with_branch_y : {true, false}) { + for (auto act_type : { + "relu", + "sigmoid", + "tanh", + "gelu", + "leaky_relu", + "hard_swish", + "hard_sigmoid", + "relu6", + "swish", + "", + }) { + if (with_branch_x && with_branch_y) continue; + found_subgraph_count += ApplyImpl(graph, + conv_type, + act_type, + with_conv_bias, + with_bn, + with_scale, + with_branch_x, + with_branch_y); + } } } } @@ -381,6 +406,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, const std::string& act_type, bool with_conv_bias, bool with_bn, + bool with_scale, bool with_branch_x, bool with_branch_y) const { GraphPatternDetector gpd; @@ -390,6 +416,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, act_type, with_conv_bias, with_bn, + with_scale, with_branch_x, with_branch_y); int found_subgraph_count = 0; @@ -400,6 +427,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(conv); GET_IR_NODE(ew_bias_add); GET_IR_NODE(bn); + GET_IR_NODE(scale); GET_IR_NODE(ew_branch_add); GET_IR_NODE(act); /* declare variable node's name*/ @@ -417,6 +445,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(bn_mean_out); GET_IR_NODE(bn_saved_var); GET_IR_NODE(bn_saved_mean); + GET_IR_NODE(scale_out); GET_IR_NODE(ew_branch_add_in); GET_IR_NODE(ew_branch_add_out); GET_IR_NODE(act_out); @@ -429,6 +458,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, auto* filter_t = scope->FindVar(conv_filter->Name())->GetMutable(); // conv_filter fp16 --> fp32 + auto filter_len = filter_t->numel(); auto filter_dtype = filter_t->dtype(); int out_dtype = proto::VarType::Type::VarType_Type_FP32; if (filter_dtype == phi::DataType::FLOAT16) { @@ -481,7 +511,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, float* bn_var_ptr = bn_var_t->mutable_data(paddle::platform::CPUPlace()); auto mean_len = bn_mean_t->numel(); - auto filter_len = filter_t->numel(); auto filter_stride = filter_len / mean_len; float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); if (!with_conv_bias) { // prev node is conv @@ -513,6 +542,34 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, } } } + // deal with scale op + if (with_scale) { + auto bias_len = filter_dims[0]; + float scale_val_ = 1.f; + float bias_val_ = 0.f; + scale_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale")); + bias_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("bias")); + bool bias_after_scale_ = + PADDLE_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale")); + // recompute bias as scale op + auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name()) + ->GetMutable(); + float* fusion_bias_ptr = + fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < bias_len; ++i) { + if (bias_after_scale_) { + fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_; + } else { + fusion_bias_ptr[i] = (fusion_bias_ptr[i] + bias_val_) * scale_val_; + } + } + // recompute weight as scale op + float* filter_ptr = + 
filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < filter_len; ++i) { + filter_ptr[i] *= scale_val_; + } + } // filter max Node* filter_int16 = nullptr; Node* filter_max = nullptr; @@ -524,6 +581,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, conv2d_xpu_out_name = act_out->Name(); } else if (ew_branch_add) { conv2d_xpu_out_name = ew_branch_add_out->Name(); + } else if (scale) { + conv2d_xpu_out_name = scale_out->Name(); } else if (bn) { conv2d_xpu_out_name = bn_out->Name(); } else if (ew_bias_add) { @@ -531,9 +590,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, } else { conv2d_xpu_out_name = conv_out->Name(); } - std::string conv_out_max_name = conv2d_xpu_out_name + "_max"; - VarDesc conv_out_max_desc(conv_out_max_name); - Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv_out_max_desc); + std::string conv2d_xpu_out_max_name = conv2d_xpu_out_name + "_max"; + VarDesc conv2d_xpu_out_max_desc(conv2d_xpu_out_max_name); + Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv2d_xpu_out_max_desc); // Generate conv2d_xpu op framework::OpDesc conv2d_xpu_op_desc(block); // set input&output var @@ -542,7 +601,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()}); conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()}); conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name}); - conv2d_xpu_op_desc.SetOutput("out_max", {conv_out_max_name}); + conv2d_xpu_op_desc.SetOutput("out_max", {conv2d_xpu_out_max_name}); // set fusion_bias input node if (has_bias) { conv2d_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()}); @@ -603,6 +662,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, IR_NODE_LINK_TO(conv2d_xpu, act_out); } else if (ew_branch_add_out) { IR_NODE_LINK_TO(conv2d_xpu, ew_branch_add_out); + } else if (scale_out) { + IR_NODE_LINK_TO(conv2d_xpu, scale_out); } else if (bn_out) { IR_NODE_LINK_TO(conv2d_xpu, bn_out); } else if (ew_bias_add_out) { @@ -619,6 +680,9 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, if (ew_branch_add != nullptr) { delete_nodes.insert(ew_branch_add); } + if (scale != nullptr) { + delete_nodes.insert(scale); + } if (bn != nullptr) { delete_nodes.insert(bn); delete_nodes.insert(bn_bias); @@ -630,7 +694,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, delete_nodes.insert(bn_saved_var); delete_nodes.insert(bn_saved_mean); } - if (ew_bias_add) { + if (ew_bias_add != nullptr) { delete_nodes.insert(ew_bias_add); delete_nodes.insert(ew_bias_add_y); } diff --git a/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.cc b/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.cc index 710fa94b4e0ff..7c847ae2e9ba1 100644 --- a/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.cc +++ b/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.cc @@ -184,8 +184,167 @@ FoldGatherSqueeze2Pattern::FoldGatherSqueeze2Pattern( squeeze2_op->LinksFrom({gather_op_out}).LinksTo({squeeze2_op_out}); } +struct FoldConv1dSqueeze2Pattern : public PatternBase { + FoldConv1dSqueeze2Pattern(PDPattern* pattern, + const std::string& name_scope, + const std::string& act_type); + + // declare operator node's name + PATTERN_DECL_NODE(squeeze2); + PATTERN_DECL_NODE(bn); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(unsqueeze2); + // declare variable node's name + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(squeeze2_out); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_mean); + 
PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_var); + PATTERN_DECL_NODE(bn_out); + PATTERN_DECL_NODE(bn_mean_out); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_saved_var); + PATTERN_DECL_NODE(bn_var_out); + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(unsqueeze2_out); + + private: + std::string act_type_; +}; + +FoldConv1dSqueeze2Pattern::FoldConv1dSqueeze2Pattern( + PDPattern* pattern, + const std::string& name_scope, + const std::string& act_type) + : PatternBase(pattern, name_scope, name_scope), act_type_(act_type) { + auto* x = pattern->NewNode(x_repr()) + ->assert_is_op_input("squeeze2", "X") + ->assert_more([](Node* node) { + auto x_shape = node->Var()->GetShape(); + size_t x_rank = x_shape.size(); + return x_rank == 4 && x_shape[2] == 1; + }); + auto* squeeze2 = pattern->NewNode(squeeze2_repr()) + ->assert_is_op("squeeze2") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axes_array = + op_desc->GetAttrIfExists>("axes"); + return axes_array == std::vector{-2} || + axes_array == std::vector{2}; + }); + auto* squeeze2_out = pattern->NewNode(squeeze2_out_repr()) + ->assert_is_op_output("squeeze2", "Out") + ->assert_is_op_input("batch_norm", "X"); + squeeze2->LinksFrom({x}).LinksTo({squeeze2_out}); + + auto* bn_bias = pattern->NewNode(bn_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Bias") + ->assert_has_n_outputs(1); + auto* bn_mean = pattern->NewNode(bn_mean_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Mean") + ->assert_has_n_outputs(1); + auto* bn_scale = pattern->NewNode(bn_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Scale") + ->assert_has_n_outputs(1); + auto* bn_var = pattern->NewNode(bn_var_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Variance") + ->assert_has_n_outputs(1); + auto* bn = pattern->NewNode(bn_repr())->assert_is_op("batch_norm"); + auto* bn_out = pattern->NewNode(bn_out_repr()) + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input(act_type_, "X"); + auto* bn_mean_out = pattern->NewNode(bn_mean_out_repr()) + ->assert_is_op_output("batch_norm", "MeanOut"); + auto* bn_saved_mean = pattern->NewNode(bn_saved_mean_repr()) + ->assert_is_op_output("batch_norm", "SavedMean"); + auto* bn_var_out = pattern->NewNode(bn_var_out_repr()) + ->assert_is_op_output("batch_norm", "VarianceOut"); + auto* bn_saved_var = pattern->NewNode(bn_saved_var_repr()) + ->assert_is_op_output("batch_norm", "SavedVariance"); + bn->LinksFrom({squeeze2_out, bn_bias, bn_mean, bn_scale, bn_var}) + .LinksTo({bn_out, bn_mean_out, bn_var_out, bn_saved_mean, bn_saved_var}); + + auto act = pattern->NewNode(act_repr())->assert_is_op(act_type_); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("unsqueeze2", "X"); + act->LinksFrom({bn_out}).LinksTo({act_out}); + + auto* unsqueeze2 = + pattern->NewNode(unsqueeze2_repr()) + ->assert_is_op("unsqueeze2") + ->assert_more([](Node* node) { + auto* op_desc = node->Op(); + auto axes_array = + op_desc->GetAttrIfExists>("axes"); + return axes_array == std::vector{-2} || + axes_array == std::vector{2}; + }); + auto* unsqueeze2_out = pattern->NewNode(unsqueeze2_out_repr()) + ->assert_is_op_output("unsqueeze2", "Out"); + unsqueeze2->LinksFrom({act_out}).LinksTo({unsqueeze2_out}); +} + } // namespace patterns +void 
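The pattern constructor above accepts axes {-2} or {2} interchangeably. That is sound because squeeze2/unsqueeze2 count negative axes from the end, and the pattern has already asserted a rank-4 input with x_shape[2] == 1, so both spellings name the same dimension. A small sketch of the normalization rule (hypothetical helper, shown only to make the equivalence explicit):

#include <cassert>
#include <cstdint>

// Negative axes are offsets from the rank: axis < 0 means axis + rank.
int64_t NormalizeAxis(int64_t axis, int64_t rank) {
  return axis < 0 ? axis + rank : axis;
}

int main() {
  assert(NormalizeAxis(-2, 4) == NormalizeAxis(2, 4));  // both select dim 2
  return 0;
}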
RedundantUnsqueeze2EliminationPass::FoldConv1dSqueeze2Ops( + ir::Graph* graph, const std::string& act_type) const { + GraphPatternDetector gpd; + patterns::FoldConv1dSqueeze2Pattern pattern( + gpd.mutable_pattern(), name_scope_, act_type); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle FoldConv1dSqueeze2Ops"; + // declare operator node's name + GET_IR_NODE(squeeze2); + GET_IR_NODE(bn); + GET_IR_NODE(act); + GET_IR_NODE(unsqueeze2); + // declare variable node's name + GET_IR_NODE(x); + GET_IR_NODE(squeeze2_out); + GET_IR_NODE(bn_out); + GET_IR_NODE(act_out); + GET_IR_NODE(unsqueeze2_out); + + auto bn_op_desc = bn->Op(); + bn_op_desc->RenameInput(squeeze2_out->Var()->Name(), x->Var()->Name()); + bn_out->Var()->SetShape(x->Var()->GetShape()); + act_out->Var()->SetShape(x->Var()->GetShape()); + bn_op_desc->Flush(); + IR_NODE_LINK_TO(x, bn); + // behind unsqueeze op node + auto unsqueeze_out_link_nodes = unsqueeze2_out->outputs; + for (auto out_link_node : unsqueeze_out_link_nodes) { + auto op_desc = out_link_node->Op(); + op_desc->RenameInput(unsqueeze2_out->Var()->Name(), + act_out->Var()->Name()); + op_desc->Flush(); + IR_NODE_LINK_TO(act_out, out_link_node); + } + // delete useless node + std::unordered_set delete_nodes = { + squeeze2, squeeze2_out, unsqueeze2, unsqueeze2_out}; + GraphSafeRemoveNodes(graph, delete_nodes); + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + void RedundantUnsqueeze2EliminationPass::FoldTranspose2Ops( ir::Graph* graph, const std::string& act_type) const { GraphPatternDetector gpd; @@ -315,6 +474,9 @@ void RedundantUnsqueeze2EliminationPass::ApplyImpl(ir::Graph* graph) const { FoldTranspose2Ops(graph, act_type); } FoldGatherSqueeze2Ops(graph); + for (auto act_type : {"leaky_relu", "elu"}) { + FoldConv1dSqueeze2Ops(graph, act_type); + } } } // namespace ir diff --git a/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.h b/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.h index 04ed41e2b6d2d..6019c135e4dad 100644 --- a/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.h +++ b/paddle/fluid/framework/ir/xpu/redundant_unsqueeze_squeeze_elimination_pass.h @@ -74,6 +74,44 @@ class RedundantUnsqueeze2EliminationPass : public FusePassBase { | */ void FoldGatherSqueeze2Ops(ir::Graph* graph) const; + /* + Origin subgraph: + x filter + | | + unsqueeze2(axes={-2}) unsqueeze2(axes={-2}) + \ / + \ / + conv2d(conv1d) + | + elementwise_add + | + squeeze2(axes={-2}) + | + batch_norm + | + act + | + unsqueeze2 + | + conv2d(conv1d) + Fused subgraph: + x filter + | | + unsqueeze2(axes={-2}) unsqueeze2(axes={-2}) + \ / + \ / + conv2d(conv1d) + | + elementwise_add + | + batch_norm + | + act + | + conv2d(conv1d) + */ + void FoldConv1dSqueeze2Ops(ir::Graph* graph, + const std::string& act_type) const; const std::string name_scope_{"redundant_unsqueeze_squeeze_elimination_pass"}; }; diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index 16b18c2d7d6bd..ae30121bc930b 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -11,13 +11,13 @@ set(STANDALONE_EXECUTOR_DEPS interpreter interpretercore_garbage_collector workqueue - pd_dialect + pd_op_dialect pd_op_to_kernel_pass phi_kernel_adaptor program_translator instruction_base pd_inplace_pass 
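The FoldConv1dSqueeze2Ops handler above removes the squeeze2/unsqueeze2 pair by renaming inputs and relinking edges rather than copying any data. Reduced to a toy graph, the relink step looks like this (illustrative types only; the real pass uses OpDesc::RenameInput plus IR_NODE_LINK_TO and then GraphSafeRemoveNodes):

#include <string>
#include <vector>

struct ToyOp {
  std::string name;
  std::vector<std::string> inputs;
};

// Every consumer that read `old_out` is redirected to `new_out`; afterwards
// the bypassed producer has no readers and can be deleted safely.
void RedirectConsumers(std::vector<ToyOp>* ops,
                       const std::string& old_out,
                       const std::string& new_out) {
  for (ToyOp& op : *ops) {
    for (std::string& in : op.inputs) {
      if (in == old_out) in = new_out;
    }
  }
}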
- ir) + pir) cc_library( standalone_executor diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 8a9247859b85f..7706e462fef76 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -8,5 +8,5 @@ if(WITH_CINN AND NOT CINN_ONLY) cc_library( cinn_jit_instruction NOT_FOR_INFER SRCS cinn_jit_instruction.cc - DEPS phi cinnapi cinn_dialect runtime_dialect) + DEPS phi cinnapi cinn_op_dialect cinn_runtime_dialect) endif() diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc index d56ccc7b7ba6b..8841103213400 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.cc @@ -14,8 +14,8 @@ #include "paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h" -#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/jit_kernel_op.h" -#include "paddle/cinn/hlir/dialect/runtime_dialect/ir/runtime_dialect.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/jit_kernel_op.h" +#include "paddle/cinn/hlir/dialect/runtime/ir/runtime_dialect.h" #include "paddle/cinn/hlir/framework/instruction.h" #include "paddle/fluid/framework/paddle2cinn/transform_type.h" @@ -93,7 +93,7 @@ class CinnJitInstruction::Impl { CinnJitInstruction::CinnJitInstruction(size_t id, const platform::Place& place, - ::ir::Operation* op, + ::pir::Operation* op, Scope* scope) : InstructionBase(id, place) { // TODO(Aurelius84): We shall simplify members of JitKernelOp to make it @@ -101,6 +101,7 @@ CinnJitInstruction::CinnJitInstruction(size_t id, // responsible to construct hlir::framework::Instruction. 
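Starting with this hunk, each instruction type in the PR caches the operation it was built from (`op_ = op;`) and exposes it through a virtual `Operation()` accessor. The interpreter later uses that pointer to recover per-op metadata, such as the callstack attribute, when a kernel throws. The shape of the pattern, with a placeholder standing in for ::pir::Operation:

struct Operation {};  // placeholder for ::pir::Operation

class InstructionSketch {
 public:
  explicit InstructionSketch(Operation* op) : op_(op) {}
  // The real classes name this accessor Operation(); renamed here only to
  // avoid clashing with the placeholder type above.
  Operation* Op() const { return op_; }

 private:
  Operation* op_{nullptr};  // not owned; the program outlives the instruction
};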
auto jit_kernel_op = op->dyn_cast(); impl_ = std::make_shared(jit_kernel_op.instruction()); + op_ = op; } void CinnJitInstruction::Run() { diff --git a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h index b20f6e08d9afc..5f5e4f74e8884 100644 --- a/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/cinn_jit_instruction.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" -namespace ir { +namespace pir { class Operation; } @@ -29,7 +29,7 @@ class CinnJitInstruction : public InstructionBase { public: CinnJitInstruction(size_t id, const platform::Place& place, - ::ir::Operation* op, + ::pir::Operation* op, Scope* scope); // TODO(Aurelius84): Only implement core interface and need implement GC and @@ -38,9 +38,13 @@ class CinnJitInstruction : public InstructionBase { const std::string& Name() const override; + ::pir::Operation* Operation() const override { return op_; } + private: class Impl; std::shared_ptr impl_{nullptr}; + + ::pir::Operation* op_{nullptr}; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 56dafd3132c03..6836a7f306daa 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_attribute.h" namespace paddle { namespace framework { @@ -90,28 +90,28 @@ void InstructionBase::AddInplace(Variable* in, Variable* out) { void InstructionBase::ClearInplace() { vec_inplace_in_to_out_.clear(); } void InstructionBase::SetInputs( - const std::unordered_map>& inputs) { + const std::unordered_map>& inputs) { input_index_ = inputs; } void InstructionBase::SetOutputs( - const std::unordered_map>& outputs) { + const std::unordered_map>& outputs) { output_index_ = outputs; } void InstructionBase::InitInputsOutputsIds( - ::ir::Operation* op, + ::pir::Operation* op, Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name) { auto op_attributes = op->attributes(); auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - std::unordered_map> inputs; + op_attributes.at("op_name").dyn_cast().AsString(); + std::unordered_map> inputs; for (size_t i = 0; i < op->num_operands(); i++) { - ir::Value value = op->operand_source(i); + pir::Value value = op->operand_source(i); if (value) { PADDLE_ENFORCE_NE( value_2_var_name.find(value), @@ -130,9 +130,9 @@ void InstructionBase::InitInputsOutputsIds( } SetInputs(inputs); VLOG(8) << "finish process inputs_index"; - std::unordered_map> outputs; + std::unordered_map> outputs; for (size_t i = 0; i < op->num_results(); i++) { - ir::Value value = op->result(i); + pir::Value value = op->result(i); if (value && value.type()) { PADDLE_ENFORCE_NE( value_2_var_name.find(value), diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h 
b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index b8271a0ea0012..c20f46f15c716 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -22,9 +22,9 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/event.h" -namespace ir { +namespace pir { class Value; -} // namespace ir +} // namespace pir namespace paddle { namespace framework { @@ -107,29 +107,29 @@ class InstructionBase { std::map& GetMutableInplaceBackMap() { return inplace_back_map_; } const std::map& GetInplaceBackMap() { return inplace_back_map_; } - const std::unordered_map<::ir::Value, std::vector>& Inputs() const { + const std::unordered_map<::pir::Value, std::vector>& Inputs() const { return input_index_; } - std::unordered_map<::ir::Value, std::vector>& GetMutableInputs() { + std::unordered_map<::pir::Value, std::vector>& GetMutableInputs() { return input_index_; } void SetInputs( - const std::unordered_map<::ir::Value, std::vector>& inputs); + const std::unordered_map<::pir::Value, std::vector>& inputs); - const std::unordered_map<::ir::Value, std::vector>& Outputs() const { + const std::unordered_map<::pir::Value, std::vector>& Outputs() const { return output_index_; } - std::unordered_map<::ir::Value, std::vector>& GetMutableOutputs() { + std::unordered_map<::pir::Value, std::vector>& GetMutableOutputs() { return output_index_; } void SetOutputs( - const std::unordered_map<::ir::Value, std::vector>& outputs); + const std::unordered_map<::pir::Value, std::vector>& outputs); - const std::unordered_set<::ir::Value>& NoNeedBuffer() const { + const std::unordered_set<::pir::Value>& NoNeedBuffer() const { return no_need_buffer_values_; } void SetNoNeedBuffer( - const std::unordered_set<::ir::Value>& no_need_buffer_values) { + const std::unordered_set<::pir::Value>& no_need_buffer_values) { no_need_buffer_values_ = no_need_buffer_values; } @@ -137,10 +137,12 @@ class InstructionBase { virtual const std::string& Name() const = 0; + virtual ::pir::Operation* Operation() const = 0; + void InitInputsOutputsIds( - ::ir::Operation* op, + ::pir::Operation* op, Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map<::pir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name); @@ -176,11 +178,11 @@ class InstructionBase { std::map inplace_back_map_; - std::unordered_map<::ir::Value, std::vector> input_index_; + std::unordered_map<::pir::Value, std::vector> input_index_; - std::unordered_map<::ir::Value, std::vector> output_index_; + std::unordered_map<::pir::Value, std::vector> output_index_; - std::unordered_set<::ir::Value> no_need_buffer_values_; + std::unordered_set<::pir::Value> no_need_buffer_values_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index dd6aa26a1ae53..dfa8e1ec85f9f 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -22,22 +22,28 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/event.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/operation.h" -#include 
"paddle/ir/core/value.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif namespace paddle { namespace framework { std::vector GetValueIds( - ir::Value value, + pir::Value value, Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name) { @@ -56,14 +62,14 @@ std::vector GetValueIds( } platform::DeviceContext* ParseDeviceContext( - ir::Operation* op, + pir::Operation* op, platform::DeviceContext* origin_dev_ctx, const platform::Place& place, const std::string& execution_stream, const int stream_priority) { auto& op_attributes = op->attributes(); auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + op_attributes.at("op_name").dyn_cast().AsString(); interpreter::ContextManager& ctx_manager = interpreter::ContextManager::Instance(); @@ -109,13 +115,23 @@ platform::DeviceContext* ParseDeviceContext( // c_allreduce_op.h). Now it is just a temporary solution for ONLY // c_allreduce_sum which is used in ResNet50 distributed training. if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") - .dyn_cast<::ir::BoolAttribute>() + .dyn_cast() .data() == false) { int ring_id = - op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); - return platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->dev_context(); + op_attributes.at("ring_id").dyn_cast().data(); + if (FLAGS_dynamic_static_unified_comm) { + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + dev_ctx = static_cast( + static_cast( + comm_context_manager.Get(std::to_string(ring_id))) + ->GetDevContext()); + } else { + dev_ctx = platform::NCCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + } + return dev_ctx; } #endif } @@ -126,8 +142,7 @@ platform::DeviceContext* ParseDeviceContext( return origin_dev_ctx; } -OpFuncType AnalyseOpFuncType(::ir::Operation* op, - const platform::Place& place) { +OpFuncType AnalyseOpFuncType(pir::Operation* op, const platform::Place& place) { if (platform::is_cpu_place(place)) { return OpFuncType::kCpuSync; } @@ -151,21 +166,21 @@ OpFuncType AnalyseOpFuncType(::ir::Operation* op, // and so that they would be dispatched to host thread. 
auto& op_attributes = op->attributes(); auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - if (op_name == kCoalesceTensor && + op_attributes.at("op_name").dyn_cast().AsString(); + if (op_name == "pd_op.coalesce_tensor" && (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { + op->attribute("persist_output").data() == false) && + op->attribute("set_constant").data() == false && + op->attribute("copy_data").data() == false) { return OpFuncType::kGpuSync; } // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { + if (platform::is_gpu_place(place) && op_name == "pd_op.memcpy_d2h") { return OpFuncType::kGpuSync; } - if (op_name == "shape") { + if (op_name == "pd_op.shape") { return OpFuncType::kGpuSync; } return OpFuncType::kGpuAsync; diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index a41ce07957e4a..c555a101d8366 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -22,28 +22,29 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/event.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace framework { std::vector GetValueIds( - ir::Value value, + pir::Value value, Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map<::pir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name); platform::DeviceContext* ParseDeviceContext( - ir::Operation* op, + pir::Operation* op, platform::DeviceContext* origin_dev_ctx, const platform::Place& place, const std::string& execution_stream, const int stream_priority); -OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place); +OpFuncType AnalyseOpFuncType(::pir::Operation* op, + const platform::Place& place); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index 88037b15193d8..50623c6eb1118 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -18,11 +18,11 @@ #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include 
"paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/infermeta_utils.h" @@ -35,19 +35,20 @@ namespace framework { LegacyKernelInstruction::LegacyKernelInstruction( size_t id, const platform::Place& place, - ir::Operation* op, + pir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name) : InstructionBase(id, place) { auto& op_attributes = op->attributes(); auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name); - + op_attributes.at("op_name").dyn_cast().AsString(); + pir::OpInfo op_info = + pir::IrContext::Instance()->GetRegisteredOpInfo(op_name); + op_ = op; legacy_op_name_ = op_name; VLOG(6) << "construct phi kernel instruction for: " << legacy_op_name_; @@ -55,17 +56,17 @@ LegacyKernelInstruction::LegacyKernelInstruction( // if (op_attributes.count("dist_attr") != 0) { // if (op_attributes.count("execution_stream") != 0) { // SetExecutionStream(op_attributes.at("execution_stream") - // .dyn_cast<::ir::StrAttribute>() + // .dyn_cast() // .data()); // } // if (op_attributes.count("stream_priority") != 0) { // SetStreamPriority(op_attributes.at("stream_priority") - // .dyn_cast<::ir::Int32Attribute>() + // .dyn_cast() // .data()); // } // if (op_attributes.count("scheduling_priority") != 0) { // SetSchedulingPriority(op_attributes.at("scheduling_priority") - // .dyn_cast<::ir::Int64Attribute>() + // .dyn_cast() // .data()); // } // } else { @@ -98,7 +99,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - ::ir::BuildPhiContext< + pir::BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, @@ -114,7 +115,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process infer meta context"; auto kernel_name = - op_attributes.at("kernel_name").dyn_cast().AsString(); + op_attributes.at("kernel_name").dyn_cast().AsString(); auto kernel_key = op_attributes.at("kernel_key") .dyn_cast() .data(); @@ -127,7 +128,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( Scope* inner_scope = local_scope == nullptr ? 
scope : local_scope; - operator_base_ = ir::BuildOperatorBase( + operator_base_ = pir::BuildOperatorBase( op, value_2_var_name, yaml_info_parser, variable_2_var_name, inner_scope); paddle::framework::VariableValueMap in_map; paddle::framework::VariableValueMap out_map; @@ -136,12 +137,12 @@ LegacyKernelInstruction::LegacyKernelInstruction( runtime_context_ = std::make_shared( paddle::framework::RuntimeContext(in_map, out_map)); - ir::BuildRuntimeContext(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - runtime_context_.get()); + pir::BuildRuntimeContext(op, + value_2_var_name, + scope, + local_scope, + yaml_info_parser, + runtime_context_.get()); kernel_context_ = new paddle::framework::ExecutionContext( *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())); @@ -160,7 +161,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); - std::unordered_set<::ir::Value> no_need_buffer_values; + std::unordered_set no_need_buffer_values; for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { no_need_buffer_values.insert(op->operand_source(no_need_buffer_ids[id])); } @@ -186,6 +187,5 @@ void LegacyKernelInstruction::Run() { (*(phi_kernel_))((kernel_context_)); VLOG(6) << "Run op " << legacy_op_name_ << " kernel."; } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index 27c1cb133bec0..9c6fbd9b7d807 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -16,10 +16,10 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" -namespace ir { +namespace pir { class Operation; class Value; -} // namespace ir +} // namespace pir namespace paddle { namespace framework { @@ -30,10 +30,10 @@ class LegacyKernelInstruction : public InstructionBase { LegacyKernelInstruction( size_t id, const platform::Place& place, - ::ir::Operation* op, + ::pir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map<::pir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name); @@ -53,6 +53,8 @@ class LegacyKernelInstruction : public InstructionBase { const std::string& Name() const override { return legacy_op_name_; } + ::pir::Operation* Operation() const override { return op_; } + private: std::string legacy_op_name_; @@ -66,6 +68,8 @@ class LegacyKernelInstruction : public InstructionBase { std::shared_ptr operator_base_; phi::Kernel* phi_kernel_{nullptr}; // not owned + + ::pir::Operation* op_{nullptr}; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 093435f8b98a2..849a83fcf2ce9 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -17,20 +17,20 @@ #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/framework/scope.h" 
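Both kernel-instruction constructors end by collecting the no-need-buffer operands: indices reported by the YAML info parser whose tensor contents are never read by the kernel (only shape/metadata), so the interpreter may release that memory early. A sketch of the collection step, with plain ints standing in for pir::Value:

#include <cstddef>
#include <unordered_set>
#include <vector>

std::unordered_set<int> CollectNoNeedBuffer(
    const std::vector<int>& operand_sources,          // op->operand_source(i)
    const std::vector<size_t>& no_need_buffer_ids) {  // from the yaml parser
  std::unordered_set<int> values;
  for (size_t id : no_need_buffer_ids) {
    values.insert(operand_sources.at(id));  // set dedupes repeated operands
  }
  return values;
}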
-#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" #include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" namespace paddle { @@ -39,19 +39,20 @@ namespace framework { PhiKernelInstruction::PhiKernelInstruction( size_t id, const platform::Place& place, - ir::Operation* op, + pir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name) : InstructionBase(id, place) { auto op_attributes = op->attributes(); auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name); - + op_attributes.at("op_name").dyn_cast().AsString(); + pir::OpInfo op_info = + pir::IrContext::Instance()->GetRegisteredOpInfo(op_name); + op_ = op; phi_op_name_ = op_name; VLOG(6) << "construct phi kernel instruction for: " << phi_op_name_; @@ -59,17 +60,17 @@ PhiKernelInstruction::PhiKernelInstruction( // if (op_attributes.count("dist_attr") != 0) { // if (op_attributes.count("execution_stream") != 0) { // SetExecutionStream(op_attributes.at("execution_stream") - // .dyn_cast<::ir::StrAttribute>() + // .dyn_cast() // .data()); // } // if (op_attributes.count("stream_priority") != 0) { // SetStreamPriority(op_attributes.at("stream_priority") - // .dyn_cast<::ir::Int32Attribute>() + // .dyn_cast() // .data()); // } // if (op_attributes.count("scheduling_priority") != 0) { // SetSchedulingPriority(op_attributes.at("scheduling_priority") - // .dyn_cast<::ir::Int64Attribute>() + // .dyn_cast() // .data()); // } // } else { @@ -102,7 +103,7 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process yaml_info_parser"; if (infer_meta_interface_) { - ::ir::BuildPhiContext< + pir::BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, @@ -118,7 +119,7 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process infer meta context"; auto kernel_name = - op_attributes.at("kernel_name").dyn_cast().AsString(); + op_attributes.at("kernel_name").dyn_cast().AsString(); auto kernel_key = op_attributes.at("kernel_key") .dyn_cast() .data(); @@ -129,17 +130,17 @@ PhiKernelInstruction::PhiKernelInstruction( phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); VLOG(6) << 
"finish process select kernel"; - ::ir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_var_name, - scope, - local_scope, - yaml_info_parser, - &kernel_context_); + pir::BuildPhiContext, + paddle::small_vector, + true>(op, + value_2_var_name, + scope, + local_scope, + yaml_info_parser, + &kernel_context_); kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process kernel context"; @@ -159,7 +160,7 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); - std::unordered_set<::ir::Value> no_need_buffer_values; + std::unordered_set no_need_buffer_values; for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { no_need_buffer_values.insert(op->operand_source(no_need_buffer_ids[id])); } @@ -167,6 +168,12 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process no need buffer"; } +PhiKernelInstruction::~PhiKernelInstruction() { + if (phi_kernel_ != nullptr) { + delete phi_kernel_; + } +} + void PhiKernelInstruction::Run() { if (infer_meta_interface_) { infer_meta_interface_->infer_meta_(&(infer_meta_context_)); diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index c637cce8651fb..96484f435a9f7 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -16,9 +16,9 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" -namespace ir { +namespace pir { class Operation; -} // namespace ir +} // namespace pir namespace paddle { namespace framework { @@ -30,14 +30,16 @@ class PhiKernelInstruction : public InstructionBase { PhiKernelInstruction( size_t id, const platform::Place& place, - ::ir::Operation* op, + ::pir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::unordered_map<::pir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name); + ~PhiKernelInstruction(); + phi::Kernel* PhiKernel() const { return phi_kernel_; } const phi::KernelContext& KernelContext() const { return kernel_context_; } @@ -50,6 +52,8 @@ class PhiKernelInstruction : public InstructionBase { return infer_meta_interface_; } + ::pir::Operation* Operation() const override { return op_; } + void Run() override; const std::string& Name() const override { return phi_op_name_; } @@ -65,6 +69,8 @@ class PhiKernelInstruction : public InstructionBase { phi::Kernel* phi_kernel_{nullptr}; // not owned std::string phi_op_name_; + + ::pir::Operation* op_{nullptr}; // not owned }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index a717a3ed09531..c6655c55fb2c3 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -23,15 +23,16 @@ #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" #include "paddle/fluid/framework/new_executor/interpreter/static_build.h" -#include 
"paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" +#include "paddle/fluid/operators/controlflow/pylayer_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/ops_extra_info.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" #include "paddle/fluid/platform/flags.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/kernel_context.h" @@ -191,7 +192,7 @@ bool IsMemcpyH2D(Instruction* instr) { } bool IsMemcpyH2D(paddle::framework::InstructionBase* instr) { - return instr->Name() == "pd.memcpy_h2d"; + return instr->Name() == "pd_op.memcpy_h2d"; } bool IsMemcpyOp(const Instruction& instr) { @@ -571,6 +572,8 @@ void BuildOpFuncList(const platform::Place& place, const ProgramDesc& main_program = *block.Program(); operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( main_program, block.ID(), ops_unique); + operators::PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + main_program, block.ID(), ops_unique); operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( main_program, block.ID(), ops_unique); operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( @@ -611,6 +614,8 @@ void BuildOpFuncList(const platform::Place& place, const std::set ops_with_var_not_in_scope = { "conditional_block", "conditional_block_grad", + "pylayer", + "pylayer_grad" "recurrent_grad", "rnn_memory_helper", "rnn_memory_helper_grad", @@ -1016,23 +1021,23 @@ void BuildOpFuncList(const platform::Place& place, void BuildOpFuncList( const platform::Place& place, - ::ir::Block* block, + pir::Block* block, std::vector* vec_func_list, framework::Scope* scope, framework::Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_name_map, + const std::unordered_map& value_2_name_map, const ExecutionConfig& execution_config) { vec_func_list->reserve(block->size()); - ::ir::IrContext* ctx = ir::IrContext::Instance(); + pir::IrContext* ctx = pir::IrContext::Instance(); - ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); for (auto op : *block) { OpFuncNode op_func_node; auto attr_map = op->attributes(); auto op_name = - attr_map.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + attr_map.at("op_name").dyn_cast().AsString(); op_func_node.phi_op_name_ = op_name; if (GetSpecialOpNames().count(op_name)) { @@ -1040,7 +1045,7 @@ void BuildOpFuncList( continue; } - ::ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); auto impl = op_info.GetInterfaceImpl(); @@ -1051,7 +1056,7 @@ void BuildOpFuncList( VLOG(6) << "op name" << op_func_node.phi_op_name_; dialect::OpYamlInfoParser op_yaml_info_parser(impl->get_op_info_()); if (op_func_node.infer_meta_interface_) { - ::ir::BuildPhiContext< + pir::BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, @@ -1066,7 +1071,7 @@ void 
BuildOpFuncList( } auto kernel_name = - attr_map.at("kernel_name").dyn_cast().AsString(); + attr_map.at("kernel_name").dyn_cast().AsString(); auto kernel_key = attr_map.at("kernel_key") .dyn_cast() .data(); @@ -1081,17 +1086,17 @@ void BuildOpFuncList( "not found kernel for [%s]", kernel_name); - ::ir::BuildPhiContext, - paddle::small_vector, - true>(op, - value_2_name_map, - scope, - local_scope, - op_yaml_info_parser, - &(op_func_node.kernel_context_)); + pir::BuildPhiContext, + paddle::small_vector, + true>(op, + value_2_name_map, + scope, + local_scope, + op_yaml_info_parser, + &(op_func_node.kernel_context_)); VLOG(6) << "finish process kernel context"; op_func_node.kernel_context_.SetDeviceContext( @@ -1184,12 +1189,12 @@ void SetDeviceCommContext(framework::OperatorBase* operator_base, } } -void SetDeviceCommContext(::ir::Operation* op, +void SetDeviceCommContext(pir::Operation* op, platform::DeviceContext* dev_ctx) { auto op_attributes = op->attributes(); if (op_attributes.count("ring_id") != 0) { int ring_id = - op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); + op_attributes.at("ring_id").dyn_cast().data(); const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); if (comm_context_manager.Has(std::to_string(ring_id))) { @@ -1200,7 +1205,7 @@ void SetDeviceCommContext(::ir::Operation* op, } else { VLOG(3) << "op: " << op_attributes.at("op_name") - .dyn_cast<::ir::StrAttribute>() + .dyn_cast() .AsString() << ", ring_id: " << ring_id << ", get comm_context failed!"; } @@ -1211,11 +1216,11 @@ std::unordered_set GetSpecialOpNames() { return { "builtin.combine", "builtin.slice", - "pd.feed", + "pd_op.feed", "builtin.set_parameter", "builtin.get_parameter", - "pd.data", - "pd.shadow_output", + "pd_op.data", + "pd_op.shadow_output", }; } @@ -1229,6 +1234,32 @@ void BuildId2VarName(const std::map& var_name_2_id, } } +const std::vector GetInstructionCallStack( + const std::string& type, const pir::AttributeMap& attrs) { + std::vector vec_str; + if (attrs.count("sub_block") != 0) { + return vec_str; + } + auto iter = attrs.find(OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + auto attr = iter->second; + PADDLE_ENFORCE( + attr.isa(), + paddle::platform::errors::InvalidArgument( + "%s: Callstack attributes of %s is not ArrayAttribute type", type)); + pir::ArrayAttribute array_attribute = attr.dyn_cast(); + std::vector vec_attr = array_attribute.AsVector(); + for (auto value : vec_attr) { + PADDLE_ENFORCE( + value.isa(), + paddle::platform::errors::InvalidArgument( + "%s: Callstack attributes of %s is not StrAttribute type", type)); + vec_str.emplace_back(value.dyn_cast().AsString()); + } + } + return vec_str; +} + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 33b89cac542d4..413db7e75ecd4 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -106,11 +106,11 @@ void BuildOpFuncList(const platform::Place& place, void BuildOpFuncList( const platform::Place& place, - ::ir::Block* block, + ::pir::Block* block, std::vector* vec_func_list, framework::Scope* scope, framework::Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_name_map, + const std::unordered_map<::pir::Value, std::string>& 
value_2_name_map, const ExecutionConfig& execution_config); void BuildVariableScope(const framework::BlockDesc& block, @@ -124,10 +124,13 @@ void LogDeviceMemoryStats(const platform::Place& place); void SetDeviceCommContext(framework::OperatorBase* operator_base, platform::DeviceContext* dev_ctx); -void SetDeviceCommContext(::ir::Operation* op, +void SetDeviceCommContext(::pir::Operation* op, platform::DeviceContext* dev_ctx); std::unordered_set GetSpecialOpNames(); + +const std::vector GetInstructionCallStack( + const std::string& type, const pir::AttributeMap& attrs); } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/plan.cc b/paddle/fluid/framework/new_executor/interpreter/plan.cc index 0217219302f6d..ce2f8b2718ff3 100644 --- a/paddle/fluid/framework/new_executor/interpreter/plan.cc +++ b/paddle/fluid/framework/new_executor/interpreter/plan.cc @@ -41,7 +41,7 @@ Plan::Plan(const std::vector>& job_list, Plan::Plan( const std::vector>& job_list, - const std::unordered_map>& + const std::unordered_map>& type_to_ir_program) : job_list_(job_list), type_to_ir_program_(type_to_ir_program), @@ -69,7 +69,7 @@ const ProgramDesc* Plan::Program(const std::string& job_type) const { return type_to_program_.at(job_type); } -std::shared_ptr<::ir::Program> Plan::IrProgram( +std::shared_ptr<::pir::Program> Plan::IrProgram( const std::string& job_type) const { return type_to_ir_program_.at(job_type); } diff --git a/paddle/fluid/framework/new_executor/interpreter/plan.h b/paddle/fluid/framework/new_executor/interpreter/plan.h index aac750a38f97b..8ce66db821305 100644 --- a/paddle/fluid/framework/new_executor/interpreter/plan.h +++ b/paddle/fluid/framework/new_executor/interpreter/plan.h @@ -21,8 +21,8 @@ #include "paddle/fluid/framework/new_executor/interpreter/job.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/ir/core/program.h" #include "paddle/phi/core/macros.h" +#include "paddle/pir/core/program.h" namespace paddle { namespace framework { @@ -33,7 +33,7 @@ class Plan final { Plan(const std::vector>& job_list, const std::unordered_map& type_to_program); Plan(const std::vector>& job_list, - const std::unordered_map>& + const std::unordered_map>& type_to_ir_program); ~Plan() = default; @@ -41,14 +41,14 @@ class Plan final { const std::vector>& JobList() const; const ProgramDesc* Program(const std::string& job_type) const; - std::shared_ptr<::ir::Program> IrProgram(const std::string& job_type) const; + std::shared_ptr<::pir::Program> IrProgram(const std::string& job_type) const; int64_t MicroBatchNum() const; private: const std::vector> job_list_; const std::unordered_map type_to_program_; - const std::unordered_map> + const std::unordered_map> type_to_ir_program_; int64_t micro_batch_num_; }; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 3dc9175dbfd4b..bbbaf4c0dd75f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -19,8 +19,14 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include 
"paddle/fluid/platform/collective_helper.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); +#endif namespace paddle { namespace framework { @@ -235,9 +241,20 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( if (op_type == "c_allreduce_sum" && op->Attr("use_calc_stream") == false) { int ring_id = op->Attr("ring_id"); - return platform::NCCLCommContext::Instance() - .Get(ring_id, place_) - ->dev_context(); + + if (FLAGS_dynamic_static_unified_comm) { + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + dev_ctx = static_cast( + static_cast( + comm_context_manager.Get(std::to_string(ring_id))) + ->GetDevContext()); + } else { + dev_ctx = platform::NCCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context(); + } + return dev_ctx; } #endif } @@ -257,7 +274,7 @@ const std::unordered_set no_need_buffer_ins(Instruction* instr) { return std::unordered_set(); } -const std::unordered_set no_need_buffer_ins( +const std::unordered_set no_need_buffer_ins( const paddle::framework::InstructionBase* instr) { return instr->NoNeedBuffer(); } @@ -471,9 +488,9 @@ void analyse_event_info_for_two_instructions< // fused_var share the same tensor. However, as the dependency is implicit, we // can only add event for it with the help of depend_op. - if (has_data_dependency( + if (has_data_dependency( instructions[cur_instr_id], instructions[next_instr_id]) || - instructions[next_instr_id]->Name() == "pd.depend") { + instructions[next_instr_id]->Name() == "pd_op.depend") { waiter_instr_ids->insert(next_instr_id); return; } diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 384c668ed2e56..a2c3c49e1c634 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -16,8 +16,8 @@ #include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" #include "paddle/fluid/framework/new_executor/program_interpreter.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" PADDLE_DEFINE_EXPORTED_bool( new_executor_serial_run, @@ -50,7 +50,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place, InterpreterCore::InterpreterCore( const platform::Place& place, const std::vector& fetch_var_names, - std::unique_ptr<::ir::Program> ir_prog, + std::unique_ptr<::pir::Program> ir_prog, framework::Scope* scope, const ExecutionConfig& execution_config) { VLOG(4) << "InterpreterCore(): " << this << " on " << place; diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index f01c12b27c3a1..52df30cbfd976 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -17,9 +17,9 @@ PD_DECLARE_bool(new_executor_use_local_scope); -namespace ir { +namespace pir { class Program; -} // namespace ir +} // namespace pir namespace paddle { namespace framework { @@ -38,7 +38,7 @@ class InterpreterCore { // This constructor is for New IR. 
InterpreterCore(const platform::Place& place, const std::vector& fetch_var_names, - std::unique_ptr<::ir::Program> ir_prog, + std::unique_ptr<::pir::Program> ir_prog, Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); ~InterpreterCore(); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index bf0c0880f385d..ee9f17034a45f 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" #include "paddle/phi/core/utils/rw_lock.h" diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 94ef1e3af217e..78225dee6f337 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -41,16 +41,15 @@ #endif #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" -#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/pir/core/builtin_attribute.h" PHI_DECLARE_bool(enable_new_ir_in_executor); - PHI_DECLARE_bool(enable_new_ir_in_executor_trace_run); namespace paddle { @@ -59,7 +58,7 @@ namespace framework { NewIRInterpreter::NewIRInterpreter( const platform::Place& place, const std::vector& fetch_var_names, - std::unique_ptr<::ir::Program> ir_prog, + std::unique_ptr<::pir::Program> ir_prog, framework::Scope* scope, const ExecutionConfig& execution_config) : place_(place), @@ -349,79 +348,79 @@ void NewIRInterpreter::UpdateSyncOpNum() { void NewIRInterpreter::UpdateNcclOpNum() { static std::set nccl_op_set = { - "pd.c_softmax_with_cross_entropy", - "pd.c_allgather", - "pd.c_allreduce_max", - "pd.c_allreduce_min", - "pd.c_allreduce_sum", - "pd.c_allreduce_prod", - "pd.c_reduce_max", - "pd.c_reduce_min", - "pd.c_reduce_prod", - "pd.c_reducescatter", - "pd.c_broadcast", - "pd.c_broadcast_", - "pd.c_scatter", - "pd.partial_send", - "pd.partial_recv", - "pd.partial_allgather", - "pd.recv_v2", - "pd.send_v2", - "pd.mp_allreduce_sum", - "pd.barrier", - "pd.alltoall", - "pd.global_gather", - "pd.distributed_fused_lamb", - "pd.margin_cross_entropy", - "pd.sync_batch_norm", - "pd.sync_batch_norm_", - "pd.data_norm", - 
"pd.class_center_sample", - "pd.all_to_all", - "pd.dist_concat", - "pd.all_gather", - "pd.broadcast", - "pd.p_recv", - "pd.p_send", - "pd.reduce_scatter", - "pd.all_reduce", - "pd.reduce", - "pd.c_softmax_with_cross_entropy_grad", - "pd.c_allgather_grad", - "pd.c_allreduce_max_grad", - "pd.c_allreduce_min_grad", - "pd.c_allreduce_sum_grad", - "pd.c_allreduce_prod_grad", - "pd.c_reduce_max_grad", - "pd.c_reduce_min_grad", - "pd.c_reduce_prod_grad", - "pd.c_reducescatter_grad", - "pd.c_broadcast_grad", - "pd.c_scatter_grad", - "pd.partial_send_grad", - "pd.partial_recv_grad", - "pd.partial_allgather_grad", - "pd.recv_v2_grad", - "pd.send_v2_grad", - "pd.mp_allreduce_sum_grad", - "pd.barrier_grad", - "pd.alltoall_grad", - "pd.global_gather_grad", - "pd.distributed_fused_lamb_grad", - "pd.margin_cross_entropy_grad", - "pd.margin_cross_entropy_grad_" - "pd.sync_batch_norm_grad", - "pd.data_norm_grad", - "pd.class_center_sample_grad", - "pd.all_to_all_grad", - "pd.dist_concat_grad", - "pd.all_gather_grad", - "pd.broadcast_grad", - "pd.p_recv_grad", - "pd.p_send_grad", - "pd.reduce_scatter_grad", - "pd.all_reduce_grad", - "pd.reduce_grad"}; + "pd_op.c_softmax_with_cross_entropy", + "pd_op.c_allgather", + "pd_op.c_allreduce_max", + "pd_op.c_allreduce_min", + "pd_op.c_allreduce_sum", + "pd_op.c_allreduce_prod", + "pd_op.c_reduce_max", + "pd_op.c_reduce_min", + "pd_op.c_reduce_prod", + "pd_op.c_reducescatter", + "pd_op.c_broadcast", + "pd_op.c_broadcast_", + "pd_op.c_scatter", + "pd_op.partial_send", + "pd_op.partial_recv", + "pd_op.partial_allgather", + "pd_op.recv_v2", + "pd_op.send_v2", + "pd_op.mp_allreduce_sum", + "pd_op.barrier", + "pd_op.alltoall", + "pd_op.global_gather", + "pd_op.distributed_fused_lamb", + "pd_op.margin_cross_entropy", + "pd_op.sync_batch_norm", + "pd_op.sync_batch_norm_", + "pd_op.data_norm", + "pd_op.class_center_sample", + "pd_op.all_to_all", + "pd_op.dist_concat", + "pd_op.all_gather", + "pd_op.broadcast", + "pd_op.p_recv", + "pd_op.p_send", + "pd_op.reduce_scatter", + "pd_op.all_reduce", + "pd_op.reduce", + "pd_op.c_softmax_with_cross_entropy_grad", + "pd_op.c_allgather_grad", + "pd_op.c_allreduce_max_grad", + "pd_op.c_allreduce_min_grad", + "pd_op.c_allreduce_sum_grad", + "pd_op.c_allreduce_prod_grad", + "pd_op.c_reduce_max_grad", + "pd_op.c_reduce_min_grad", + "pd_op.c_reduce_prod_grad", + "pd_op.c_reducescatter_grad", + "pd_op.c_broadcast_grad", + "pd_op.c_scatter_grad", + "pd_op.partial_send_grad", + "pd_op.partial_recv_grad", + "pd_op.partial_allgather_grad", + "pd_op.recv_v2_grad", + "pd_op.send_v2_grad", + "pd_op.mp_allreduce_sum_grad", + "pd_op.barrier_grad", + "pd_op.alltoall_grad", + "pd_op.global_gather_grad", + "pd_op.distributed_fused_lamb_grad", + "pd_op.margin_cross_entropy_grad", + "pd_op.margin_cross_entropy_grad_" + "pd_op.sync_batch_norm_grad", + "pd_op.data_norm_grad", + "pd_op.class_center_sample_grad", + "pd_op.all_to_all_grad", + "pd_op.dist_concat_grad", + "pd_op.all_gather_grad", + "pd_op.broadcast_grad", + "pd_op.p_recv_grad", + "pd_op.p_send_grad", + "pd_op.reduce_scatter_grad", + "pd_op.all_reduce_grad", + "pd_op.reduce_grad"}; int64_t nccl_op_num = 0; for (auto& ins : vec_instruction_base_) { if (nccl_op_set.count(ins->Name())) { @@ -512,7 +511,7 @@ void NewIRInterpreter::BuildInstruction() { } else if (op->dialect()->name() == "pd_kernel") { auto op_name = op->attributes() .at("op_name") - .dyn_cast<::ir::StrAttribute>() + .dyn_cast<::pir::StrAttribute>() .AsString(); if (interpreter::GetSpecialOpNames().count(op_name)) { VLOG(6) << "skip 
process " << op_name; @@ -542,7 +541,7 @@ void NewIRInterpreter::BuildInstruction() { variable_2_var_name_)); } #ifdef PADDLE_WITH_CINN - } else if (op->dialect()->name() == "cinn") { + } else if (op->dialect()->name() == "cinn_runtime") { vec_instruction_base_.emplace_back( std::make_unique(op_idx++, place_, op, scope_)); #endif @@ -634,7 +633,7 @@ void NewIRInterpreter::BuildInstructionDependences() { void NewIRInterpreter::RecordMemcpyD2H(InstructionBase* instr_node) { // NOTE(zhiqiu): hot fix for jit input var - if (instr_node->Name() == "pd.memcpy_d2h") { + if (instr_node->Name() == "pd_op.memcpy_d2h") { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* default_dev_ctx = pool.Get(place_); for (auto& event : instr_node->EventsToWait()) { @@ -781,14 +780,14 @@ void NewIRInterpreter::CalculateLastLiveOps() { InstructionBase* instr = vec_instruction_base_[op_idx].get(); std::set gc_check_vars; - const std::unordered_map<::ir::Value, std::vector>& ins = + const std::unordered_map<::pir::Value, std::vector>& ins = instr->Inputs(); - const std::unordered_map<::ir::Value, std::vector>& outs = + const std::unordered_map<::pir::Value, std::vector>& outs = instr->Outputs(); - std::unordered_multimap<::ir::Value, std::vector> ins_and_outs{ + std::unordered_multimap<::pir::Value, std::vector> ins_and_outs{ ins.begin(), ins.end()}; - if (instr->Name() != "pd.fetch") { + if (instr->Name() != "pd_op.fetch") { ins_and_outs.insert(outs.begin(), outs.end()); } @@ -879,7 +878,8 @@ void NewIRInterpreter::ConstructEventForJitInput() { for (size_t i = 0; i < dependecy_count_->size(); ++i) { if ((*dependecy_count_)[i] == 0) { InstructionBase* inst = vec_instruction_base_[i].get(); - if (inst->Name() == "pd.memcpy_d2h" && platform::is_gpu_place(place_)) { + if (inst->Name() == "pd_op.memcpy_d2h" && + platform::is_gpu_place(place_)) { for (auto& item : inst->Inputs()) { for (auto var_id : item.second) { auto name = GetNameById(var_id); @@ -919,13 +919,13 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, // Build std::stringstream ss; ss << this; - ::ir::BuildScope(*ir_program_->block(), - InnerScope(), - ss.str(), - &value_2_var_name_, - &variable_2_var_name_, - &var_name_2_id_, - &variable_list_); + ::pir::BuildScope(*ir_program_->block(), + InnerScope(), + ss.str(), + &value_2_var_name_, + &variable_2_var_name_, + &var_name_2_id_, + &variable_list_); interpreter::BuildId2VarName(var_name_2_id_, &id_2_var_name_); @@ -1240,6 +1240,10 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(5) << "after run kernel"; instr_node->RecordEvent(place_); } catch (platform::EnforceNotMet& ex) { + auto* op = instr_node->Operation(); + const std::vector op_callstack_attr = + interpreter::GetInstructionCallStack(op->name(), op->attributes()); + framework::InsertCallStackInfo(op->name(), op_callstack_attr, &ex); LOG(WARNING) << instr_node->Name() << " raises an EnforceNotMet exception " << platform::demangle(typeid(ex).name()) << ", " << ex.what(); exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); @@ -1281,7 +1285,7 @@ void NewIRInterpreter::PreAnalysis() { VLOG(4) << "Done UpdateNcclOpNum"; } -::ir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) { +::pir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) { for (auto kv : value_2_var_name_) { if (kv.second == var_name) { return kv.first; @@ -1293,16 +1297,16 @@ ::ir::Value NewIRInterpreter::GetValueByName(const std::string& var_name) { void 
NewIRInterpreter::SolvePersisableVarNames() { VLOG(6) << "SolvePersisableVarNames"; for (auto kv : value_2_var_name_) { - ::ir::Value value = kv.first; + ::pir::Value value = kv.first; const std::string& var_name = kv.second; - ::ir::OpResult result = value.dyn_cast<::ir::OpResult>(); + ::pir::OpResult result = value.dyn_cast<::pir::OpResult>(); auto* defining_op = value.GetDefiningOp(); if (defining_op->HasAttribute(kAttrIsPersisable)) { auto is_persisables = defining_op->attribute(kAttrIsPersisable) - .dyn_cast<::ir::ArrayAttribute>() + .dyn_cast<::pir::ArrayAttribute>() .AsVector(); if (is_persisables[result.GetResultIndex()] - .dyn_cast<::ir::BoolAttribute>() + .dyn_cast<::pir::BoolAttribute>() .data()) { VLOG(6) << "parameter_var_names_ include: " << var_name; parameter_var_names_.insert(var_name); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index b37b26d107560..c0681a277d5f7 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/value.h" namespace ir { class Program; @@ -36,7 +36,7 @@ class NewIRInterpreter : public InterpreterBaseImpl { public: NewIRInterpreter(const platform::Place& place, const std::vector& fetch_var_names, - std::unique_ptr<::ir::Program> ir_prog, + std::unique_ptr<::pir::Program> ir_prog, Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); @@ -184,7 +184,7 @@ class NewIRInterpreter : public InterpreterBaseImpl { void RecordMemcpyD2H(InstructionBase* instr_node); - ::ir::Value GetValueByName(const std::string& var_name); + ::pir::Value GetValueByName(const std::string& var_name); void CheckGC(InstructionBase* instr); @@ -198,11 +198,11 @@ class NewIRInterpreter : public InterpreterBaseImpl { InstructionSchedulingPriorityLess ir_instruction_scheduling_priority_less; - std::unique_ptr<::ir::Program> ir_program_{nullptr}; + std::unique_ptr<::pir::Program> ir_program_{nullptr}; std::vector> vec_instruction_base_; - std::unordered_map<::ir::Value, std::string> value_2_var_name_; + std::unordered_map<::pir::Value, std::string> value_2_var_name_; std::unordered_map variable_2_var_name_; diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index f59d5812273c3..a29e45515d894 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -32,6 +32,10 @@ #include "paddle/phi/backends/device_manager.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif namespace paddle { @@ -1204,10 +1208,18 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { auto operator_base_ptr = instr.OpBase(); if ((operator_base_ptr->Type() == "send_v2") && (operator_base_ptr->Attr("use_calc_stream") == false)) { - stream = platform::NCCLCommContext::Instance() - .Get(operator_base_ptr->Attr("ring_id"), - 
instr.DeviceContext().GetPlace()) - ->stream(); + int ring_id = operator_base_ptr->Attr("ring_id"); + if (FLAGS_dynamic_static_unified_comm) { + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + stream = static_cast( + comm_context_manager.Get(std::to_string(ring_id))) + ->GetStream(); + } else { + stream = platform::NCCLCommContext::Instance() + .Get(ring_id, instr.DeviceContext().GetPlace()) + ->stream(); + } } #endif auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) { diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index ed109f9cd0b96..a2ae422b814a3 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -19,13 +19,13 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" -#include "paddle/fluid/ir/transforms/inplace_pass.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/pass/pass.h" -#include "paddle/ir/pass/pass_manager.h" +#include "paddle/fluid/pir/transforms/inplace_pass.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_manager.h" PHI_DECLARE_bool(enable_new_ir_in_executor); PHI_DECLARE_bool(enable_new_ir_api); @@ -54,7 +54,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, for (const auto& job : jobs) { const std::string& job_type = job->Type(); std::shared_ptr program = nullptr; - std::shared_ptr<::ir::Program> ir_program = nullptr; + std::shared_ptr<::pir::Program> ir_program = nullptr; if (FLAGS_enable_new_ir_api) { ir_program = plan_.IrProgram(job_type); } else { @@ -79,18 +79,18 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, // TODO(phlrain) we only support cpu for now if (FLAGS_enable_new_ir_in_executor) { - std::shared_ptr<::ir::Program> base_program = ir_program; + std::shared_ptr<::pir::Program> base_program = ir_program; if (!FLAGS_enable_new_ir_api) { VLOG(6) << "begin to translate" << std::endl; base_program = paddle::TranslateLegacyProgramToProgram(*program); } auto block = base_program->block(); for (auto it = block->begin(); it != block->end(); ++it) { - if ((*it)->name() == "pd.fetch") { + if ((*it)->name() == "pd_op.fetch") { size_t index = (*it) ->attributes() .at("col") - .dyn_cast() + .dyn_cast() .data(); if (fetch_var_names_.size() < index + 1) { @@ -100,7 +100,7 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, fetch_var_names_[index] = (*it) ->attributes() .at("name") - .dyn_cast() + .dyn_cast() .AsString() + "@fetch"; } @@ -109,8 +109,8 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); if (FLAGS_new_ir_apply_inplace_pass) { - ir::PassManager pm(ir::IrContext::Instance(), 3); - pm.AddPass(ir::CreateInplacePass()); + pir::PassManager pm(pir::IrContext::Instance(), 3); + pm.AddPass(pir::CreateInplacePass()); pm.Run(kernel_program.get()); } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index bec52add981bf..e9ee5509d20be 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ 
b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" -#include "paddle/ir/core/program.h" +#include "paddle/pir/core/program.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc index b9a7aad1fdf4a..f7b60af104747 100644 --- a/paddle/fluid/framework/op_call_stack.cc +++ b/paddle/fluid/framework/op_call_stack.cc @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_call_stack.h" - #include - #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -34,7 +32,7 @@ std::string InsertIndentationIntoEachLine(const std::string &str) { } void InsertCallStackInfo(const std::string &type, - const AttributeMap &attrs, + const paddle::framework::AttributeMap &attrs, platform::EnforceNotMet *exception) { if (attrs.count("sub_block") != 0) { return; @@ -76,6 +74,39 @@ void InsertCallStackInfo(const std::string &type, exception->set_error_str(sout.str()); } +void InsertCallStackInfo(const std::string &type, + const std::vector &callstack_attr_str, + platform::EnforceNotMet *exception) { + const std::vector *callstack = &callstack_attr_str; + std::ostringstream sout; + // Step 1. Construct python call stack string + if (callstack) { + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } + for (auto &line : *callstack) { + sout << "\n " << line; + } + } + VLOG(1) << exception->error_str(); + // Step 2. Construct final call stack & append error op name + if (FLAGS_call_stack_level > 1) { + sout << exception->what(); + } else { + // If callstack exists, use err_str_ instead sub_err_str_ + if (callstack) { + sout << "\n\n"; + sout << InsertIndentationIntoEachLine(exception->error_str()); + } else { + sout << exception->simple_error_str(); + } + } + sout << " [operator < " << type << " > error]"; + exception->set_error_str(sout.str()); +} + void AppendErrorOpHint(const std::string &type, platform::EnforceNotMet *exception) { std::ostringstream sout; diff --git a/paddle/fluid/framework/op_call_stack.h b/paddle/fluid/framework/op_call_stack.h index 0cd10df89b86c..9f9ecd14ef8be 100644 --- a/paddle/fluid/framework/op_call_stack.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -24,7 +24,11 @@ namespace framework { // insert python call stack & append error op for exception message void InsertCallStackInfo(const std::string &type, - const AttributeMap &attrs, + const paddle::framework::AttributeMap &attrs, + platform::EnforceNotMet *exception); + +void InsertCallStackInfo(const std::string &type, + const std::vector &callstack_attr_str, platform::EnforceNotMet *exception); // only append error op for exception message diff --git a/paddle/fluid/framework/op_call_stack_test.cc b/paddle/fluid/framework/op_call_stack_test.cc index 23bb25270ccc8..dee60aa0fe3de 100644 --- a/paddle/fluid/framework/op_call_stack_test.cc +++ b/paddle/fluid/framework/op_call_stack_test.cc @@ -44,6 +44,7 @@ TEST(OpCallStack, InsertCallStackInfo) { stack_test_vec.emplace_back(stack_test_str); attr_map["op_callstack"] = stack_test_vec; paddle::framework::InsertCallStackInfo("test", attr_map, &exception); + paddle::framework::InsertCallStackInfo("test", stack_test_vec, &exception); 
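The second InsertCallStackInfo call above exercises the new overload, which takes the already-extracted op_callstack strings instead of an AttributeMap; this is the path the PIR interpreter's RunInstructionBase catch block feeds via interpreter::GetInstructionCallStack. A rough standalone sketch of the two-step formatting the overload performs (FormatCallStack and its parameters are illustrative stand-ins, not Paddle API):

```cpp
// Rough standalone sketch of the formatting done by the new
// InsertCallStackInfo(type, callstack_attr_str, exception) overload.
// FormatCallStack and call_stack_level are illustrative stand-ins; the
// real code also re-indents the error body via InsertIndentationIntoEachLine.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string FormatCallStack(const std::string& type,
                            const std::vector<std::string>& callstack,
                            const std::string& error_str,
                            int call_stack_level) {
  std::ostringstream sout;
  // Step 1: replay the Python frames stored in the op's callstack attribute.
  if (call_stack_level > 1) {
    sout << "\n\n  Compile Traceback (most recent call last):";
  } else {
    sout << "In user code:\n";
  }
  for (const auto& line : callstack) {
    sout << "\n  " << line;
  }
  // Step 2: append the C++ error text, then name the offending operator.
  sout << "\n\n" << error_str;
  sout << "  [operator < " << type << " > error]";
  return sout.str();
}

int main() {
  std::vector<std::string> frames = {
      "File \"train.py\", line 42, in <module>", "    out = model(x)"};
  std::cout << FormatCallStack(
      "matmul", frames, "InvalidArgumentError: ...\n", /*call_stack_level=*/1);
}
```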
std::string ex_msg = exception.what(); EXPECT_TRUE(ex_msg.find(stack_test_str) != std::string::npos); EXPECT_TRUE(ex_msg.find("[operator < test > error]") != std::string::npos); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index ab74b2691b062..a2eef6417870a 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/phi/common/complex.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" namespace paddle { @@ -964,7 +966,12 @@ struct SetAttrDescVisitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } - + void operator()(const std::vector &v) const { + // just do nothing. + } + void operator()(const std::vector &v) const { + // just do nothing. + } void operator()(const std::vector &v) const { std::vector var_names; for (auto var : v) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index e0ddafd37da70..ff898db3819f6 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -48,9 +48,9 @@ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/value.h" #include "paddle/phi/core/flags.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/flags.h" PHI_DECLARE_bool(enable_pe_launch_cinn); diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 961b7c1e663c0..4ad1bcb80c4bc 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/common/scalar.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" #include "paddle/utils/small_vector.h" #include "paddle/utils/variant.h" @@ -62,7 +64,9 @@ using Attribute = paddle::variant, double, paddle::experimental::Scalar, - std::vector>; + std::vector, + ::pir::Block*, + std::vector<::pir::Value>>; using AttributeMap = std::unordered_map; using OpCreator = diff --git a/paddle/fluid/framework/type_info.cc b/paddle/fluid/framework/type_info.cc index cb7dae540d119..03086f46ad216 100644 --- a/paddle/fluid/framework/type_info.cc +++ b/paddle/fluid/framework/type_info.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/raw_tensor.h" #include "paddle/fluid/framework/string_array.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h" +#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 48c9f79f34de1..da39c21e84c03 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -103,8 +103,8 @@ set(SHARED_INFERENCE_SRCS # NOTE(Aurelius84): For inference library, some DEPS is usless # such as non-infer operator related targets et.al. -list(REMOVE_ITEM fluid_modules cinn_dialect) -# NOTE(Aurelisu84): Remove ir dialect related target DEPS for inference +list(REMOVE_ITEM fluid_modules cinn_op_dialect) +# NOTE(Aurelisu84): Remove pir dialect related target DEPS for inference # shared library to prune library size. list(REMOVE_ITEM fluid_modules ${not_infer_modules}) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 83f75c1ae0703..1e3be4d0cfcd3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2010,12 +2010,9 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( } } else if (platform::is_custom_place(place_)) { auto custom_place = place_; - auto paddleplace = static_cast( - static_cast(PaddlePlace::kCUSTOM) + - phi::CustomRegisteredDeviceMap::Instance() - .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); - res->SetPlace( - paddleplace, custom_place.GetDeviceId(), place_.GetDeviceType()); + res->SetPlace(PaddlePlace::kCUSTOM, + custom_place.GetDeviceId(), + custom_place.GetDeviceType()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -2064,12 +2061,9 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } } else if (platform::is_custom_place(place_)) { auto custom_place = place_; - auto paddleplace = static_cast( - static_cast(PaddlePlace::kCUSTOM) + - phi::CustomRegisteredDeviceMap::Instance() - .GetOrRegisterGlobalDeviceTypeId(place_.GetDeviceType())); - res->SetPlace( - paddleplace, custom_place.GetDeviceId(), place_.GetDeviceType()); + res->SetPlace(PaddlePlace::kCUSTOM, + custom_place.GetDeviceId(), + custom_place.GetDeviceType()); } else { auto gpu_place = place_; res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); @@ -2893,6 +2887,7 @@ USE_TRT_CONVERTER(sign); #endif USE_TRT_CONVERTER(rsqrt); USE_TRT_CONVERTER(fused_preln_embedding_eltwise_layernorm) +USE_TRT_CONVERTER(prompt_tuning_emb_eltwise_layernorm); USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm); USE_TRT_CONVERTER(preln_skip_layernorm) USE_TRT_CONVERTER(fused_bias_dropout_residual_layer_norm) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 193e244f86e38..7a399bb55fe7b 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -244,16 +244,11 @@ void Tensor::CopyFromCpu(const T *data) { "Can not create tensor with XPU place because paddle is not compiled " "with XPU.")); #endif - } else { + } else if (place_ == PlaceType::kCUSTOM) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_type_id = - static_cast(place_) - 
static_cast(PlaceType::kCUSTOM); paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); - paddle::platform::CustomPlace custom_place( - phi::CustomRegisteredDeviceMap::Instance().GetGlobalDeviceType( - device_type_id), - device_); + paddle::platform::CustomPlace custom_place(device_type_, device_); auto *t_data = tensor->mutable_data(custom_place); auto *dev_ctx = static_cast( pool.Get(custom_place)); @@ -264,9 +259,15 @@ void Tensor::CopyFromCpu(const T *data) { ele_size, dev_ctx->stream()); #else - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The analysis predictor supports CPU, GPU and XPU now.")); + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Can not create tensor with Custom place because paddle is not " + "compiled with CustomDevice.")); #endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The analysis predictor supports CPU, GPU, XPU and CUSTOM_DEVICE " + "now.")); } } @@ -355,6 +356,14 @@ void Tensor::ShareExternalData(const T *data, const_cast(data), size, paddle::platform::XPUPlace(device_)), meta); *tensor = std::move(dtensor); + } else if (place == PlaceType::kCUSTOM) { + phi::DenseTensor dtensor( + std::make_shared( + const_cast(data), + size, + paddle::platform::CustomPlace(device_type_, device_)), + meta); + *tensor = std::move(dtensor); } else { PADDLE_THROW(paddle::platform::errors::InvalidArgument( "PlaceType must be one of [PlaceType::kCPU, PlaceType::kGPU, " diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index ba71eff17387d..2058525946914 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -89,23 +89,24 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ "trt_support_nhwc_pass", - "adaptive_pool2d_convert_global_pass", // - "trt_map_ops_to_matrix_multiply_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // - "trt_delete_weight_dequant_linear_op_pass", // - "delete_quant_dequant_linear_op_pass", // - "identity_op_clean_pass", // - "add_support_int8_pass", // - "simplify_with_basic_ops_pass", // - "trt_embedding_eltwise_layernorm_fuse_pass", // - "preln_embedding_eltwise_layernorm_fuse_pass", // - "trt_multihead_matmul_fuse_pass_v2", // - "trt_multihead_matmul_fuse_pass_v3", // - "multihead_matmul_roformer_fuse_pass", // - "constant_folding_pass", // + "adaptive_pool2d_convert_global_pass", // + "trt_map_ops_to_matrix_multiply_pass", // + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // + "trt_delete_weight_dequant_linear_op_pass", // + "delete_quant_dequant_linear_op_pass", // + "identity_op_clean_pass", // + "add_support_int8_pass", // + "simplify_with_basic_ops_pass", // + "trt_prompt_tuning_embedding_eltwise_layernorm_fuse_pass", // + "trt_embedding_eltwise_layernorm_fuse_pass", // + "preln_embedding_eltwise_layernorm_fuse_pass", // + "trt_multihead_matmul_fuse_pass_v2", // + "trt_multihead_matmul_fuse_pass_v3", // + "multihead_matmul_roformer_fuse_pass", // + "constant_folding_pass", // #ifdef PADDLE_WITH_TENSORRT #if !IS_TRT_VERSION_GE(8610) "trt_flash_multihead_matmul_fuse_pass", // @@ -527,8 +528,9 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "one_beam_size_fuse_pass", 
"fold_interp_outsize_fuse_pass", "fold_two_squeeze2_fuse_pass", - "conv1d_xpu_fuse_pass", + // "conv1d_xpu_fuse_pass", "duplicated_transpose_fuse_pass", + "conv2d_bias_fuse_pass", "redundant_unsqueeze_squeeze_elimination_pass", "reduce_ops_fuse_pass", "delete_cast_op_pass", diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index dad1b073d51f2..2471c365e29ed 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -115,7 +115,7 @@ list( if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) list(APPEND CONVERT_FILES emb_eltwise_layernorm.cc - preln_emb_eltwise_layernorm.cc) + preln_emb_eltwise_layernorm.cc prompt_tuning_emb_eltwise_layernorm.cc) endif() if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) diff --git a/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc new file mode 100644 index 0000000000000..f6e99461695c5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/prompt_tuning_emb_eltwise_layernorm.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/utils.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/phi/core/ddim.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PromptTuningEmbEltwiseLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(4) << "convert fused_prompt_tuning_embedding_eltwise_layernorm op to " + "tensorrt layer"; + // get the persistable var's data + auto GetWeight = [&](const std::string& var_name, + framework::DDim* dim) -> TensorRTEngine::Weight { + auto* temp_var = scope.FindVar(var_name); + auto* temp_tensor = temp_var->GetMutable(); + *dim = temp_tensor->dims(); + auto weight = engine_->GetTrtWeight(var_name, *temp_tensor); + return weight; + }; + + framework::OpDesc op_desc(op, nullptr); + auto* dense_vector = engine_->GetITensor(op_desc.Input("DenseVector")[0]); + + auto pos_id_name = engine_->tensorrt_transformer_posid(); + auto mask_id_name = engine_->tensorrt_transformer_maskid(); + + // bool with_fp16 = engine_->WithFp16() && + // !engine_->disable_trt_plugin_fp16(); + // int hidden = 0; + // Declare inputs + std::vector input_ids; + + // Declare inputs_weight + std::vector input_embs; + std::vector emb_sizes; + TensorRTEngine::Weight weight; + framework::DDim emb_dims; + framework::DDim bias_dims, scale_dims; + TensorRTEngine::Weight bias_weight, scale_weight; + + int64_t bias_size = phi::product(bias_dims); + int64_t scale_size = phi::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); + + std::vector id_names = op_desc.Input("Ids"); + std::vector emb_names = op_desc.Input("Embs"); + int input_num = id_names.size(); + + engine_->SetITensor("pos_id", engine_->GetITensor(pos_id_name)); + engine_->SetITensor("mask_id", engine_->GetITensor(mask_id_name)); + for (int i = 0; i < input_num; i++) { + auto input_tensor = engine_->GetITensor(id_names[i]); + weight = GetWeight(emb_names[i], &emb_dims); + if (id_names[i] == pos_id_name) { + input_ids.insert(input_ids.begin(), input_tensor); + input_embs.insert(input_embs.begin(), weight.get()); + emb_sizes.insert(emb_sizes.begin(), weight.get().count); + } else { + input_ids.push_back(input_tensor); + input_embs.push_back(weight.get()); + emb_sizes.push_back(weight.get().count); + } + } + bias_weight = GetWeight(op_desc.Input("Bias").front(), &bias_dims); + scale_weight = GetWeight(op_desc.Input("Scale").front(), &scale_dims); + bias_size = phi::product(bias_dims); + scale_size = phi::product(scale_dims); + // other_id(except pos_id) + engine_->SetITensor("word_id", input_ids[1]); + + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; + } + PADDLE_ENFORCE_EQ( + output_fp16, + 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when inferring " + "ernie(bert) model with config.EnableVarseqlen(). 
" + "But Precision::KFloat32 is setted.")); + + std::vector fields; + std::vector temp_fields_keys; + fields.emplace_back("bert_embeddings_layernorm_beta", + bias_weight.get().values, + GetPluginFieldType(bias_weight.get().type), + static_cast(bias_size)); + fields.emplace_back("bert_embeddings_layernorm_gamma", + scale_weight.get().values, + GetPluginFieldType(scale_weight.get().type), + static_cast(scale_size)); + fields.emplace_back( + "output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1); + for (int i = 0; i < input_num; ++i) { + temp_fields_keys.push_back("bert_embeddings_word_embeddings_" + + std::to_string(i)); + fields.emplace_back(temp_fields_keys.rbegin()->c_str(), + input_embs[i].values, + GetPluginFieldType(input_embs[i].type), + static_cast(emb_sizes[i])); + } + + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs = input_ids; + plugin_inputs.emplace_back( + engine_->GetITensor("mask_id")); // input mask_id + + plugin_inputs.emplace_back(dense_vector); // prompt_tuning'dense_vector + + auto creator = GetPluginRegistry()->getPluginCreator( + "PromptTuningEmbLayerNormVarlenPluginDynamic", "1"); + auto plugin_obj = creator->createPlugin( + "PromptTuningEmbLayerNormVarlenPluginDynamic", plugin_ptr); + + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + + plugin_layer->setName( + ("PromptTuningEmbLayerNormVarlenPluginDynamicV1(Output: " + + op_desc.Output("Out")[0] + ")") + .c_str()); + free(plugin_ptr); + if (enable_int8) { + float out_scale = + PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(plugin_layer->getOutput(0), + out_scale); // output + engine_->SetTensorDynamicRange(plugin_layer->getOutput(1), + out_scale); // mask + engine_->SetTensorDynamicRange(plugin_layer->getOutput(2), + out_scale); // max seqlen + } + + engine_->DeleteITensor("mask_id", engine_->GetITensor("mask_id")); + engine_->DeleteITensor("pos_id", engine_->GetITensor("pos_id")); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(plugin_layer, + "PromptTuningEmbLayerNormVarlenPluginDynamicV1", + {output_name, + std::string("qkv_plugin_mask"), + std::string("max_seqlen_tensor"), + std::string("mask_id"), + std::string("pos_id")}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(prompt_tuning_emb_eltwise_layernorm, + PromptTuningEmbEltwiseLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 2378cf97be982..a00d97a21fb47 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2870,6 +2870,7 @@ struct SimpleOpTypeSetTeller : public Teller { "relu6", "hard_sigmoid", "clip", + "prompt_tuning_emb_eltwise_layernorm", "fused_embedding_eltwise_layernorm", "multihead_matmul", "multihead_matmul_roformer", @@ -3036,6 +3037,7 @@ struct SimpleOpTypeSetTeller : public Teller { "relu6", "hard_sigmoid", "clip", + "prompt_tuning_emb_eltwise_layernorm", "fused_embedding_eltwise_layernorm", "multihead_matmul", "multihead_matmul_roformer", diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 
b1df5a733623e..bfc9e6b9072da 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -40,7 +40,9 @@ list( elementwiseadd_transpose_op_plugin.cu generic_plugin.cu many_emb_layernorm_plugin.cu - many_emb_layernorm_kernel.cu) + many_emb_layernorm_kernel.cu + prompt_tuning_emb_layernorm_varseqlen_kernel_hface.cu + prompt_tuning_emb_layernorm_varseqlen_plugin.cu) if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7) list(APPEND TRT_FILES many_emb_layernorm_varseqlen_plugin.cu diff --git a/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_kernel_hface.cu b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_kernel_hface.cu new file mode 100644 index 0000000000000..919ff565870a8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_kernel_hface.cu @@ -0,0 +1,204 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & +// AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "NvInfer.h" +#include "paddle/fluid/inference/tensorrt/plugin/common/common.cuh" +#include "paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__global__ void prompt_tuning_embKernel(int32_t B, + int32_t ld, + int32_t const* inputIds0, + int32_t const* inputIds1, + int32_t const* inputIds2, + T const* dense_vector, + float const* beta, + float const* gamma, + T const* mIdsEmbDev0, + T const* mIdsEmbDev1, + T const* mIdsEmbDev2, + int32_t IdsSize0, + int32_t IdsSize1, + int32_t IdsSize2, + T* output, + int32_t* new_pos_id) { + cub::Sum pairSum; + int32_t const s = blockIdx.x; + int32_t const b = blockIdx.y; + int32_t const sumS = inputIds0[b]; + int32_t const s_b = inputIds0[b + 1] - inputIds0[b]; + + int32_t const new_sumS = sumS + b; + + // new pos_id: Add an id to each sentence + new_pos_id[b] = new_sumS; + + // last id + if (b == B - 1) { + new_pos_id[B] = inputIds0[B] + B; + } + + T const rld = T(1.f) / T(ld); + int32_t const seqPos = sumS + s; + int32_t const out_seqPos = new_sumS + s + 1; + int32_t const new_out_seqPos = new_sumS + s; + + kvp threadData(0, 0); + + int32_t const new_outoffset = new_out_seqPos * ld; + int32_t const prompt_tuning_offset = new_sumS * ld; + int32_t const dense_vector_offset = b * ld; + + if (s < s_b) { + extern __shared__ int32_t word_id[]; + if (threadIdx.x == 0) { + if (static_cast(inputIds1)[seqPos] < 0 || + static_cast(inputIds1)[seqPos] >= IdsSize1) { + printf( + "Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup " + "table: ID < 0 or ID > max "); + return; + } else { + word_id[0] = static_cast(inputIds1)[seqPos]; + } + + if (static_cast(inputIds2)[seqPos] < 0 || + static_cast(inputIds2)[seqPos] >= 
IdsSize2) { + printf( + "Error!!!!!!(embLayerNormVarSeqlenPlugin): ID cannot be lookup " + "table: ID < 0 or ID > max "); + return; + } else { + word_id[1] = static_cast(inputIds2)[seqPos]; + } + } + __syncthreads(); + + // 2. load pos/tok/word embeddings and add them toghether + // offset into embeddings is given by wordId * hidden_size + int32_t const poffset = blockIdx.x * ld; + int32_t const outoffset = out_seqPos * ld; + + // the output offset is given by b * (S*hidden_size) + s * hidden_size + + for (int32_t it = threadIdx.x; it < ld; it += TPB) { + T p(mIdsEmbDev0[poffset + it]); // pos id + T val = p; + int32_t const offset0 = word_id[0] * ld; + val += mIdsEmbDev1[offset0 + it]; + int32_t const offset1 = word_id[1] * ld; + val += mIdsEmbDev2[offset1 + it]; + output[outoffset + it] = val; + T const rldval = rld * val; + threadData = pairSum(threadData, kvp(rldval, rldval * val)); + } + // 3. layer norm on the sum + layerNorm(threadData, ld, outoffset, beta, gamma, output); + } else if (s == s_b) { + for (int32_t it = threadIdx.x; it < ld; it += TPB) { + T val = dense_vector[dense_vector_offset + it]; + output[prompt_tuning_offset + it] = val; + T const rldval = rld * val; + threadData = pairSum(threadData, kvp(rldval, rldval * val)); + // 3. layer norm on the sum + } + layerNorm( + threadData, ld, prompt_tuning_offset, beta, gamma, output); + + } else { + return; // This CTA has nothing to do + } +} + +template +int32_t prompt_tuning_emb(cudaStream_t stream, + int32_t ld, + int32_t B, + int32_t S, + int const* inputIds0, + int const* inputIds1, + int const* inputIds2, + T const* dense_vector, + int32_t nbLookupTables, + float const* beta, + float const* gamma, + T const* mIdsEmbDev0, + T const* mIdsEmbDev1, + T const* mIdsEmbDev2, + int32_t IdsSize0, + int32_t IdsSize1, + int32_t IdsSize2, + T* output, + int32_t* new_pos_id) { + constexpr int32_t tpb = 256; + dim3 const grid(S, B, 1); + dim3 const block(tpb, 1, 1); + size_t cache_size = sizeof(int32_t) * (nbLookupTables - 1); + prompt_tuning_embKernel + <<>>(B, + ld, + inputIds0, + inputIds1, + inputIds2, + dense_vector, + beta, + gamma, + mIdsEmbDev0, + mIdsEmbDev1, + mIdsEmbDev2, + IdsSize0, + IdsSize1, + IdsSize2, + output, + new_pos_id); + return cudaPeekAtLastError(); +} + +template int32_t prompt_tuning_emb(cudaStream_t, + int32_t, + int32_t, + int32_t, + int32_t const*, + int32_t const*, + int32_t const*, + half const*, + int32_t, + float const*, + float const*, + half const*, + half const*, + half const*, + int32_t, + int32_t, + int32_t, + half*, + int32_t*); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.cu new file mode 100644 index 0000000000000..64fde0785fdc7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.cu @@ -0,0 +1,562 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & +// AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h" +#include +#include +#include +#include "NvInfer.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +constexpr size_t threadsPerCta128 = 2 * 2 * 32; +constexpr size_t threadsPerCta256 = 1 * 4 * 32; +constexpr size_t threadsPerCta384 = 1 * 8 * 32; +// The number of xmmas in the M dimension. We use one uint32_t per XMMA in the M +// dimension: (s + 16*warps_m - 1) / (16*warps_m); +constexpr size_t xmmasM128 = 4; +constexpr size_t xmmasM256 = 16; +constexpr size_t xmmasM384 = 24; +// Packed mask size per batch. Layout is XMMAS_M * THREADS_PER_CTA. +constexpr size_t packedMaskSize128 = xmmasM128 * threadsPerCta128; +constexpr size_t packedMaskSize256 = xmmasM256 * threadsPerCta256; +constexpr size_t packedMaskSize384 = xmmasM384 * threadsPerCta384; +char const* EMB_LAYER_NORM_VAR_SEQLEN_VERSION_HFACE{"1"}; +char const* EMB_LAYER_NORM_VAR_SEQLEN_NAME{ + "PromptTuningEmbLayerNormVarlenPluginDynamic"}; +// Static class fields initialization +nvinfer1::PluginFieldCollection + TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator::mFC{}; +std::vector + TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator::mPluginAttributes; + +TrtPromptTuningEmbLayerNormVarSeqlenPluginBase:: + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase( + std::string const& name, + nvinfer1::DataType const type, + nvinfer1::Weights const& beta, + nvinfer1::Weights const& gamma, + const std::vector& IdsEmb) + : mLayerName(name), + mLd(beta.count), + mType(type), + mIdsEmb_(IdsEmb), + nbLookupTables_(static_cast(IdsEmb.size())) { + // Assuming Weights.count is the number of elements and not bytes + assert(beta.count == gamma.count); + mBeta.convertAndCopy(beta, nvinfer1::DataType::kFLOAT); + mGamma.convertAndCopy(gamma, nvinfer1::DataType::kFLOAT); + copyToDevice(&mGamma, sizeof(float) * mGamma.count, &mGammaDev); + copyToDevice(&mBeta, sizeof(float) * mBeta.count, &mBetaDev); + for (size_t i = 0; i < mIdsEmb_.size(); ++i) { + assert(mIdsEmb_[i].count % mLd == 0); + mIdsVocabSize.push_back(int32_t(mIdsEmb_[i].count / mLd)); + WeightsWithOwnership tem_weight; + tem_weight.convertAndCopy(mIdsEmb_[i], mType); + void* cudaMem{nullptr}; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMalloc(&cudaMem, getWeightsSize(tem_weight, mType))); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(cudaMem, + tem_weight.values, + getWeightsSize(tem_weight, mType), + cudaMemcpyHostToDevice)); + mIdsEmbPtrs.push_back(cudaMem); + } +} + +TrtPromptTuningEmbLayerNormVarSeqlenPluginBase:: + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase(std::string const& name, + void const* data, + size_t length) + : mLayerName(name), + mGammaDev(nullptr), + mBetaDev(nullptr), + mIdsEmbPtrs{}, + mIdsEmb_{} { + // Deserialize in the same order as serialization + deserialize_value(&data, &length, &mType); + deserialize_value(&data, &length, &mLd); + deserialize_value(&data, &length, &nbLookupTables_); + for (int32_t i = 0; i < nbLookupTables_; ++i) { + int32_t tem; + deserialize_value(&data, &length, &tem); + 
mIdsVocabSize.push_back(tem); + } + char const* d = static_cast(data); + mBeta.convertAndCopy(&d, mLd, nvinfer1::DataType::kFLOAT); + mGamma.convertAndCopy(&d, mLd, nvinfer1::DataType::kFLOAT); + for (int32_t i = 0; i < nbLookupTables_; ++i) { + nvinfer1::Weights pre_tem_weight; + pre_tem_weight.type = mType; + pre_tem_weight.count = mLd * size_t(mIdsVocabSize[i]); + const auto nbBytes = mLd * size_t(mIdsVocabSize[i]) * getElementSize(mType); + auto destBuf = new char[nbBytes]; + pre_tem_weight.values = destBuf; + std::copy_n(d, nbBytes, destBuf); + d += nbBytes; + mIdsEmb_.push_back(pre_tem_weight); + } +} + +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace:: + TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace( + std::string const& name, + nvinfer1::DataType const type, + nvinfer1::Weights const& beta, + nvinfer1::Weights const& gamma, + const std::vector& IdsEmb) + : TrtPromptTuningEmbLayerNormVarSeqlenPluginBase( + name, type, beta, gamma, IdsEmb) {} + +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace:: + TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace(std::string const& name, + void const* data, + size_t length) + : TrtPromptTuningEmbLayerNormVarSeqlenPluginBase(name, data, length) { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace deserialize"); +} + +// IPluginV2DynamicExt Methods +nvinfer1::IPluginV2DynamicExt* +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::clone() const noexcept { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace clone"); + auto p = new TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace( + mLayerName, mType, mBeta, mGamma, mIdsEmb_); + p->setPluginNamespace(mNamespace.c_str()); + return p; +} + +nvinfer1::DimsExprs +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::getOutputDimensions( + int32_t outputIndex, + nvinfer1::DimsExprs const* inputs, + int32_t nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept { + for (int i = 1; i < nbInputs - 2; ++i) { + assert(inputs[i].nbDims == 1); // seq length + assert(inputs[i].nbDims == inputs[1].nbDims); // same shape + } + assert(inputs[0].nbDims == 1); // pos_id: B+1 + auto one = exprBuilder.constant(1); + auto Bplus1 = inputs[0].d[0]; // pos_id + auto B = + exprBuilder.operation(nvinfer1::DimensionOperation::kSUB, *Bplus1, *one); + if (outputIndex == 0) { + nvinfer1::DimsExprs ret; + ret.nbDims = 4; + ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[1].d[0], + *B); // sum of seq length + ret.d[1] = exprBuilder.constant(mLd); + ret.d[2] = exprBuilder.constant(1); + ret.d[3] = exprBuilder.constant(1); + return ret; + } else if (outputIndex == 1) { + // This is a hack: we just report some mask size and rely the plugins to + // play nicely together. + // At runtime, depending on the actual maxSeqlen, the size might be + // different. 
+ int32_t maskSize_ = packedMaskSize384; + auto maskSize = exprBuilder.constant(maskSize_); + auto fp16maskSize = + exprBuilder.operation(nvinfer1::DimensionOperation::kPROD, + *maskSize, + *exprBuilder.constant(2)); + nvinfer1::DimsExprs ret; + ret.nbDims = 2; + ret.d[0] = B; + ret.d[1] = fp16maskSize; + return ret; + } else if (outputIndex == 2) { + nvinfer1::DimsExprs ret; + ret.nbDims = 1; + ret.d[0] = exprBuilder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[nbInputs - 2].d[1], + *one); // max seqlen + return ret; + } else if (outputIndex == 3) { + nvinfer1::DimsExprs ret = inputs[nbInputs - 2]; // new mask_id + ret.d[1] = exprBuilder.operation( + nvinfer1::DimensionOperation::kSUM, *inputs[nbInputs - 2].d[1], *one); + return ret; + } else if (outputIndex == 4) { + nvinfer1::DimsExprs ret = inputs[0]; // new pos_id + return ret; + } +} + +bool TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::supportsFormatCombination( + int32_t pos, + nvinfer1::PluginTensorDesc const* inOut, + int32_t nbInputs, + int32_t nbOutputs) noexcept { + assert(nbOutputs == 5); + nvinfer1::PluginTensorDesc const& desc = inOut[pos]; + if (desc.format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos == 0) { // pos_id + return desc.dims.nbDims == 1 && desc.type == nvinfer1::DataType::kINT32; + } + if (pos == 1) { // input_id + return desc.dims.nbDims == 1 && desc.type == nvinfer1::DataType::kINT32; + } + nvinfer1::PluginTensorDesc const& prev = inOut[1]; // input_ids + if (1 < pos && + pos < (nbInputs - 2)) { // other ids: check it's the same as input_ids + return desc.type == prev.type && desc.dims.nbDims == 1 && + desc.dims.d[0] == prev.dims.d[0]; + } + if (pos == nbInputs - 2) { // mask id + return desc.type == mType; + } + if (pos == nbInputs - 1) { // dense vector + return desc.type == mType; + } + // embedded sequence + if (pos == nbInputs) { + return desc.type == mType && desc.dims.nbDims == 4 && desc.dims.d[2] == 1 && + desc.dims.d[3] == 1; + } + // mask(HFace) + if (pos == nbInputs + 1) { + return desc.type == mType; + } + // max seqlen + if (pos == nbInputs + 2) { + return desc.type == mType; + } + // new mask_id + if (pos == nbInputs + 3) { + return desc.type == mType; + } + // new pos_id + if (pos == nbInputs + 4) { + return desc.dims.nbDims == 1 && desc.type == nvinfer1::DataType::kINT32; + } +} + +void checkConfigurationInputs(nvinfer1::DynamicPluginTensorDesc const* inputs, + int32_t nbInputs, + nvinfer1::DynamicPluginTensorDesc const* outputs, + int32_t nbOutputs) noexcept { + // Validate input arguments + assert(nbOutputs == 5); + assert(inputs[0].desc.dims.nbDims == 1); + assert(inputs[0].desc.type == nvinfer1::DataType::kINT32); + for (int i = 1; i < nbInputs - 2; ++i) { + assert(inputs[i].desc.dims.nbDims == 1); + assert(inputs[i].desc.dims.d[0] == inputs[1].desc.dims.d[0]); + assert(inputs[i].desc.type == nvinfer1::DataType::kINT32); + } + assert(outputs[0].desc.dims.nbDims == 4); + assert(outputs[0].desc.dims.d[2] == 1); + assert(outputs[0].desc.dims.d[3] == 1); +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::configurePlugin( + nvinfer1::DynamicPluginTensorDesc const* inputs, + int32_t nbInputs, + nvinfer1::DynamicPluginTensorDesc const* outputs, + int32_t nbOutputs) noexcept { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace configurePlugin"); + checkConfigurationInputs(inputs, nbInputs, outputs, nbOutputs); + assert(static_cast(outputs[0].desc.dims.d[1]) == + static_cast(mLd)); + int32_t const B = inputs[0].desc.dims.d[0] - 
1; + // check mask + assert(outputs[1].desc.dims.nbDims == 2); + if (B > 0) { + assert(outputs[1].desc.dims.d[0] == B); + } + assert((outputs[1].desc.dims.d[1] == 2 * packedMaskSize384) || + (outputs[1].desc.dims.d[1] == 2 * packedMaskSize128) || + (outputs[1].desc.dims.d[1] == 2 * packedMaskSize256)); + assert(outputs[0].desc.type == mType); + assert(outputs[1].desc.type == nvinfer1::DataType::kHALF); +} + +size_t TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getWorkspaceSize( + nvinfer1::PluginTensorDesc const* inputs, + int32_t nbInputs, + nvinfer1::PluginTensorDesc const* outputs, + int32_t nbOutputs) const noexcept { + return 0; +} + +int32_t TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::enqueue( + nvinfer1::PluginTensorDesc const* inputDesc, + nvinfer1::PluginTensorDesc const* outputDesc, + void const* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) noexcept { + int32_t batchSize = inputDesc[0].dims.d[0] - 1; + // read out the maximum sequence length from the dummy input + int32_t const maxSeqlen = inputDesc[nbLookupTables_].dims.d[1] + 1; + int32_t S; + if (maxSeqlen <= 128) { + S = 128; + } else if (maxSeqlen <= 192) { + S = 192; + } else if (maxSeqlen <= 256) { + S = 256; + } else if (maxSeqlen <= 384) { + S = 384; + } else if (maxSeqlen <= 512) { + S = 512; + } else { + PADDLE_THROW(platform::errors::Fatal("The max seqlenth is 512.")); + } + const float* beta = mBetaDev.get(); + const float* gamma = mGammaDev.get(); + + auto output = static_cast(outputs[0]); + auto new_pos_id = static_cast(outputs[4]); + return prompt_tuning_emb(stream, + static_cast(mLd), + batchSize, + S, + static_cast(inputs[0]), + static_cast(inputs[1]), + static_cast(inputs[2]), + static_cast(inputs[4]), + nbLookupTables_, + beta, + gamma, + static_cast(mIdsEmbPtrs[0]), + static_cast(mIdsEmbPtrs[1]), + static_cast(mIdsEmbPtrs[2]), + mIdsVocabSize[0], + mIdsVocabSize[1], + mIdsVocabSize[2], + output, + new_pos_id); +} + +// IPluginV2Ext Methods +nvinfer1::DataType +TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept { + assert(mType == nvinfer1::DataType::kHALF); + if (index == 0) { + return mType; + } else if (index == 1) { + return mType; + } else if (index == 2) { + return mType; + } else if (index == 3) { + return mType; + } else if (index == 4) { + return nvinfer1::DataType::kINT32; + } +} + +// IPluginV2 Methods +char const* TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getPluginType() + const noexcept { + return EMB_LAYER_NORM_VAR_SEQLEN_NAME; +} + +char const* TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::getPluginVersion() + const noexcept { + return EMB_LAYER_NORM_VAR_SEQLEN_VERSION_HFACE; +} + +int32_t TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getNbOutputs() + const noexcept { + return 5; +} + +int32_t TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::initialize() noexcept { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace initialize"); + return 0; +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::terminate() noexcept { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace terminate"); +} + +size_t TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getSerializationSize() + const noexcept { + size_t const wordSize = getElementSize(mType); + return 2 * sizeof(float) * mLd // beta + gamma + + sizeof(mType) // + + sizeof(mLd) // + + mIdsVocabSize.size() * sizeof(mIdsVocabSize[0]) // + + wordSize 
* mLd * + accumulate( + mIdsVocabSize.begin(), mIdsVocabSize.end(), 0) // ids emb + + sizeof(nbLookupTables_); // numbers of lookup_table +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::serialize( + void* buffer) const noexcept { + serialize_value(&buffer, mType); + serialize_value(&buffer, mLd); + serialize_value(&buffer, nbLookupTables_); + for (size_t i = 0; i < mIdsVocabSize.size(); ++i) { + serialize_value(&buffer, mIdsVocabSize[i]); + } + char* d = static_cast(buffer); + size_t const wordSize = getElementSize(mType); + serFromDev(&d, mBetaDev.get(), mLd); + serFromDev(&d, mGammaDev.get(), mLd); + for (size_t i = 0; i < mIdsEmbPtrs.size(); ++i) { + serFromDev(&d, + static_cast(mIdsEmbPtrs[i]), + mLd * mIdsVocabSize[i] * wordSize); + } +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::destroy() noexcept { + // This gets called when the network containing plugin is destroyed + mBetaDev.reset(nullptr); + mGammaDev.reset(nullptr); + for (size_t i = 0; i < mIdsEmbPtrs.size(); ++i) { + cudaFree(mIdsEmbPtrs[i]); + } + delete this; +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace::destroy() noexcept { + TRANSFORMER_DEBUG_MSG( + "TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace destroy"); + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::destroy(); +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::setPluginNamespace( + char const* libNamespace) noexcept { + mNamespace = libNamespace; +} + +char const* TrtPromptTuningEmbLayerNormVarSeqlenPluginBase::getPluginNamespace() + const noexcept { + return mNamespace.c_str(); +} + +TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator:: + TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator() = default; + +char const* +TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator::getPluginName() + const noexcept { + return EMB_LAYER_NORM_VAR_SEQLEN_NAME; +} + +char const* +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFaceCreator::getPluginVersion() + const noexcept { + return EMB_LAYER_NORM_VAR_SEQLEN_VERSION_HFACE; +} + +nvinfer1::PluginFieldCollection const* +TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator:: + getFieldNames() noexcept { + return &mFC; +} + +bool initializeFields(nvinfer1::PluginFieldCollection const* fc, + nvinfer1::Weights* beta, + nvinfer1::Weights* gamma, + std::vector* IdsEmb) { + bool output_fp16 = false; + for (int32_t i = 0; i < fc->nbFields; i++) { + std::string field_name(fc->fields[i].name); + if (field_name.compare("bert_embeddings_layernorm_beta") == 0) { + TRANSFORMER_DEBUG_MSG("Building bert_embeddings_layernorm_beta..."); + beta->values = fc->fields[i].data; + beta->count = fc->fields[i].length; + beta->type = fieldTypeToDataType(fc->fields[i].type); + } + + if (field_name.compare("bert_embeddings_layernorm_gamma") == 0) { + TRANSFORMER_DEBUG_MSG("Building bert_embeddings_layernorm_gamma..."); + gamma->values = fc->fields[i].data; + gamma->count = fc->fields[i].length; + gamma->type = fieldTypeToDataType(fc->fields[i].type); + } + + if (field_name.compare("output_fp16") == 0) { + TRANSFORMER_DEBUG_MSG("Building output_fp16..."); + assert(fc->fields[i].type == nvinfer1::PluginFieldType::kINT32); + output_fp16 = static_cast(fc->fields[i].data)[0] != 0; + } + if (field_name.compare("bert_embeddings_word_embeddings_" + + std::to_string(i - 3)) == 0) { + TRANSFORMER_DEBUG_MSG( + ("bert_embeddings_word_embeddings_" + std::to_string(i - 3)).c_str()); + nvinfer1::Weights tem; + tem.values = fc->fields[i].data; + tem.count = fc->fields[i].length; + tem.type = 
fieldTypeToDataType(fc->fields[i].type); + IdsEmb->push_back(tem); + } + } + return output_fp16; +} + +nvinfer1::IPluginV2* +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFaceCreator::createPlugin( + char const* name, nvinfer1::PluginFieldCollection const* fc) noexcept { + TRANSFORMER_DEBUG_MSG("EmbLayerNormVarSeqlenHFace createPlugin"); + nvinfer1::Weights beta; + nvinfer1::Weights gamma; + std::vector IdsEmb; + bool output_fp16 = initializeFields(fc, &beta, &gamma, &IdsEmb); + TRANSFORMER_DEBUG_MSG("Building the Plugin..."); + TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace* p = + new TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace( + name, + output_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + beta, + gamma, + IdsEmb); + return p; +} + +nvinfer1::IPluginV2* +TrtPromptTuningEmbLayerNormVarSeqlenPluginHFaceCreator::deserializePlugin( + char const* name, void const* serialData, size_t serialLength) noexcept { + return new TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace( + name, serialData, serialLength); +} + +void TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator::setPluginNamespace( + char const* libNamespace) noexcept { + mNamespace = libNamespace; +} + +char const* +TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator::getPluginNamespace() + const noexcept { + return mNamespace.c_str(); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h new file mode 100644 index 0000000000000..b479a992bbb5d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/prompt_tuning_emb_layernorm_varseqlen_plugin.h @@ -0,0 +1,189 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & +// AFFILIATES. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
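Since deserializePlugin above reconstructs the plugin straight from the serialized blob, the byte layout that serialize() and getSerializationSize() agree on is worth spelling out: scalars and per-table vocab sizes first, then the fp32 beta/gamma vectors, then the raw embedding tables. A standalone sketch of the size computation (the enum and size_t field widths are platform assumptions here, not TensorRT guarantees):

```cpp
// Standalone sketch of the serialized-blob size that serialize() and
// getSerializationSize() above must agree on. The sizeof assumptions for
// the enum (mType) and size_t (mLd) members are platform-specific.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

size_t PluginBlobSize(size_t ld,                         // hidden size
                      const std::vector<int32_t>& vocab, // rows per table
                      size_t word_size) {                // 2 for fp16
  const size_t rows = std::accumulate(vocab.begin(), vocab.end(), size_t{0});
  return sizeof(int32_t)                   // mType (assumed 4-byte enum)
         + sizeof(size_t)                  // mLd
         + sizeof(int32_t)                 // nbLookupTables_
         + vocab.size() * sizeof(int32_t)  // mIdsVocabSize entries
         + 2 * sizeof(float) * ld          // beta + gamma, always fp32
         + word_size * ld * rows;          // the embedding tables themselves
}

int main() {
  // e.g. hidden size 768, pos/word/sent tables of 512/30522/2 rows, fp16:
  std::cout << PluginBlobSize(768, {512, 30522, 2}, 2) << " bytes\n";
}
```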
+ +#pragma once +#include +#include "NvInferPlugin.h" +#include "NvInferRuntime.h" + +#include "paddle/fluid/inference/tensorrt/plugin/common/bertCommon.h" +#include "paddle/fluid/inference/tensorrt/plugin/common/plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/common/serialize.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +int32_t prompt_tuning_emb(cudaStream_t, + int32_t, + int32_t, + int32_t, + int32_t const*, + int32_t const*, + int32_t const*, + T const*, + int32_t, + float const*, + float const*, + T const*, + T const*, + T const*, + int32_t, + int32_t, + int32_t, + T*, + int32_t*); +class TrtPromptTuningEmbLayerNormVarSeqlenPluginBase + : public nvinfer1::IPluginV2DynamicExt { + public: + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase( + std::string const& name, + nvinfer1::DataType const type, + nvinfer1::Weights const& beta, + nvinfer1::Weights const& gamma, + const std::vector& ids_emb); + + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase(std::string const& name, + void const* data, + size_t length); + + // It doesn't make sense to make TrtPromptTuningEmbLayerNormVarSeqlenPlugin + // without arguments, so we delete default constructor. + TrtPromptTuningEmbLayerNormVarSeqlenPluginBase() = delete; + + // IPluginV2DynamicExt Methods + bool supportsFormatCombination(int32_t pos, + nvinfer1::PluginTensorDesc const* inOut, + int32_t nbInputs, + int32_t nbOutputs) noexcept override; + size_t getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, + int32_t nbInputs, + nvinfer1::PluginTensorDesc const* outputs, + int32_t nbOutputs) const noexcept override; + + // IPluginV2Ext Methods + nvinfer1::DataType getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept override; + + // IPluginV2 Methods + char const* getPluginType() const noexcept override; + int32_t getNbOutputs() const noexcept override; + size_t getSerializationSize() const noexcept override; + void serialize(void* buffer) const noexcept override; + void destroy() noexcept override; + char const* getPluginNamespace() const noexcept override; + void setPluginNamespace(char const* pluginNamespace) noexcept override; + + protected: + std::string const mLayerName; + std::string mNamespace; + cuda_unique_ptr mGammaDev; + cuda_unique_ptr mBetaDev; + std::vector mIdsEmbPtrs; + size_t mLd; // leading dim = hidden size + std::vector mIdsVocabSize; + WeightsWithOwnership mBeta; + WeightsWithOwnership mGamma; + nvinfer1::DataType mType; + std::vector mIdsEmb_; + int32_t nbLookupTables_ = 0; +}; + +class TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace + : public TrtPromptTuningEmbLayerNormVarSeqlenPluginBase { + public: + TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace( + std::string const& name, + nvinfer1::DataType const type, + nvinfer1::Weights const& beta, + nvinfer1::Weights const& gamma, + const std::vector& ids_emb); + + TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace(std::string const& name, + void const* data, + size_t length); + + // It doesn't make sense to make TrtPromptTuningEmbLayerNormVarSeqlenPlugin + // without arguments, so we delete default constructor. 
+  TrtPromptTuningEmbLayerNormVarSeqlenPluginHFace() = delete;
+
+  // IPluginV2DynamicExt Methods
+  nvinfer1::IPluginV2DynamicExt* clone() const noexcept override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int32_t outputIndex,
+      const nvinfer1::DimsExprs* inputs,
+      int32_t nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) noexcept override;
+  void configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in,
+                       int32_t nbInputs,
+                       nvinfer1::DynamicPluginTensorDesc const* out,
+                       int32_t nbOutputs) noexcept override;
+  int32_t enqueue(nvinfer1::PluginTensorDesc const* inputDesc,
+                  nvinfer1::PluginTensorDesc const* outputDesc,
+                  void const* const* inputs,
+                  void* const* outputs,
+                  void* workspace,
+                  cudaStream_t stream) noexcept override;
+  // IPluginV2 Methods
+  int32_t initialize() noexcept override;
+  void terminate() noexcept override;
+  void destroy() noexcept override;
+  char const* getPluginVersion() const noexcept override;
+};
+
+class TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator
+    : public nvinfer1::IPluginCreator {
+ public:
+  TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator();
+
+  char const* getPluginName() const noexcept override;
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override;
+
+  void setPluginNamespace(char const* pluginNamespace) noexcept override;
+
+  char const* getPluginNamespace() const noexcept override;
+
+ protected:
+  static nvinfer1::PluginFieldCollection mFC;
+  static std::vector<nvinfer1::PluginField> mPluginAttributes;
+  std::string mNamespace;
+};
+
+class TrtPromptTuningEmbLayerNormVarSeqlenPluginHFaceCreator
+    : public TrtPromptTuningEmbLayerNormVarSeqlenPluginBaseCreator {
+ public:
+  nvinfer1::IPluginV2* createPlugin(
+      char const* name,
+      const nvinfer1::PluginFieldCollection* fc) noexcept override;
+  char const* getPluginVersion() const noexcept override;
+  nvinfer1::IPluginV2* deserializePlugin(char const* name,
+                                         void const* serialData,
+                                         size_t serialLength) noexcept override;
+};
+
+REGISTER_TRT_PLUGIN_V2(TrtPromptTuningEmbLayerNormVarSeqlenPluginHFaceCreator);
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/ir/dialect/CMakeLists.txt b/paddle/fluid/ir/dialect/CMakeLists.txt
deleted file mode 100644
index 7500642867f30..0000000000000
--- a/paddle/fluid/ir/dialect/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-add_subdirectory(paddle_dialect)
-add_subdirectory(paddle_kernel_dialect)
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/interface/CMakeLists.txt b/paddle/fluid/ir/dialect/paddle_dialect/interface/CMakeLists.txt
deleted file mode 100644
index 5ee2f3510ca93..0000000000000
--- a/paddle/fluid/ir/dialect/paddle_dialect/interface/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# All source files of pd_dialect, except for the source file of op, which is generated in the compilation directory.
-file(GLOB PD_INTERFACE_SRCS "*.cc")
-
-cc_library(
-  pd_interface
-  SRCS ${PD_INTERFACE_SRCS}
-  DEPS ir_core phi_utils)
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc
deleted file mode 100644
index b95d78a74f470..0000000000000
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h"
-#include "paddle/ir/core/builtin_op.h"
-
-namespace paddle {
-namespace dialect {
-ir::OpResult split_grad(std::vector<ir::OpResult> out_grads,
-                        ir::OpResult axis) {
-  auto combine_op =
-      APIBuilder::Instance().GetBuilder()->Build<ir::CombineOp>(out_grads);
-  paddle::dialect::SplitGradOp split_grad_op =
-      APIBuilder::Instance().GetBuilder()->Build<paddle::dialect::SplitGradOp>(
-          combine_op.out(), axis);
-
-  return split_grad_op.x_grad();
-}
-
-ir::OpResult split_grad(std::vector<ir::OpResult> out_grads, int axis) {
-  auto combine_op =
-      APIBuilder::Instance().GetBuilder()->Build<ir::CombineOp>(out_grads);
-  paddle::dialect::SplitGradOp split_grad_op =
-      APIBuilder::Instance().GetBuilder()->Build<paddle::dialect::SplitGradOp>(
-          combine_op.out(), axis);
-
-  return split_grad_op.x_grad();
-}
-}  // namespace dialect
-}  // namespace paddle
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h
deleted file mode 100644
index c8a5e1658ec4d..0000000000000
--- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h
+++ /dev/null
@@ -1,204 +0,0 @@
-// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
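// ---------------------------------------------------------------------------
// Side sketch (toy types, not Paddle's API): the deleted split_grad wrapper
// above shows the recurring recipe for ops whose input is a Tensor[]: first
// Build<ir::CombineOp>(...) packs the n OpResults into a single vector-typed
// value, then the consuming op is built on that one value. A self-contained
// analogue of that call-site shape:
// ---------------------------------------------------------------------------
#include <iostream>
#include <utility>
#include <vector>

struct Value { int id; };  // stand-in for ir::OpResult

struct CombineOp {  // stand-in for builtin.combine
  Value out_;
  explicit CombineOp(const std::vector<Value>& ins)
      : out_{static_cast<int>(ins.size())} {}
  Value out() const { return out_; }
};

struct SplitGradOp {  // stand-in for pd.split_grad
  Value x_grad_;
  SplitGradOp(Value packed, Value /*axis*/) : x_grad_{packed} {}
  Value x_grad() const { return x_grad_; }
};

struct Builder {  // mirrors the Build<OpT>(args...) builder idiom
  template <typename OpT, typename... Args>
  OpT Build(Args&&... args) { return OpT(std::forward<Args>(args)...); }
};

int main() {
  Builder b;
  auto combine = b.Build<CombineOp>(std::vector<Value>{{1}, {2}, {3}});
  auto grad = b.Build<SplitGradOp>(combine.out(), Value{0});
  std::cout << grad.x_grad().id << "\n";  // 3 inputs flowed through one value
}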
- -#ifdef GET_MANUAL_OP_LIST -#undef GET_MANUAL_OP_LIST -paddle::dialect::AddNOp, paddle::dialect::SplitGradOp, paddle::dialect::IfOp - -#else - -#pragma once -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/ir_printer.h" -#include "paddle/ir/core/op_base.h" -#include "paddle/ir/core/operation_utils.h" -#include "paddle/phi/core/infermeta_utils.h" - -namespace paddle { -namespace dialect { - -class AddNOp : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.add_n"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static OpInfoTuple GetOpInfo(); - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult inputs); - - void Verify(); - ir::Value inputs() { return operand_source(0); } - ir::OpResult out() { return result(0); } - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class AddN_Op : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.add_n_"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static OpInfoTuple GetOpInfo(); - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult inputs_); - - void Verify(); - ir::Value inputs() { return operand_source(0); } - ir::OpResult out() { return result(0); } - - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class AddNWithKernelOp : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.add_n_with_kernel"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static OpInfoTuple GetOpInfo(); - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult inputs_); - - void Verify(); - ir::Value inputs() { return operand_source(0); } - ir::OpResult out() { return result(0); } - - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class FusedGemmEpilogueOp : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.fused_gemm_epilogue"; } - static const char *attributes_name[3]; - static constexpr uint32_t attributes_num = 3; - static OpInfoTuple GetOpInfo(); - - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult x_, - ir::OpResult y_, - ir::OpResult bias_, - ir::AttributeMap attributes); - void Verify(); - ir::Value x() { return operand_source(0); } - ir::Value y() { return operand_source(1); } - ir::Value bias() { return operand_source(2); } - ir::OpResult out() { return result(0); } - ir::OpResult reserve_space() { return result(1); } - - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class FusedGemmEpilogueGradOp - : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.fused_gemm_epilogue_grad"; } - static const char *attributes_name[3]; - static constexpr uint32_t attributes_num = 3; - static OpInfoTuple GetOpInfo(); 
- - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult x_, - ir::OpResult y_, - ir::OpResult reserve_space_, - ir::OpResult out_grad_, - ir::AttributeMap attributes); - void Verify(); - ir::Value x() { return operand_source(0); } - ir::Value y() { return operand_source(1); } - ir::Value reserve_space() { return operand_source(2); } - ir::Value out_grad() { return operand_source(3); } - ir::OpResult x_grad() { return result(0); } - ir::OpResult y_grad() { return result(1); } - ir::OpResult bias_grad() { return result(2); } - - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class SplitGradOp : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.split_grad"; } - static const char *attributes_name[1]; - static constexpr uint32_t attributes_num = 1; - static OpInfoTuple GetOpInfo(); - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult x_, - float axis = 0); - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult out_grad_, - ir::OpResult axis_); - - void Verify(); - ir::Value out_grad() { return operand_source(0); } - ir::Value axis() { return operand_source(1); } - ir::OpResult x_grad() { return result(0); } - static void InferMeta(phi::InferMetaContext *infer_meta); -}; - -class IfOp : public ir::Op { - public: - using Op::Op; - static const char *name() { return "pd.if"; } - static constexpr const char **attributes_name = nullptr; - static constexpr uint32_t attributes_num = 0; - static void Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult cond, - std::vector &&output_types); - ir::Value cond() { return operand_source(0); } - ir::Block *true_block(); - ir::Block *false_block(); - void Print(ir::IrPrinter &printer); // NOLINT - void Verify(); -}; - -} // namespace dialect -} // namespace paddle - -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) - -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) -#endif diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op_vjp.cc b/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op_vjp.cc deleted file mode 100644 index c7bac02e3347e..0000000000000 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op_vjp.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
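// ---------------------------------------------------------------------------
// Orientation sketch for the deleted SumOp::Vjp below (toy types, not
// Paddle's real API): a hand-written vjp is thin glue that (1) wraps IR
// values into tracer tensors, (2) delegates to the shared primitive::*_vjp
// rule, and (3) unwraps the produced tensors back into IR values.
// ---------------------------------------------------------------------------
#include <iostream>

struct IrValue { int id = -1; };   // stand-in for ir::OpResult
struct LazyTensor { IrValue v; };  // stand-in for primitive::LazyTensor

// stand-in for primitive::sum_vjp: a real rule would build gradient ops
LazyTensor sum_vjp_rule(const LazyTensor& out_grad) { return out_grad; }

IrValue SumVjp(IrValue out_grad_value) {
  LazyTensor out_grad{out_grad_value};       // (1) wrap
  LazyTensor grad = sum_vjp_rule(out_grad);  // (2) delegate
  return grad.v;                             // (3) unwrap
}

int main() { std::cout << SumVjp(IrValue{7}).id << "\n"; }  // prints 7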
- -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" -#include "paddle/fluid/primitive/rule/vjp/vjp.h" -#include "paddle/fluid/primitive/type/lazy_tensor.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/op_base.h" -#include "paddle/phi/common/int_array.h" - -// TODO(wanghao107) -// this file will be generated in pd_op.cc - -namespace paddle { -namespace dialect { -using IntArray = paddle::experimental::IntArray; - -std::vector> SumOp::Vjp( - ir::Operation* op, - const std::vector>& out_grads, - const std::vector>& stop_gradients) { - SumOp op_obj = op->dyn_cast(); - Tensor x(std::make_shared(op_obj.x())); - Tensor out_grad(std::make_shared(out_grads[0][0])); - - Tensor axis(std::make_shared(op_obj.axis())); - - bool keepdim = op->attribute("keepdim").dyn_cast().data(); - bool reduce_all = false; - std::vector> tensor_res = primitive::sum_vjp( - x, out_grad, axis, keepdim, reduce_all, stop_gradients); - std::vector> res(2, std::vector(1)); - if (tensor_res[0][0].defined()) { - res[0][0] = - std::static_pointer_cast(tensor_res[0][0].impl()) - ->getValue() - .dyn_cast(); - } - return res; -} - -} // namespace dialect -} // namespace paddle diff --git a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt index 632411383db56..4ac1dc065143f 100644 --- a/paddle/fluid/ir_adaptor/translator/CMakeLists.txt +++ b/paddle/fluid/ir_adaptor/translator/CMakeLists.txt @@ -20,4 +20,4 @@ file(GLOB PD_PROGRAM_TRANSLATOR_SRCS "*.cc") cc_library( program_translator SRCS ${PD_PROGRAM_TRANSLATOR_SRCS} ${op_compat_source_file} - DEPS proto_desc pd_dialect ir framework_proto) + DEPS proto_desc pd_op_dialect pir framework_proto) diff --git a/paddle/fluid/ir_adaptor/translator/attribute_translator.cc b/paddle/fluid/ir_adaptor/translator/attribute_translator.cc index f6a4b94f2bfdf..ebb58cc0ebf61 100644 --- a/paddle/fluid/ir_adaptor/translator/attribute_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/attribute_translator.cc @@ -17,14 +17,14 @@ #include #include -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/ir/core/enforce.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/pir/core/enforce.h" #include "paddle/utils/variant.h" namespace paddle { @@ -32,127 +32,128 @@ namespace translator { class AttributeVisitor { public: - ir::IrContext* ctx; - AttributeVisitor() { ctx = ir::IrContext::Instance(); } + pir::IrContext* ctx; + AttributeVisitor() { ctx = pir::IrContext::Instance(); } ~AttributeVisitor() = default; public: - virtual ir::Attribute operator()(int i) { + virtual pir::Attribute operator()(int i) { VLOG(10) << "translating int"; - return ir::Int32Attribute::get(ctx, i); + return pir::Int32Attribute::get(ctx, i); } - virtual ir::Attribute operator()(int64_t i) { + virtual pir::Attribute operator()(int64_t i) { VLOG(10) << "translating int"; - return ir::Int64Attribute::get(ctx, i); + return pir::Int64Attribute::get(ctx, i); } - virtual ir::Attribute operator()(float f) { + virtual pir::Attribute operator()(float f) { VLOG(10) << "translating float"; - return ir::FloatAttribute::get(ctx, f); + return pir::FloatAttribute::get(ctx, f); } - virtual 
ir::Attribute operator()(bool b) { + virtual pir::Attribute operator()(bool b) { VLOG(10) << "translating bool"; - return ir::BoolAttribute::get(ctx, b); + return pir::BoolAttribute::get(ctx, b); } - virtual ir::Attribute operator()(double d) { + virtual pir::Attribute operator()(double d) { VLOG(10) << "translating double"; - return ir::DoubleAttribute::get(ctx, d); + return pir::DoubleAttribute::get(ctx, d); } - virtual ir::Attribute operator()(const std::string& str) { + virtual pir::Attribute operator()(const std::string& str) { VLOG(10) << "translating string"; - return ir::StrAttribute::get(ctx, str); + return pir::StrAttribute::get(ctx, str); } - virtual ir::Attribute operator()(const paddle::experimental::Scalar& scalar) { + virtual pir::Attribute operator()( + const paddle::experimental::Scalar& scalar) { VLOG(10) << "translating scalar"; IR_THROW("not support translating paddle::experimental::Scalar"); } - virtual ir::Attribute operator()(const std::vector& strs) { + virtual pir::Attribute operator()(const std::vector& strs) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(strs.size()); for (const auto& v : strs) { - attrs.push_back(ir::StrAttribute::get(ctx, v)); + attrs.push_back(pir::StrAttribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const std::vector& fs) { + virtual pir::Attribute operator()(const std::vector& fs) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(fs.size()); for (const auto& v : fs) { - attrs.push_back(ir::FloatAttribute::get(ctx, v)); + attrs.push_back(pir::FloatAttribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const std::vector& is) { + virtual pir::Attribute operator()(const std::vector& is) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(is.size()); for (const auto& v : is) { - attrs.push_back(ir::Int32Attribute::get(ctx, v)); + attrs.push_back(pir::Int32Attribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const std::vector& bs) { + virtual pir::Attribute operator()(const std::vector& bs) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(bs.size()); for (const auto& v : bs) { - attrs.push_back(ir::BoolAttribute::get(ctx, v)); + attrs.push_back(pir::BoolAttribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const std::vector& i64s) { + virtual pir::Attribute operator()(const std::vector& i64s) { VLOG(10) << "translating vector size: " << i64s.size(); - std::vector attrs; + std::vector attrs; attrs.reserve(i64s.size()); for (const auto& v : i64s) { - attrs.push_back(ir::Int64Attribute::get(ctx, v)); + attrs.push_back(pir::Int64Attribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const std::vector& ds) { + virtual pir::Attribute operator()(const std::vector& ds) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(ds.size()); for (const auto& v : ds) { - attrs.push_back(ir::DoubleAttribute::get(ctx, v)); + 
attrs.push_back(pir::DoubleAttribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()( + virtual pir::Attribute operator()( const std::vector& ss) { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(ss.size()); for (const auto& v : ss) { attrs.push_back(dialect::ScalarAttribute::get(ctx, v)); } VLOG(10) << "translating vector Done"; - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - virtual ir::Attribute operator()(const paddle::blank& blank) { + virtual pir::Attribute operator()(const paddle::blank& blank) { VLOG(10) << "translating paddle::blank"; - return ir::Attribute(nullptr); + return pir::Attribute(nullptr); } template - ir::Attribute operator()(T attr) { + pir::Attribute operator()(T attr) { VLOG(10) << "translating null type"; - return ir::Attribute(nullptr); + return pir::Attribute(nullptr); } }; @@ -160,19 +161,19 @@ class Int64ArrayAttributeVisitor : public AttributeVisitor { public: using AttributeVisitor::AttributeVisitor; - ir::Attribute operator()(const std::vector& is) override { + pir::Attribute operator()(const std::vector& is) override { VLOG(10) << "translating vector"; - std::vector attrs; + std::vector attrs; attrs.reserve(is.size()); for (const auto& v : is) { - attrs.push_back(ir::Int64Attribute::get(ctx, v)); + attrs.push_back(pir::Int64Attribute::get(ctx, v)); } - return ir::ArrayAttribute::get(ctx, attrs); + return pir::ArrayAttribute::get(ctx, attrs); } - ir::Attribute operator()(const paddle::blank& blank) override { + pir::Attribute operator()(const paddle::blank& blank) override { VLOG(10) << "translating paddle::blank to int64[]"; - return ir::ArrayAttribute::get(ctx, {}); + return pir::ArrayAttribute::get(ctx, {}); } }; @@ -180,22 +181,22 @@ class Int64AttributeVisitor : public AttributeVisitor { public: using AttributeVisitor::AttributeVisitor; - ir::Attribute operator()(int is) override { + pir::Attribute operator()(int is) override { VLOG(10) << "translating int to Int64Attribute"; - return ir::Int64Attribute::get(ctx, is); + return pir::Int64Attribute::get(ctx, is); } }; class IntArrayAttributeVisitor : public AttributeVisitor { public: using AttributeVisitor::AttributeVisitor; - ir::Attribute operator()(const std::vector& is) override { + pir::Attribute operator()(const std::vector& is) override { VLOG(10) << "translating vector to IntArray"; phi::IntArray data(is); return paddle::dialect::IntArrayAttribute::get(ctx, data); } - ir::Attribute operator()(const std::vector& is) override { + pir::Attribute operator()(const std::vector& is) override { VLOG(10) << "translating vector to IntArray"; phi::IntArray data(is); return paddle::dialect::IntArrayAttribute::get(ctx, data); @@ -205,14 +206,14 @@ class IntArrayAttributeVisitor : public AttributeVisitor { class DataTypeAttributeVisitor : public AttributeVisitor { public: using AttributeVisitor::AttributeVisitor; - ir::Attribute operator()(int i) override { + pir::Attribute operator()(int i) override { VLOG(10) << "translating int to DataType: " << i; auto phi_dtype = phi::TransToPhiDataType(i); return paddle::dialect::DataTypeAttribute::get(ctx, phi_dtype); } - ir::Attribute operator()(const paddle::blank& blank) override { + pir::Attribute operator()(const paddle::blank& blank) override { VLOG(10) << "translating paddle::blank to DataType::UNDEFINED"; return paddle::dialect::DataTypeAttribute::get(ctx, phi::DataType()); } 
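// ---------------------------------------------------------------------------
// Background sketch (std::variant analogue, not Paddle's types): the visitor
// classes above work because framework::Attribute is a variant, so the whole
// translation is one visit dispatched to the overload matching the active
// alternative; subclasses such as DataTypeAttributeVisitor merely override a
// single overload. Minimal self-contained analogue:
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>
#include <variant>
#include <vector>

using Attr = std::variant<int, float, std::string, std::vector<int>>;

struct Visitor {
  void operator()(int v) const { std::cout << "int " << v << "\n"; }
  void operator()(float v) const { std::cout << "float " << v << "\n"; }
  void operator()(const std::string& s) const { std::cout << "str " << s << "\n"; }
  void operator()(const std::vector<int>& v) const {
    std::cout << "int[" << v.size() << "]\n";
  }
};

int main() {
  Attr a = std::vector<int>{1, 2, 3};
  std::visit(Visitor{}, a);  // prints: int[3]
}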
@@ -222,7 +223,7 @@ class PlaceAttributeVisitor : public AttributeVisitor { public: using AttributeVisitor::AttributeVisitor; - ir::Attribute operator()(const paddle::blank& blank) override { + pir::Attribute operator()(const paddle::blank& blank) override { VLOG(10) << "translating paddle::blank to Place::UNDEFINED"; phi::Place data(phi::AllocationType::UNDEFINED); return paddle::dialect::PlaceAttribute::get(ctx, data); @@ -237,17 +238,17 @@ AttributeTranslator::AttributeTranslator() { new DataTypeAttributeVisitor(); special_visitors["paddle::dialect::PlaceAttribute"] = new PlaceAttributeVisitor(); - special_visitors["ir::ArrayAttribute"] = + special_visitors["pir::ArrayAttribute"] = new Int64ArrayAttributeVisitor(); - special_visitors["ir::Int64Attribute"] = new Int64AttributeVisitor(); + special_visitors["pir::Int64Attribute"] = new Int64AttributeVisitor(); } -ir::Attribute AttributeTranslator::operator()( +pir::Attribute AttributeTranslator::operator()( const framework::Attribute& attr) { return paddle::visit(*general_visitor, attr); } -ir::Attribute AttributeTranslator::operator()( +pir::Attribute AttributeTranslator::operator()( const std::string& target_type, const framework::Attribute& attr) { if (special_visitors.find(target_type) == special_visitors.end()) { VLOG(10) << "[" << target_type << "] not found"; diff --git a/paddle/fluid/ir_adaptor/translator/attribute_translator.h b/paddle/fluid/ir_adaptor/translator/attribute_translator.h index ea509c7e34673..2a716b0ef7d18 100644 --- a/paddle/fluid/ir_adaptor/translator/attribute_translator.h +++ b/paddle/fluid/ir_adaptor/translator/attribute_translator.h @@ -17,9 +17,9 @@ #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/type_defs.h" -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/ir_context.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/ir_context.h" #pragma once @@ -45,9 +45,9 @@ class AttributeTranslator { return attribute_translator; } - ir::Attribute operator()(const framework::Attribute& attr); - ir::Attribute operator()(const std::string& target_type, - const framework::Attribute& attr); + pir::Attribute operator()(const framework::Attribute& attr); + pir::Attribute operator()(const std::string& target_type, + const framework::Attribute& attr); }; } // namespace translator diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.cc b/paddle/fluid/ir_adaptor/translator/op_translator.cc index 39a6acdd21b55..b441fe6c87b69 100644 --- a/paddle/fluid/ir_adaptor/translator/op_translator.cc +++ b/paddle/fluid/ir_adaptor/translator/op_translator.cc @@ -23,27 +23,27 @@ #include #include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" #include "paddle/fluid/ir_adaptor/translator/attribute_translator.h" #include "paddle/fluid/ir_adaptor/translator/op_compat_info.h" #include "paddle/fluid/ir_adaptor/translator/program_translator.h" #include "paddle/fluid/ir_adaptor/translator/type_translator.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/builtin_op.h" -#include 
"paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/value.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/value.h" // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in -// paddle/fluid/ir/dialect/CMakeLists.txt. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" +// paddle/fluid/pir/dialect/CMakeLists.txt. +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" namespace paddle { namespace translator { @@ -56,7 +56,7 @@ using ResultIdx = std::tuple; using OpDesc = paddle::framework::OpDesc; using BlockDesc = paddle::framework::BlockDesc; using VarDesc = paddle::framework::VarDesc; -using OpOutputTypeList = std::vector; +using OpOutputTypeList = std::vector; using OpOutputMapping = std::unordered_map; using OpInputInfo = dialect::OpInputInfo; using OpInputInfoList = std::vector; @@ -64,16 +64,16 @@ using OpAttributeInfo = dialect::OpAttributeInfo; using OpAttributeInfoList = std::vector; using OpOutputInfo = dialect::OpOutputInfo; using OpOutputInfoList = std::vector; -using InputHandlerFn = std::function; -using AttributeHandlerFn = std::function; -constexpr char kTargetDialectPrefix[] = "pd."; // NOLINT -constexpr char kEmptyVarName[] = "@EMPTY@"; // NOLINT +using InputHandlerFn = std::function; +using AttributeHandlerFn = std::function; +constexpr char kTargetDialectPrefix[] = "pd_op."; // NOLINT +constexpr char kEmptyVarName[] = "@EMPTY@"; // NOLINT static const std::unordered_set SpecialNonInplaceOps = {}; @@ -126,47 +126,46 @@ inline std::string OpNameCompatibleMapping(std::string op_name) { return op_normalizer[op_name]; } -inline ir::Operation* InsertCombineOperationForTarget( - ir::IrContext* ctx, +inline pir::Operation* InsertCombineOperationForTarget( + pir::IrContext* ctx, TranslationContext* param_map, - ir::Program* program, + pir::Program* program, const std::vector& args) { - std::string combine_op_name(ir::CombineOp::name()); - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(combine_op_name); + std::string combine_op_name(pir::CombineOp::name()); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(combine_op_name); - std::vector src_values; - std::vector types_in_vec; + std::vector src_values; + std::vector types_in_vec; for (const auto& arg_name : args) { auto defining_info = param_map->at(arg_name); src_values.push_back(defining_info.value); types_in_vec.push_back(defining_info.value.type()); } - ir::Type target_vec_type = ir::VectorType::get(ctx, types_in_vec); - ir::Operation* operation = - ir::Operation::Create(src_values, {}, {target_vec_type}, op_info); + pir::Type target_vec_type = pir::VectorType::get(ctx, types_in_vec); + pir::Operation* operation = + pir::Operation::Create(src_values, {}, {target_vec_type}, op_info); program->block()->push_back(operation); return operation; } -inline ir::Operation* InsertFullOperationForAttributeInput(ir::IrContext* ctx, - ir::Program* program, - 
ir::Attribute attr) { +inline pir::Operation* InsertFullOperationForAttributeInput( + pir::IrContext* ctx, pir::Program* program, pir::Attribute attr) { float data = 0.0f; phi::DataType dtype = phi::DataType::UNDEFINED; - if (attr.isa()) { - data = attr.dyn_cast().data(); + if (attr.isa()) { + data = attr.dyn_cast().data(); dtype = phi::DataType::FLOAT32; - } else if (attr.isa()) { - data = static_cast(attr.dyn_cast().data()); + } else if (attr.isa()) { + data = static_cast(attr.dyn_cast().data()); dtype = phi::DataType::FLOAT64; - } else if (attr.isa()) { - data = static_cast(attr.dyn_cast().data()); + } else if (attr.isa()) { + data = static_cast(attr.dyn_cast().data()); dtype = phi::DataType::INT32; - } else if (attr.isa()) { - data = static_cast(attr.dyn_cast().data()); + } else if (attr.isa()) { + data = static_cast(attr.dyn_cast().data()); dtype = phi::DataType::INT64; - } else if (attr.isa()) { - data = static_cast(attr.dyn_cast().data()); + } else if (attr.isa()) { + data = static_cast(attr.dyn_cast().data()); dtype = phi::DataType::BOOL; } else if (attr.isa()) { // TODO(phlrain) : need update here, downcast from double to float @@ -174,35 +173,35 @@ inline ir::Operation* InsertFullOperationForAttributeInput(ir::IrContext* ctx, attr.dyn_cast().data().to()); dtype = phi::DataType::FLOAT64; } - ir::Builder builder(ctx, program->block()); + pir::Builder builder(ctx, program->block()); dialect::FullOp full_op = builder.Build( std::vector{1}, data, dtype, phi::CPUPlace()); return full_op.operation(); } -inline ir::Operation* InsertFullArrayOperationForAttributeInput( - ir::IrContext* ctx, ir::Program* program, ir::Attribute attr) { +inline pir::Operation* InsertFullArrayOperationForAttributeInput( + pir::IrContext* ctx, pir::Program* program, pir::Attribute attr) { IR_ENFORCE(attr.isa(), "Encounter non IntArray type when trying to insert IntArray " "mutable attribute"); phi::IntArray int_array = attr.dyn_cast().data(); - ir::Builder builder(ctx, program->block()); + pir::Builder builder(ctx, program->block()); dialect::FullIntArrayOp full_int_array_op = builder.Build( int_array.GetData(), phi::DataType::INT64, phi::CPUPlace()); return full_int_array_op.operation(); } -inline ir::Operation* InsertStackOperationForTarget( - ir::IrContext* ctx, +inline pir::Operation* InsertStackOperationForTarget( + pir::IrContext* ctx, TranslationContext* param_map, - ir::Program* program, + pir::Program* program, const std::vector& args, int axis = 0) { auto* combine_op = InsertCombineOperationForTarget(ctx, param_map, program, args); - ir::Builder builder(ctx, program->block()); + pir::Builder builder(ctx, program->block()); dialect::StackOp stack_op = builder.Build(combine_op->result(0), axis); return stack_op.operation(); @@ -210,8 +209,8 @@ inline ir::Operation* InsertStackOperationForTarget( } // namespace -ir::OpInfo OpTranscriber::LoopkUpOpInfo(ir::IrContext* ctx, - const OpDesc& op_desc) { +pir::OpInfo OpTranscriber::LoopkUpOpInfo(pir::IrContext* ctx, + const OpDesc& op_desc) { std::string target_op_name = kTargetDialectPrefix + OpNameCompatibleMapping(op_desc.Type()); if (IsInplace(op_desc) && *target_op_name.rbegin() != '_') { @@ -230,11 +229,11 @@ ir::OpInfo OpTranscriber::LoopkUpOpInfo(ir::IrContext* ctx, } void OpTranscriber::InsertSliceOperationForInput( - ir::IrContext* ctx, + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const OpInputInfoList& input_infos, - ir::Program* program) { + pir::Program* program) { auto& op_normalizer = OpNameNormalizer::instance(); 
std::set yaml_input_set; for (const auto& info : input_infos) { @@ -265,10 +264,11 @@ void OpTranscriber::InsertSliceOperationForInput( } } -ir::OpResult OpTranscriber::GetAttributeAsInput(ir::IrContext* ctx, - ir::Program* program, - const OpDesc& op_desc, - const OpInputInfo& input_info) { +pir::OpResult OpTranscriber::GetAttributeAsInput( + pir::IrContext* ctx, + pir::Program* program, + const OpDesc& op_desc, + const OpInputInfo& input_info) { auto& attribute_translator = AttributeTranslator::instance(); auto& op_normalizer = OpNameNormalizer::instance(); @@ -283,10 +283,10 @@ ir::OpResult OpTranscriber::GetAttributeAsInput(ir::IrContext* ctx, paddle::framework::Attribute legacy_attr = op_desc.GetAttr(legacy_attr_name); VLOG(10) << "[" << op_desc.Type() << "][attribute]" << " name: " << legacy_attr_name << " " << legacy_attr.index(); - ir::Attribute new_attr = + pir::Attribute new_attr = attribute_translator(input_info.type_name, legacy_attr); - ir::Operation* defining_op = nullptr; + pir::Operation* defining_op = nullptr; bool is_int_array = (input_info.type_name.find("IntArrayAttribute") != input_info.type_name.npos); if (is_int_array) { @@ -299,13 +299,13 @@ ir::OpResult OpTranscriber::GetAttributeAsInput(ir::IrContext* ctx, return defining_op->result(0); } -std::vector OpTranscriber::GenerateOperationInput( - ir::IrContext* ctx, +std::vector OpTranscriber::GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program) { + pir::Program* program) { VLOG(10) << "[op:" << op_desc.Type() << "][input] entrance"; auto& op_normalizer = OpNameNormalizer::instance(); @@ -314,11 +314,11 @@ std::vector OpTranscriber::GenerateOperationInput( VLOG(10) << "[op:" << op_desc.Type() << "][input] start"; - std::vector op_inputs; + std::vector op_inputs; for (const auto& info : input_infos) { if (auto special_handler = this->GetSpecialInputHandlers(info.name)) { - ir::OpResult ret = special_handler( + pir::OpResult ret = special_handler( ctx, param_map, op_desc, normalized_op_name, info, program); op_inputs.push_back(ret); continue; @@ -407,7 +407,7 @@ std::vector OpTranscriber::GenerateOperationInput( } std::tuple -OpTranscriber::GenerateOperationOutput(ir::IrContext* ctx, +OpTranscriber::GenerateOperationOutput(pir::IrContext* ctx, const OpDesc& op_desc, const OpOutputInfoList& output_infos) { OpOutputMapping arg_to_idx; @@ -457,7 +457,7 @@ OpTranscriber::GenerateOperationOutput(ir::IrContext* ctx, legacy_output_vars[0]); if (var->GetType() == paddle::framework::proto::VarType::LOD_TENSOR_ARRAY) { - ir::Type translated_var_type = + pir::Type translated_var_type = type_translator[var->GetType()](ctx, *var); op_output_types.push_back(translated_var_type); arg_to_idx[var->Name()] = {cur_output_idx, 0}; @@ -486,7 +486,8 @@ OpTranscriber::GenerateOperationOutput(ir::IrContext* ctx, << "[" << op_desc.Type() << "]" << info.name << " var: " << var_name << " type: " << var->GetType(); - ir::Type translated_var_type = type_translator[var->GetType()](ctx, *var); + pir::Type translated_var_type = + type_translator[var->GetType()](ctx, *var); arg_to_idx[var_name] = {cur_output_idx, 0}; op_output_types.push_back(translated_var_type); @@ -496,7 +497,7 @@ OpTranscriber::GenerateOperationOutput(ir::IrContext* ctx, VLOG(10) << "[output translating]" << "[" << op_desc.Type() << "]" << info.name << " :" << info.type_name << " var: " << legacy_output_name; - std::vector types; + 
std::vector types; for (IdxInVector idx_in_vec = 0; idx_in_vec < legacy_output_vars.size(); idx_in_vec++) { const auto& var_name = legacy_output_vars[idx_in_vec]; @@ -509,26 +510,26 @@ OpTranscriber::GenerateOperationOutput(ir::IrContext* ctx, VLOG(10) << "[output translating]" << "[" << op_desc.Type() << "]" << info.name << " var: " << var_name << " type: " << var->GetType(); - ir::Type translated_var_type = + pir::Type translated_var_type = type_translator[var->GetType()](ctx, *var); types.push_back(translated_var_type); arg_to_idx[var_name] = {cur_output_idx, idx_in_vec}; } - ir::Type vec_type = ir::VectorType::get(ctx, types); + pir::Type vec_type = pir::VectorType::get(ctx, types); op_output_types.push_back(vec_type); } } return {op_output_types, arg_to_idx}; } -ir::AttributeMap OpTranscriber::TranslateOpAttribute( - ir::IrContext* ctx, +pir::AttributeMap OpTranscriber::TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) { auto& attribute_translator = AttributeTranslator::instance(); auto& op_normalizer = OpNameNormalizer::instance(); - ir::AttributeMap attribute_map = {}; + pir::AttributeMap attribute_map = {}; for (const auto& info : op_attr_infos) { if (auto handler = this->GetSpecialAttributeHandlers(info.name)) { @@ -546,7 +547,7 @@ ir::AttributeMap OpTranscriber::TranslateOpAttribute( op_desc.GetAttr(legacy_attr_name); VLOG(10) << "attribute in " << op_desc.Type() << " name: " << legacy_attr_name << " " << legacy_attr.index(); - ir::Attribute new_attr = + pir::Attribute new_attr = attribute_translator(info.type_name, legacy_attr); attribute_map[info.name] = new_attr; if (!new_attr) { @@ -563,36 +564,36 @@ ir::AttributeMap OpTranscriber::TranslateOpAttribute( return attribute_map; } -void OpTranscriber::HandleNonexistentAttribute(ir::IrContext*, - ir::AttributeMap* attribute_map, +void OpTranscriber::HandleNonexistentAttribute(pir::IrContext*, + pir::AttributeMap* attribute_map, const OpAttributeInfo& info) { auto& attribute_translator = AttributeTranslator::instance(); (*attribute_map)[info.name] = attribute_translator(info.type_name, paddle::framework::Attribute()); } -void OpTranscriber::RecordOpResultMapping(ir::IrContext* ctx, +void OpTranscriber::RecordOpResultMapping(pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, - ir::Operation* operation, + pir::Operation* operation, const OpOutputMapping& arg_to_idx) { for (const auto& [arg_name, idx] : arg_to_idx) { const auto& [idx_in_op, idx_in_vec] = idx; VLOG(10) << "[output recording]" << "[" << op_desc.Type() << "]" << arg_name << " " << idx_in_op << " " << idx_in_vec; - ir::OpResult value = operation->result(idx_in_op); - bool generated_by_vector = value.type().isa(); + pir::OpResult value = operation->result(idx_in_op); + bool generated_by_vector = value.type().isa(); (*param_map)[arg_name] = VariableDefiningInfo( value, generated_by_vector, generated_by_vector ? 
idx_in_vec : -1); } } -ir::Operation* OpTranscriber::operator()(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - ir::Program* program) { +pir::Operation* OpTranscriber::operator()(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + pir::Program* program) { auto op_info = this->LoopkUpOpInfo(ctx, op_desc); auto* op_info_concept = op_info.GetInterfaceImpl(); @@ -618,8 +619,8 @@ ir::Operation* OpTranscriber::operator()(ir::IrContext* ctx, this->TranslateOpAttribute(ctx, op_info.name(), attr_infos, op_desc); VLOG(4) << "[general op][" << op_desc.Type() << "] preparation end."; - ir::Operation* operation = - ir::Operation::Create(op_inputs, attribute_map, op_output_types, op_info); + pir::Operation* operation = pir::Operation::Create( + op_inputs, attribute_map, op_output_types, op_info); VLOG(4) << "[general op][" << op_desc.Type() << "] opearation creation end."; program->block()->push_back(operation); @@ -630,13 +631,13 @@ ir::Operation* OpTranscriber::operator()(ir::IrContext* ctx, } struct CastOpTranscriber : public OpTranscriber { - ir::AttributeMap TranslateOpAttribute( - ir::IrContext*, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext*, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { auto& attribute_translator = AttributeTranslator::instance(); - ir::AttributeMap attribute_map = {}; + pir::AttributeMap attribute_map = {}; const OpAttributeInfo info = op_attr_infos[0]; std::string legacy_attr_name("out_dtype"); @@ -647,7 +648,7 @@ struct CastOpTranscriber : public OpTranscriber { } VLOG(10) << "attribute in " << op_desc.Type() << " name: " << legacy_attr_name << " " << legacy_attr.index(); - ir::Attribute new_attr = attribute_translator(info.type_name, legacy_attr); + pir::Attribute new_attr = attribute_translator(info.type_name, legacy_attr); attribute_map[info.name] = new_attr; return attribute_map; @@ -655,35 +656,35 @@ struct CastOpTranscriber : public OpTranscriber { }; struct EmbeddingOpTranscriber : public OpTranscriber { - void HandleNonexistentAttribute(ir::IrContext* ctx, - ir::AttributeMap* attribute_map, + void HandleNonexistentAttribute(pir::IrContext* ctx, + pir::AttributeMap* attribute_map, const OpAttributeInfo& info) override { if (info.name == "padding_idx") { - (*attribute_map)[info.name] = ir::Int64Attribute::get(ctx, -1); + (*attribute_map)[info.name] = pir::Int64Attribute::get(ctx, -1); } else if (info.name == "sparse") { - (*attribute_map)[info.name] = ir::BoolAttribute::get(ctx, false); + (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, false); } } }; struct IncrementOpTranscriber : public OpTranscriber { - ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { auto& attribute_translator = AttributeTranslator::instance(); - ir::AttributeMap attribute_map = {}; + pir::AttributeMap attribute_map = {}; paddle::framework::Attribute legacy_attr; if (op_desc.HasAttr("step")) { legacy_attr = op_desc.GetAttr("step"); VLOG(10) << "attribute in " << op_desc.Type() << " step: " << " " << legacy_attr.index(); - ir::Attribute new_attr = attribute_translator(legacy_attr); + pir::Attribute new_attr = attribute_translator(legacy_attr); attribute_map["value"] = new_attr; } else { - attribute_map["value"] = ir::FloatAttribute::get(ctx, 1.0f); + 
attribute_map["value"] = pir::FloatAttribute::get(ctx, 1.0f); } return attribute_map; @@ -694,21 +695,23 @@ struct IncrementOpTranscriber : public OpTranscriber { // `legacy_ops.yaml`. For this op we simulate the logic in // python/paddle/tensor/creation.py::assign(x, output) struct AssignValueOpTranscriber : public OpTranscriber { - ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override { - std::string target_op_name = "pd.assign_value"; + pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, + const OpDesc& op_desc) override { + std::string target_op_name = "pd_op.assign_value"; const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name); if (!op_info) { IR_THROW( - "Op assign_value should have corresponding OpInfo pd.assign_value"); + "Op assign_value should have corresponding OpInfo " + "pd_op.assign_value"); } return op_info; } - ir::Operation* operator()(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - ir::Program* program) override { + pir::Operation* operator()(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + pir::Program* program) override { VLOG(10) << "[op assign_value] start transcribing"; auto op_info = this->LoopkUpOpInfo(ctx, op_desc); auto* op_info_concept = @@ -724,7 +727,7 @@ struct AssignValueOpTranscriber : public OpTranscriber { } auto& attribute_translator = AttributeTranslator::instance(); - ir::AttributeMap attribute_map; + pir::AttributeMap attribute_map; paddle::framework::Attribute legacy_attr; if (op_desc.HasAttr("shape")) { @@ -732,7 +735,7 @@ struct AssignValueOpTranscriber : public OpTranscriber { } else { IR_THROW("Op assign_value should have attribute `shape` but not find"); } - ir::Attribute attr_shape = + pir::Attribute attr_shape = attribute_translator(attr_info_maps.at("shape").type_name, legacy_attr); attribute_map["shape"] = attr_shape; @@ -741,11 +744,11 @@ struct AssignValueOpTranscriber : public OpTranscriber { } else { IR_THROW("Op assign_value should have attribute `dtype` but not find"); } - ir::Attribute attr_dtype = + pir::Attribute attr_dtype = attribute_translator(attr_info_maps.at("dtype").type_name, legacy_attr); attribute_map["dtype"] = attr_dtype; - ir::Attribute attr_place = + pir::Attribute attr_place = dialect::PlaceAttribute::get(ctx, phi::CPUPlace()); attribute_map["place"] = attr_place; @@ -764,20 +767,20 @@ struct AssignValueOpTranscriber : public OpTranscriber { "Op assign_value should have attribute `**_values` but not find"); } - ir::Attribute attr_values = attribute_translator( + pir::Attribute attr_values = attribute_translator( attr_info_maps.at("values").type_name, legacy_attr); attribute_map["values"] = attr_values; VLOG(10) << "[op assign_value] attribute translation done"; - std::vector op_inputs = {}; + std::vector op_inputs = {}; OpOutputMapping arg_to_idx; OpOutputTypeList op_output_types; std::tie(op_output_types, arg_to_idx) = this->GenerateOperationOutput(ctx, op_desc, output_infos); - ir::Operation* operation = ir::Operation::Create( + pir::Operation* operation = pir::Operation::Create( op_inputs, attribute_map, op_output_types, op_info); program->block()->push_back(operation); RecordOpResultMapping(ctx, param_map, op_desc, operation, arg_to_idx); @@ -792,12 +795,12 @@ struct AssignValueOpTranscriber : public OpTranscriber { // So we generate an input by `full` with same type of output `DropoutState` of // OpDesc And we still should be aware that `DropoutState` is an optional output // in static graph. 
-ir::OpResult TranslateDropOutStateIn(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - const std::string& normalized_op_name, - const OpInputInfo& input_info, - ir::Program* program) { +pir::OpResult TranslateDropOutStateIn(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + const std::string& normalized_op_name, + const OpInputInfo& input_info, + pir::Program* program) { const std::string legacy_output_name = "DropoutState"; std::vector legacy_output_vars; if (op_desc.HasOutput(legacy_output_name)) { @@ -806,7 +809,7 @@ ir::OpResult TranslateDropOutStateIn(ir::IrContext* ctx, if (legacy_output_vars.empty()) { VLOG(3) << "[input translating] not find output variable: DropoutState"; - return ir::OpResult(nullptr); + return pir::OpResult(nullptr); } // `DropoutState` is a tensor @@ -816,14 +819,14 @@ ir::OpResult TranslateDropOutStateIn(ir::IrContext* ctx, IR_THROW("Unexpected: Rnn Op should have a non-empty DropoutState"); } auto& type_translator = TypeTranslator::instance(); - ir::Type translated_var_type = + pir::Type translated_var_type = type_translator[dropout_state->GetType()](ctx, *dropout_state); IR_ENFORCE( translated_var_type.isa(), "Unexpected: Rnn Op's output DropoutState should be a DenseTensor"); auto tensor_type = translated_var_type.dyn_cast(); - ir::Builder builder(ctx, program->block()); + pir::Builder builder(ctx, program->block()); dialect::FullOp full_op = builder.Build( phi::vectorize(tensor_type.dims()), 0.0f, @@ -845,26 +848,27 @@ struct RnnOpTranscriber : public OpTranscriber { }; struct EmbeddingGradOpTranscriber : public OpTranscriber { - void HandleNonexistentAttribute(ir::IrContext* ctx, - ir::AttributeMap* attribute_map, + void HandleNonexistentAttribute(pir::IrContext* ctx, + pir::AttributeMap* attribute_map, const OpAttributeInfo& info) override { if (info.name == "padding_idx") { - (*attribute_map)[info.name] = ir::Int64Attribute::get(ctx, -1); + (*attribute_map)[info.name] = pir::Int64Attribute::get(ctx, -1); } else if (info.name == "sparse") { - (*attribute_map)[info.name] = ir::BoolAttribute::get(ctx, false); + (*attribute_map)[info.name] = pir::BoolAttribute::get(ctx, false); } } - ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override { + pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, + const OpDesc& op_desc) override { std::string target_op_name = kTargetDialectPrefix + OpNameCompatibleMapping(op_desc.Type()); bool is_sparse = paddle::get(op_desc.GetAttr("is_sparse")); if (is_sparse) { - target_op_name = "pd.embedding_grad_sparse"; + target_op_name = "pd_op.embedding_grad_sparse"; } else { - target_op_name = "pd.embedding_grad_dense"; + target_op_name = "pd_op.embedding_grad_dense"; } VLOG(6) << "[op name normalizing: " << op_desc.Type() << " to " << target_op_name; @@ -880,45 +884,45 @@ struct EmbeddingGradOpTranscriber : public OpTranscriber { }; struct FeedOpTranscriber : public OpTranscriber { - ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { - ir::AttributeMap attribute_map = { - {"name", ir::StrAttribute::get(ctx, op_desc.OutputArgumentNames()[0])}, + pir::AttributeMap attribute_map = { + {"name", pir::StrAttribute::get(ctx, op_desc.OutputArgumentNames()[0])}, {"col", - ir::Int32Attribute::get(ctx, op_desc.GetAttrIfExists("col"))}, + pir::Int32Attribute::get(ctx, 
op_desc.GetAttrIfExists("col"))}, }; return attribute_map; } - std::vector GenerateOperationInput( - ir::IrContext* ctx, + std::vector GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program) override { + pir::Program* program) override { return {}; } }; struct DataOpTranscriber : public FeedOpTranscriber { - ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { int allocate_type = paddle::get(op_desc.GetAttr("place")); auto& attribute_translator = AttributeTranslator::instance(); - ir::Attribute shape = attribute_translator( + pir::Attribute shape = attribute_translator( "paddle::dialect::IntArrayAttribute", op_desc.GetAttr("shape")); - ir::AttributeMap attribute_map = { + pir::AttributeMap attribute_map = { {"name", - ir::StrAttribute::get(ctx, - op_desc.GetAttrIfExists("name"))}, + pir::StrAttribute::get(ctx, + op_desc.GetAttrIfExists("name"))}, {"shape", shape}, {"dtype", paddle::dialect::DataTypeAttribute::get(ctx, phi::DataType::FLOAT32)}, @@ -932,18 +936,18 @@ struct DataOpTranscriber : public FeedOpTranscriber { }; struct SplitOpTranscriber : public OpTranscriber { - std::vector GenerateOperationInput( - ir::IrContext* ctx, + std::vector GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program) override { + pir::Program* program) override { // input of split is [Tensor x, IntArray sections, Scalar(int) axis)] VLOG(10) << "[op:split][input] start"; - std::vector op_inputs; + std::vector op_inputs; // process first input auto x_input_vars = op_desc.Input("X"); IR_ENFORCE(x_input_vars.size() == 1, "x input of split MUST be a tensor"); @@ -963,7 +967,7 @@ struct SplitOpTranscriber : public OpTranscriber { op_inputs.push_back(combine_op->result(0)); } else { auto& attribute_translator = AttributeTranslator::instance(); - ir::Attribute new_attr = attribute_translator( + pir::Attribute new_attr = attribute_translator( "paddle::dialect::IntArrayAttribute", op_desc.GetAttr("sections")); auto sec_defin_op = InsertFullArrayOperationForAttributeInput(ctx, program, new_attr); @@ -982,8 +986,8 @@ struct SplitOpTranscriber : public OpTranscriber { op_inputs.push_back(axis_defining_info.value); } else { auto& attribute_translator = AttributeTranslator::instance(); - ir::Attribute new_attr = - attribute_translator("ir::Int32Attribute", op_desc.GetAttr("axis")); + pir::Attribute new_attr = + attribute_translator("pir::Int32Attribute", op_desc.GetAttr("axis")); auto sec_defin_op = InsertFullOperationForAttributeInput(ctx, program, new_attr); @@ -993,16 +997,16 @@ struct SplitOpTranscriber : public OpTranscriber { return op_inputs; } - ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { int num = paddle::get(op_desc.GetAttr("num")); if (num > 0) { - ir::AttributeMap attribute_map = { + pir::AttributeMap attribute_map = { {"num", - ir::Int32Attribute::get(ctx, op_desc.GetAttrIfExists("num"))}, + pir::Int32Attribute::get(ctx, 
             op_desc.GetAttrIfExists<int>("num"))},
     };
     return attribute_map;
@@ -1011,19 +1015,20 @@ struct SplitOpTranscriber : public OpTranscriber {
     return {};
   }

-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     int num = paddle::get<int>(op_desc.GetAttr("num"));
     std::string target_op_name;
     if (num > 0) {
-      target_op_name = "pd.split_with_num";
+      target_op_name = "pd_op.split_with_num";
     } else {
-      target_op_name = "pd.split";
+      target_op_name = "pd_op.split";
     }
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
-      IR_THROW("Op assign_value should have corresponding OpInfo pd.split");
+      IR_THROW("Op split should have corresponding OpInfo pd_op.split");
     }

     return op_info;
@@ -1031,10 +1036,10 @@ struct SplitOpTranscriber : public OpTranscriber {
 };

 struct FetchOpTranscriber : public OpTranscriber {
-  ir::Operation* operator()(ir::IrContext* ctx,
-                            TranslationContext* param_map,
-                            const OpDesc& op_desc,
-                            ir::Program* program) override {
+  pir::Operation* operator()(pir::IrContext* ctx,
+                             TranslationContext* param_map,
+                             const OpDesc& op_desc,
+                             pir::Program* program) override {
     auto op_info = this->LoopkUpOpInfo(ctx, op_desc);

     auto* op_info_concept =
@@ -1052,14 +1057,14 @@ struct FetchOpTranscriber : public OpTranscriber {
         ctx, param_map, op_desc, op_info.name(), input_infos, program);

     OpOutputTypeList op_output_types;
-    ir::AttributeMap attribute_map = {
-        {"name", ir::StrAttribute::get(ctx, op_desc.InputArgumentNames()[0])},
+    pir::AttributeMap attribute_map = {
+        {"name", pir::StrAttribute::get(ctx, op_desc.InputArgumentNames()[0])},
         {"col",
-         ir::Int32Attribute::get(ctx, op_desc.GetAttrIfExists<int>("col"))},
+         pir::Int32Attribute::get(ctx, op_desc.GetAttrIfExists<int>("col"))},
     };

     op_output_types.push_back(op_inputs[0].type());
-    ir::Operation* operation = ir::Operation::Create(
+    pir::Operation* operation = pir::Operation::Create(
         op_inputs, attribute_map, op_output_types, op_info);
     program->block()->push_back(operation);

@@ -1068,13 +1073,13 @@ struct FetchOpTranscriber : public OpTranscriber {
 };

 struct ShadowOutputOpTranscriber : public OpTranscriber {
-  ir::Operation* operator()(ir::IrContext* ctx,
-                            TranslationContext* param_map,
-                            const OpDesc& op_desc,
-                            ir::Program* program) override {
-    auto op_info = ctx->GetRegisteredOpInfo(ir::SetParameterOp::name());
+  pir::Operation* operator()(pir::IrContext* ctx,
+                             TranslationContext* param_map,
+                             const OpDesc& op_desc,
+                             pir::Program* program) override {
+    auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name());

-    std::vector<ir::OpResult> op_inputs;
+    std::vector<pir::OpResult> op_inputs;
     auto legacy_input_vars = op_desc.Input("x", true);

     auto defining_info = (*param_map)[legacy_input_vars[0]];
@@ -1086,14 +1091,14 @@ struct ShadowOutputOpTranscriber : public OpTranscriber {

     op_inputs.push_back(defining_info.value);

-    ir::AttributeMap attribute_map = {
+    pir::AttributeMap attribute_map = {
         {"parameter_name",
-         ir::StrAttribute::get(ctx,
-                               op_desc.GetAttrIfExists<std::string>("name"))},
+         pir::StrAttribute::get(ctx,
+                                op_desc.GetAttrIfExists<std::string>("name"))},
     };

-    ir::Operation* operation =
-        ir::Operation::Create(op_inputs, attribute_map, {}, op_info);
+    pir::Operation* operation =
+        pir::Operation::Create(op_inputs, attribute_map, {}, op_info);
     program->block()->push_back(operation);

     return operation;
@@ -1102,7 +1107,8 @@ struct ShadowOutputOpTranscriber : public OpTranscriber {

 // NOTE: add_n op in legacy ops doesn't have a kernel, so we use a new op
 // for now
 struct AddNOpTranscriber : public OpTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     std::string target_op_name =
         kTargetDialectPrefix + OpNameCompatibleMapping(op_desc.Type());
     if (IsInplace(op_desc)) {
@@ -1120,18 +1126,20 @@ struct AddNOpTranscriber : public OpTranscriber {
 };

 struct TrilAndTriuOpTranscriber : public OpTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     bool lower = PADDLE_GET_CONST(bool, op_desc.GetAttr("lower"));
     std::string target_op_name = "";
     if (lower) {
-      target_op_name = "pd.tril";
+      target_op_name = "pd_op.tril";
     } else {
-      target_op_name = "pd.triu";
+      target_op_name = "pd_op.triu";
     }
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
       IR_THROW(
-          "Op tril_triu should have corresponding OpInfo pd.tril or pd.triu.");
+          "Op tril_triu should have corresponding OpInfo pd_op.tril or "
+          "pd_op.triu.");
     }

     return op_info;
@@ -1139,27 +1147,28 @@ struct TrilAndTriuOpTranscriber : public OpTranscriber {
 };

 struct FillConstant2FullTranscriber : public OpTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     const auto& op_info = ctx->GetRegisteredOpInfo(dialect::FullOp::name());
     if (!op_info) {
-      IR_THROW("Op fill_constant should have corresponding OpInfo pd.full");
+      IR_THROW("Op fill_constant should have corresponding OpInfo pd_op.full");
     }

     return op_info;
   }

-  std::vector<ir::OpResult> GenerateOperationInput(
-      ir::IrContext* ctx,
+  std::vector<pir::OpResult> GenerateOperationInput(
+      pir::IrContext* ctx,
       TranslationContext* param_map,
       const OpDesc& op_desc,
       const std::string& normalized_op_name,
       const OpInputInfoList& input_infos,
-      ir::Program* program) override {
+      pir::Program* program) override {
     return {};
   }

-  ir::AttributeMap TranslateOpAttribute(
-      ir::IrContext* ctx,
+  pir::AttributeMap TranslateOpAttribute(
+      pir::IrContext* ctx,
       const std::string& normalized_op_name,
       const OpAttributeInfoList& op_attr_infos,
       const OpDesc& op_desc) override {
@@ -1168,9 +1177,9 @@ struct FillConstant2FullTranscriber : public OpTranscriber {
     float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value"));
     int dtype = PADDLE_GET_CONST(int, op_desc.GetAttr("dtype"));

-    auto attr_value = ir::FloatAttribute::get(ctx, value);
+    auto attr_value = pir::FloatAttribute::get(ctx, value);

-    ir::AttributeMap attribute_map = {
+    pir::AttributeMap attribute_map = {
         {"shape",
          attribute_translator("paddle::dialect::IntArrayAttribute",
                               shape_attr)},
@@ -1181,14 +1190,6 @@ struct FillConstant2FullTranscriber : public OpTranscriber {
              paddle::dialect::VarTypeToDataType(
                  static_cast<paddle::framework::proto::VarType_Type>(dtype)))}};

-    if (op_desc.HasAttr("force_cpu")) {
-      bool force_cpu = PADDLE_GET_CONST(bool, op_desc.GetAttr("force_cpu"));
-      if (force_cpu) {
-        attribute_map["place"] =
-            paddle::dialect::PlaceAttribute::get(ctx, phi::CPUPlace());
-      }
-    }
-
     int place_type = PADDLE_GET_CONST(int, op_desc.GetAttr("place_type"));
     switch (place_type) {
       case -1:
@@ -1212,30 +1213,40 @@
             paddle::dialect::PlaceAttribute::get(ctx, phi::XPUPlace());
         break;
     }
+
+    if (op_desc.HasAttr("force_cpu")) {
+      bool force_cpu = PADDLE_GET_CONST(bool, op_desc.GetAttr("force_cpu"));
+      if (force_cpu) {
attribute_map["place"] = + paddle::dialect::PlaceAttribute::get(ctx, phi::CPUPlace()); + } + } + return attribute_map; } }; struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { - ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override { - const auto& op_info = ctx->GetRegisteredOpInfo("pd.full_with_tensor"); + pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, + const OpDesc& op_desc) override { + const auto& op_info = ctx->GetRegisteredOpInfo("pd_op.full_with_tensor"); if (!op_info) { IR_THROW( "Op fill_constant should have corresponding OpInfo " - "pd.full_with_tensor"); + "pd_op.full_with_tensor"); } return op_info; } - std::vector GenerateOperationInput( - ir::IrContext* ctx, + std::vector GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program) override { - std::vector op_inputs; + pir::Program* program) override { + std::vector op_inputs; if (op_desc.HasInput("ShapeTensor", true) && op_desc.Input("ShapeTensor", true).size() > 0) { auto shape_tensor_vars = op_desc.Input("ShapeTensor", true); @@ -1250,7 +1261,7 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { } else { auto& attribute_translator = AttributeTranslator::instance(); paddle::framework::Attribute shape_attr = op_desc.GetAttr("shape"); - ir::Attribute new_attr = attribute_translator( + pir::Attribute new_attr = attribute_translator( "paddle::dialect::IntArrayAttribute", shape_attr); auto defining_op = InsertFullArrayOperationForAttributeInput(ctx, program, new_attr); @@ -1264,7 +1275,7 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { op_inputs.push_back(defining_info.value); } else { float value = PADDLE_GET_CONST(float, op_desc.GetAttr("value")); - ir::Attribute new_attr = ir::FloatAttribute::get(ctx, value); + pir::Attribute new_attr = pir::FloatAttribute::get(ctx, value); auto defining_op = InsertFullOperationForAttributeInput(ctx, program, new_attr); op_inputs.push_back(defining_op->result(0)); @@ -1272,14 +1283,14 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { return op_inputs; } - ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc) override { int dtype = PADDLE_GET_CONST(int, op_desc.GetAttr("dtype")); - ir::AttributeMap attribute_map = { + pir::AttributeMap attribute_map = { {"dtype", paddle::dialect::DataTypeAttribute::get( ctx, @@ -1290,10 +1301,10 @@ struct FillConstant2FullWithTensorTranscriber : public OpTranscriber { }; struct FillConstantTranscriber : public OpTranscriber { - ir::Operation* operator()(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - ir::Program* program) override { + pir::Operation* operator()(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + pir::Program* program) override { bool has_mutable_attribute = op_desc.HasInput("ShapeTensor", true) && op_desc.Input("ShapeTensor", true).size() > 0; has_mutable_attribute |= op_desc.HasInput("ShapeTensorList", true) && @@ -1310,12 +1321,13 @@ struct FillConstantTranscriber : public OpTranscriber { } }; -ir::OpResult TranslateNumClassesForOneHot(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - const std::string& normalized_op_name, - const 
OpInputInfo& input_info, - ir::Program* program) { +pir::OpResult TranslateNumClassesForOneHot( + pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + const std::string& normalized_op_name, + const OpInputInfo& input_info, + pir::Program* program) { const std::string legacy_attr_name = "depth"; const std::string legacy_tensor_name = "depth_tensor"; std::vector legacy_vars; @@ -1343,9 +1355,9 @@ ir::OpResult TranslateNumClassesForOneHot(ir::IrContext* ctx, paddle::framework::Attribute legacy_attr = op_desc.GetAttr(legacy_attr_name); VLOG(10) << "[" << op_desc.Type() << "][attribute]" << " name: " << legacy_attr_name << " " << legacy_attr.index(); - ir::Attribute new_attr = attribute_translator(legacy_attr); + pir::Attribute new_attr = attribute_translator(legacy_attr); - ir::Operation* defining_op = + pir::Operation* defining_op = InsertFullOperationForAttributeInput(ctx, program, new_attr); return defining_op->result(0); } @@ -1360,16 +1372,16 @@ struct OneHotTranscriber : public OpTranscriber { }; }; -ir::Attribute TranslateReduceAll(ir::IrContext* ctx, - const OpDesc& op_desc, - const OpAttributeInfo& attr_info) { +pir::Attribute TranslateReduceAll(pir::IrContext* ctx, + const OpDesc& op_desc, + const OpAttributeInfo& attr_info) { bool reduce_all = false; if (op_desc.HasAttr("reduce_all")) { reduce_all = paddle::get(op_desc.GetAttr("reduce_all")); } if (reduce_all) { - return ir::ArrayAttribute::get(ctx, std::vector{}); + return pir::ArrayAttribute::get(ctx, std::vector{}); } auto& attribute_translator = AttributeTranslator::instance(); @@ -1391,13 +1403,13 @@ struct ReduceOpTranscriber : public OpTranscriber { }; struct ElementwiseTranscriber : public OpTranscriber { - std::vector GenerateOperationInput( - ir::IrContext* ctx, + std::vector GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program) override { + pir::Program* program) override { int axis = paddle::get(op_desc.GetAttr("axis")); if (axis == -1) { @@ -1421,12 +1433,12 @@ struct ElementwiseTranscriber : public OpTranscriber { ctx, param_map, program, x_defining_info, x_name); x_defining_info = param_map->at(x_name); } - ir::OpResult x_value = x_defining_info.value; + pir::OpResult x_value = x_defining_info.value; IR_ENFORCE(x_value, "Expected op[%s]'s input %s is not null", op_desc.Type(), x_name); - ir::Type x_type = x_value.type(); + pir::Type x_type = x_value.type(); IR_ENFORCE(x_type.isa(), "Expected op[%s]'s input %s is DenseTensor but got %s", op_desc.Type(), @@ -1452,12 +1464,12 @@ struct ElementwiseTranscriber : public OpTranscriber { ctx, param_map, program, y_defining_info, y_name); y_defining_info = param_map->at(y_name); } - ir::OpResult y_value = y_defining_info.value; + pir::OpResult y_value = y_defining_info.value; IR_ENFORCE(y_value, "Expected op[%s]'s input %s is not null", op_desc.Type(), y_name); - ir::Type y_type = y_value.type(); + pir::Type y_type = y_value.type(); IR_ENFORCE(y_type.isa(), "Expected op[%s]'s input %s is DenseTensor but got %s", op_desc.Type(), @@ -1482,8 +1494,8 @@ struct ElementwiseTranscriber : public OpTranscriber { axis, append_size); - ir::Builder builder(ctx, program->block()); - ir::OpResult y_new; + pir::Builder builder(ctx, program->block()); + pir::OpResult y_new; if (std::find(y_shape.begin(), y_shape.end(), -1) == y_shape.end()) { std::vector y_new_shape(y_shape); for (int i = 0; i <= append_size; i++) { @@ 
@@ -1500,8 +1512,8 @@ struct ElementwiseTranscriber : public OpTranscriber {
           std::vector<int64_t>(append_size, 1),
           phi::DataType::INT64,
           phi::CPUPlace());
-      auto y_true_shape_op = builder.Build<ir::CombineOp>(
-          std::vector<ir::OpResult>{shape_op.out(), append_shape_op.out()});
+      auto y_true_shape_op = builder.Build<pir::CombineOp>(
+          std::vector<pir::OpResult>{shape_op.out(), append_shape_op.out()});
       auto concat_op =
           builder.Build<dialect::ConcatOp>(y_true_shape_op.out(), 0);
       auto y_new_shape = concat_op.out();
@@ -1513,12 +1525,14 @@ struct ElementwiseTranscriber : public OpTranscriber {
 };

 struct GradAddOpTranscriber : public ElementwiseTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
-    const std::string& target_op_name = "pd.add";
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
+    const std::string& target_op_name = "pd_op.add";
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
       IR_THROW(
-          "Op assign_value should have corresponding OpInfo pd.assign_value_");
+          "Op grad_add should have corresponding OpInfo pd_op.add");
     }

     return op_info;
@@ -1526,10 +1540,10 @@ struct GradAddOpTranscriber : public ElementwiseTranscriber {
 };

 struct ElementwiseGradTranscriber : public OpTranscriber {
-  void RecordOpResultMapping(ir::IrContext* ctx,
+  void RecordOpResultMapping(pir::IrContext* ctx,
                              TranslationContext* param_map,
                              const OpDesc& op_desc,
-                             ir::Operation* operation,
+                             pir::Operation* operation,
                              const OpOutputMapping& arg_to_idx) override {
     OpTranscriber::RecordOpResultMapping(
         ctx, param_map, op_desc, operation, arg_to_idx);
@@ -1566,12 +1580,12 @@ struct ElementwiseGradTranscriber : public OpTranscriber {
                op_desc.Type(),
                y_name);
     auto y_defining_info = param_map->at(y_name);
-    ir::OpResult y_value = y_defining_info.value;
+    pir::OpResult y_value = y_defining_info.value;
     IR_ENFORCE(y_value,
                "Expected op[%s]'s input %s is not null",
                op_desc.Type(),
                y_name);
-    ir::Type y_type = y_value.type();
+    pir::Type y_type = y_value.type();
     IR_ENFORCE(y_type.isa<dialect::DenseTensorType>(),
                "Expected op[%s]'s input %s is DenseTensor but got %s",
                op_desc.Type(),
@@ -1581,8 +1595,8 @@ struct ElementwiseGradTranscriber : public OpTranscriber {
         y_type.dyn_cast<dialect::DenseTensorType>();
     std::vector<int64_t> y_shape = phi::vectorize(y_tensor_type.dims());

-    ir::OpResult value = operation->result(idx_in_op);
-    ir::Builder builder(ctx, operation->GetParent());
+    pir::OpResult value = operation->result(idx_in_op);
+    pir::Builder builder(ctx, operation->GetParent());
     auto reshape_op = builder.Build<dialect::ReshapeOp>(value, y_shape);
     (*param_map)[y_grad_var_name] =
         VariableDefiningInfo(reshape_op.out(), false, -1);
@@ -1590,10 +1604,10 @@ struct ElementwiseGradTranscriber : public OpTranscriber {
 };

 struct SetValueOpTranscriber : public OpTranscriber {
-  ir::OpResult GetAttributeAsInput(ir::IrContext* ctx,
-                                   ir::Program* program,
-                                   const OpDesc& op_desc,
-                                   const OpInputInfo& input_info) override {
+  pir::OpResult GetAttributeAsInput(pir::IrContext* ctx,
+                                    pir::Program* program,
+                                    const OpDesc& op_desc,
+                                    const OpInputInfo& input_info) override {
     auto& attribute_translator = AttributeTranslator::instance();
     auto& op_normalizer = OpNameNormalizer::instance();
@@ -1608,23 +1622,24 @@ struct SetValueOpTranscriber : public OpTranscriber {
     framework::Attribute legacy_attr = op_desc.GetAttr(legacy_attr_name);
     VLOG(10) << "[" << op_desc.Type() << "][attribute]"
              << " name: " << legacy_attr_name << " " << legacy_attr.index();
-    ir::Attribute new_attr =
+    pir::Attribute new_attr =
         attribute_translator("paddle::dialect::IntArrayAttribute", legacy_attr);
-    ir::Operation* defining_op =
+    pir::Operation* defining_op =
         InsertFullArrayOperationForAttributeInput(ctx, program, new_attr);
     return defining_op->result(0);
   }
 };

 struct SetValueWithTensorOpTranscriber : public SetValueOpTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     std::string target_op_name = dialect::SetValueWithTensorOp::name();
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
       IR_THROW(
           "Op set_value should have corresponding OpInfo "
-          "pd.set_value_with_tensor");
+          "pd_op.set_value_with_tensor");
     }

     return op_info;
@@ -1635,12 +1650,12 @@ struct SetValueWithTensorOpTranscriber : public SetValueOpTranscriber {
     if (input_name != "values") {
       return nullptr;
     }
-    return [](ir::IrContext* ctx,
+    return [](pir::IrContext* ctx,
               TranslationContext* param_map,
               const OpDesc& op_desc,
               const std::string&,
               const OpInputInfo& info,
-              ir::Program* program) -> ir::OpResult {
+              pir::Program* program) -> pir::OpResult {
       std::vector<std::string> legacy_input_vars;
       IR_ENFORCE(op_desc.HasInput("ValueTensor"),
                  "[set_value] should have ValueTensor");
@@ -1662,13 +1677,14 @@ struct SetValueWithTensorOpTranscriber : public SetValueOpTranscriber {
 };

 struct SetValueGradOpTranscriber : public SetValueWithTensorOpTranscriber {
-  ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc) override {
+  pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx,
+                            const OpDesc& op_desc) override {
     std::string target_op_name = dialect::SetValueGradOp::name();
     const auto& op_info = ctx->GetRegisteredOpInfo(target_op_name);
     if (!op_info) {
       IR_THROW(
           "Op set_value_grad should have corresponding OpInfo "
-          "pd.set_value_grad");
+          "pd_op.set_value_grad");
     }

     return op_info;
@@ -1676,10 +1692,10 @@ struct SetValueGradOpTranscriber : public SetValueWithTensorOpTranscriber {
 };

 struct LegacySetValueDispatcher : public OpTranscriber {
-  ir::Operation* operator()(ir::IrContext* ctx,
-                            TranslationContext* param_map,
-                            const OpDesc& op_desc,
-                            ir::Program* program) override {
+  pir::Operation* operator()(pir::IrContext* ctx,
+                             TranslationContext* param_map,
+                             const OpDesc& op_desc,
+                             pir::Program* program) override {
     std::vector<std::string> legacy_input_vars;

     // if op has input with name "ValueTensor", then use that input as value
@@ -1698,8 +1714,8 @@ struct LegacySetValueDispatcher : public OpTranscriber {
 };

 OpTranslator::OpTranslator() {
-  ir::IrContext* ctx = ir::IrContext::Instance();
-  ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();

   general_handler = OpTranscriber();
   special_handlers["add_n"] = AddNOpTranscriber();
diff --git a/paddle/fluid/ir_adaptor/translator/op_translator.h b/paddle/fluid/ir_adaptor/translator/op_translator.h
index afc7566be12b3..2ae6643999b8d 100644
--- a/paddle/fluid/ir_adaptor/translator/op_translator.h
+++ b/paddle/fluid/ir_adaptor/translator/op_translator.h
@@ -20,12 +20,12 @@
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h"
 #include "paddle/fluid/ir_adaptor/translator/program_translator.h"
-#include "paddle/ir/core/ir_context.h"
-#include "paddle/ir/core/operation.h"
-#include "paddle/ir/core/program.h"
-#include "paddle/ir/core/value.h"
+#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
+#include "paddle/pir/core/ir_context.h"
"paddle/pir/core/ir_context.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace translator { @@ -41,7 +41,7 @@ struct OpTranscriber { using IdxInVector = size_t; using ResultIdx = std::tuple; using OpDesc = paddle::framework::OpDesc; - using OpOutputTypeList = std::vector; + using OpOutputTypeList = std::vector; using OpOutputMapping = std::unordered_map; using OpInputInfo = dialect::OpInputInfo; using OpInputInfoList = std::vector; @@ -49,51 +49,51 @@ struct OpTranscriber { using OpAttributeInfoList = std::vector; using OpOutputInfo = dialect::OpOutputInfo; using OpOutputInfoList = std::vector; - using InputHandlerFn = std::function; - using AttributeHandlerFn = std::function; + using InputHandlerFn = std::function; + using AttributeHandlerFn = std::function; public: - virtual ir::Operation* operator()(ir::IrContext* ctx, - TranslationContext* param_map, - const OpDesc& op_desc, - ir::Program* program); + virtual pir::Operation* operator()(pir::IrContext* ctx, + TranslationContext* param_map, + const OpDesc& op_desc, + pir::Program* program); public: - virtual ir::OpInfo LoopkUpOpInfo(ir::IrContext* ctx, const OpDesc& op_desc); - virtual std::vector GenerateOperationInput( - ir::IrContext* ctx, + virtual pir::OpInfo LoopkUpOpInfo(pir::IrContext* ctx, const OpDesc& op_desc); + virtual std::vector GenerateOperationInput( + pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const std::string& normalized_op_name, const OpInputInfoList& input_infos, - ir::Program* program); + pir::Program* program); virtual std::tuple GenerateOperationOutput( - ir::IrContext* ctx, + pir::IrContext* ctx, const OpDesc& op_desc, const OpOutputInfoList& output_infos); - virtual void HandleNonexistentAttribute(ir::IrContext*, - ir::AttributeMap* attribute_map, + virtual void HandleNonexistentAttribute(pir::IrContext*, + pir::AttributeMap* attribute_map, const OpAttributeInfo& info); - virtual ir::AttributeMap TranslateOpAttribute( - ir::IrContext* ctx, + virtual pir::AttributeMap TranslateOpAttribute( + pir::IrContext* ctx, const std::string& normalized_op_name, const OpAttributeInfoList& op_attr_infos, const OpDesc& op_desc); - virtual ir::OpResult GetAttributeAsInput(ir::IrContext* ctx, - ir::Program* program, - const OpDesc& op_desc, - const OpInputInfo& input_info); + virtual pir::OpResult GetAttributeAsInput(pir::IrContext* ctx, + pir::Program* program, + const OpDesc& op_desc, + const OpInputInfo& input_info); - virtual void RecordOpResultMapping(ir::IrContext* ctx, + virtual void RecordOpResultMapping(pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, - ir::Operation* operation, + pir::Operation* operation, const OpOutputMapping& arg_to_idx); public: @@ -105,11 +105,11 @@ struct OpTranscriber { const std::string& input_name) { return nullptr; } - virtual void InsertSliceOperationForInput(ir::IrContext* ctx, + virtual void InsertSliceOperationForInput(pir::IrContext* ctx, TranslationContext* param_map, const OpDesc& op_desc, const OpInputInfoList& input_infos, - ir::Program* program); + pir::Program* program); }; class OpTranslator { @@ -118,8 +118,8 @@ class OpTranslator { using OpDesc = paddle::framework::OpDesc; using BlockDesc = paddle::framework::BlockDesc; using VarDesc = paddle::framework::VarDesc; - using OpTranslateFn = std::function; + using OpTranslateFn = std::function; private: OpTranslator(); // Disallow instantiation outside of the class. 
diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.cc b/paddle/fluid/ir_adaptor/translator/program_translator.cc
index 9065554781265..678a79a5540b8 100644
--- a/paddle/fluid/ir_adaptor/translator/program_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/program_translator.cc
@@ -17,21 +17,21 @@
 #include

 #include "glog/logging.h"
-
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
+#include "paddle/fluid/ir_adaptor/translator/attribute_translator.h"
 #include "paddle/fluid/ir_adaptor/translator/op_translator.h"
 #include "paddle/fluid/ir_adaptor/translator/type_translator.h"
 #include "paddle/fluid/ir_adaptor/translator/utils.h"
-#include "paddle/ir/core/attribute.h"
-#include "paddle/ir/core/block.h"
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/builtin_op.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/enforce.h"
-#include "paddle/ir/core/operation.h"
-#include "paddle/ir/core/value.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/pir/core/attribute.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_op.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/operation.h"
+#include "paddle/pir/core/value.h"

 namespace paddle {
 namespace translator {
@@ -46,9 +46,9 @@ const std::unordered_set<std::string> ProgramTranslator::no_cast_var_names = {
 };

 ProgramTranslator::ProgramTranslator(const ProgramDesc* legacy_program,
-                                     ir::Program* program)
+                                     pir::Program* program)
     : legacy_program_(legacy_program), program_(program) {
-  ctx_ = ir::IrContext::Instance();
+  ctx_ = pir::IrContext::Instance();
 }

 void ProgramTranslator::Translate() {
@@ -84,31 +84,31 @@ void ProgramTranslator::Translate() {
   }
 }

-inline ir::Operation* InsertGetParamaterOp(ir::IrContext* ctx,
-                                           const VarDesc* var) {
+inline pir::Operation* InsertGetParamaterOp(pir::IrContext* ctx,
+                                            const VarDesc* var) {
   auto& type_translator = TypeTranslator::instance();

-  std::string get_parameter_op_name(ir::GetParameterOp::name());
-  ir::OpInfo op_info = ctx->GetRegisteredOpInfo(get_parameter_op_name);
-  std::unordered_map<std::string, ir::Attribute> op_attribute_map = {
-      {"parameter_name", ir::StrAttribute::get(ctx, var->Name())},
+  std::string get_parameter_op_name(pir::GetParameterOp::name());
+  pir::OpInfo op_info = ctx->GetRegisteredOpInfo(get_parameter_op_name);
+  std::unordered_map<std::string, pir::Attribute> op_attribute_map = {
+      {"parameter_name", pir::StrAttribute::get(ctx, var->Name())},
   };

-  ir::Type translated_var_type = type_translator[var->GetType()](ctx, *var);
-  ir::Operation* operation = ir::Operation::Create(
+  pir::Type translated_var_type = type_translator[var->GetType()](ctx, *var);
+  pir::Operation* operation = pir::Operation::Create(
       {}, op_attribute_map, {translated_var_type}, op_info);
   return operation;
 }

-inline ir::Operation* InsertSetParamaterOp(ir::IrContext* ctx,
-                                           ir::OpResult defining_op_result,
-                                           const VarDesc* var) {
-  std::string set_parameter_op_name(ir::SetParameterOp::name());
-  ir::OpInfo op_info = ctx->GetRegisteredOpInfo(set_parameter_op_name);
-  std::unordered_map<std::string, ir::Attribute> op_attribute_map = {
-      {"parameter_name", ir::StrAttribute::get(ctx, var->Name())},
+inline pir::Operation* InsertSetParamaterOp(pir::IrContext* ctx,
+                                            pir::OpResult defining_op_result,
+                                            const VarDesc* var) {
+  std::string set_parameter_op_name(pir::SetParameterOp::name());
+  pir::OpInfo op_info = ctx->GetRegisteredOpInfo(set_parameter_op_name);
+  std::unordered_map<std::string, pir::Attribute> op_attribute_map = {
+      {"parameter_name", pir::StrAttribute::get(ctx, var->Name())},
   };
-  ir::Operation* operation = ir::Operation::Create(
+  pir::Operation* operation = pir::Operation::Create(
       {defining_op_result}, op_attribute_map, {}, op_info);
   return operation;
 }
@@ -149,7 +149,7 @@ void ProgramTranslator::GetParameterForSingleBlock(const BlockDesc& block) {
           var_desc,
           phi::errors::PreconditionNotMet(
               "VarDesc of [%s] can not be nullptr", var_name));
-      ir::Operation* op = InsertGetParamaterOp(ctx_, var_desc);
+      pir::Operation* op = InsertGetParamaterOp(ctx_, var_desc);
       program_->block()->push_back(op);
       param_map_[var_name] = VariableDefiningInfo(op->result(0));
       VLOG(10) << "[op translated][get parameter]" << var_name;
@@ -178,7 +178,7 @@ void ProgramTranslator::InsertOperationToSingleBlock(const BlockDesc& block) {
         continue;
       }
     }
-    ir::Operation* operation = fn(ctx_, &param_map_, *op, program_);
+    pir::Operation* operation = fn(ctx_, &param_map_, *op, program_);
     VLOG(10) << "[op translated][special]" << operation;
   }
 }
@@ -203,7 +203,7 @@ void ProgramTranslator::SetParameterFromSingleBlock(const BlockDesc& block) {
       need_set_parameter_op &= (param_map_.count(var_name) != 0);
       need_set_parameter_op &= (!set_input_var_names.count(var_name));
       if (need_set_parameter_op) {
-        ir::OpResult defining_op_result = param_map_[var_name].value;
+        pir::OpResult defining_op_result = param_map_[var_name].value;
         if (!defining_op_result) {
           continue;
         }
@@ -214,11 +214,11 @@ void ProgramTranslator::SetParameterFromSingleBlock(const BlockDesc& block) {
           defining_op_result = param_map_.at(var_name).value;
         }

-        ir::Operation* op = InsertSetParamaterOp(
+        pir::Operation* op = InsertSetParamaterOp(
             ctx_, defining_op_result, parameter_name_mappings_[var_name]);

-        ir::Block* block = program_->block();
-        ir::Block::iterator insert_pos = std::find(
+        pir::Block* block = program_->block();
+        pir::Block::iterator insert_pos = std::find(
             block->begin(), block->end(), defining_op_result.owner());

         IR_ENFORCE(
@@ -249,7 +249,7 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue(
     if (var == nullptr) {
       continue;
     }
-    ir::OpResult value = value_info.value;
+    pir::OpResult value = value_info.value;
     if (!value) {
       PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Value of [%s] can not be None", var_name));
@@ -261,19 +261,19 @@ void ProgramTranslator::SetStopGradientAttributeForAllValue(
             "Defining operator of [%s] can not be nullptr", var_name));
     VLOG(8) << "[op translated][stop gradient]" << var_name
             << " from: " << defining_op->name();
-    std::vector<ir::Attribute> stop_gradients;
+    std::vector<pir::Attribute> stop_gradients;
     if (defining_op->HasAttribute(kAttrStopGradients)) {
       stop_gradients = defining_op->attribute(kAttrStopGradients)
-                           .dyn_cast<ir::ArrayAttribute>()
+                           .dyn_cast<pir::ArrayAttribute>()
                            .AsVector();
     } else {
-      stop_gradients = std::vector<ir::Attribute>(
-          defining_op->num_results(), ir::BoolAttribute::get(ctx_, false));
+      stop_gradients = std::vector<pir::Attribute>(
+          defining_op->num_results(), pir::BoolAttribute::get(ctx_, false));
     }
     stop_gradients[value.GetResultIndex()] =
-        ir::BoolAttribute::get(ctx_, var->StopGradient());
+        pir::BoolAttribute::get(ctx_, var->StopGradient());
     defining_op->set_attribute(kAttrStopGradients,
-                               ir::ArrayAttribute::get(ctx_, stop_gradients));
+                               pir::ArrayAttribute::get(ctx_, stop_gradients));
   }
 }

@@ -288,7 +288,7 @@ void ProgramTranslator::SetIsPersisableAttributeForAllValue(
     if (var == nullptr) {
       continue;
     }
-    ir::OpResult value = value_info.value;
+    pir::OpResult value = value_info.value;
     if (!value) {
       PADDLE_THROW(phi::errors::PreconditionNotMet(
           "Value of [%s] can not be None", var_name));
@@ -300,19 +300,19 @@ void ProgramTranslator::SetIsPersisableAttributeForAllValue(
             "Defining operator of [%s] can not be nullptr", var_name));
     VLOG(8) << "[op translated][is persisable]" << var_name
             << " from: " << defining_op->name();
-    std::vector<ir::Attribute> is_persisable;
+    std::vector<pir::Attribute> is_persisable;
     if (defining_op->HasAttribute(kAttrIsPersisable)) {
       is_persisable = defining_op->attribute(kAttrIsPersisable)
-                          .dyn_cast<ir::ArrayAttribute>()
+                          .dyn_cast<pir::ArrayAttribute>()
                           .AsVector();
     } else {
-      is_persisable = std::vector<ir::Attribute>(
-          defining_op->num_results(), ir::BoolAttribute::get(ctx_, false));
+      is_persisable = std::vector<pir::Attribute>(
+          defining_op->num_results(), pir::BoolAttribute::get(ctx_, false));
     }
     is_persisable[value.GetResultIndex()] =
-        ir::BoolAttribute::get(ctx_, var->Persistable());
+        pir::BoolAttribute::get(ctx_, var->Persistable());
     defining_op->set_attribute(kAttrIsPersisable,
-                               ir::ArrayAttribute::get(ctx_, is_persisable));
+                               pir::ArrayAttribute::get(ctx_, is_persisable));
   }
 }
diff --git a/paddle/fluid/ir_adaptor/translator/program_translator.h b/paddle/fluid/ir_adaptor/translator/program_translator.h
index 88901376ae3cb..02ee94d7dd0cd 100644
--- a/paddle/fluid/ir_adaptor/translator/program_translator.h
+++ b/paddle/fluid/ir_adaptor/translator/program_translator.h
@@ -18,17 +18,18 @@
 #include
 #include
 #include
-
+#include "paddle/fluid/framework/op_call_stack.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/ir/core/ir_context.h"
-#include "paddle/ir/core/program.h"
-#include "paddle/ir/core/value.h"
+#include "paddle/pir/core/ir_context.h"
+#include "paddle/pir/core/program.h"
+#include "paddle/pir/core/value.h"

 namespace paddle {
 namespace translator {

 struct VariableDefiningInfo {
-  VariableDefiningInfo(ir::OpResult value,
+  VariableDefiningInfo(pir::OpResult value,
                        bool generated_by_vector = false,
                        int idx_in_vector = -1)
       : value(value),
@@ -36,7 +37,7 @@ struct VariableDefiningInfo {
         idx_in_vector(idx_in_vector) {}
   VariableDefiningInfo() {}

-  ir::OpResult value;
+  pir::OpResult value;

   bool generated_by_vector = false;
   // true if target variable is generated by Vector
@@ -54,14 +55,14 @@ class ProgramTranslator {

 public:
   explicit ProgramTranslator(const ProgramDesc* legacy_program,
-                             ir::Program* program);
+                             pir::Program* program);

   void Translate();

 private:
   const ProgramDesc* legacy_program_;  // not owned
-  ir::Program* program_;               // not owned
-  ir::IrContext* ctx_;                 // not owned
+  pir::Program* program_;              // not owned
+  pir::IrContext* ctx_;                // not owned

   TranslationContext param_map_;
   std::unordered_map<std::string, VarDesc*> parameter_name_mappings_;
diff --git a/paddle/fluid/ir_adaptor/translator/translate.cc b/paddle/fluid/ir_adaptor/translator/translate.cc
index 87bef41641a5f..0f98e557743fc 100644
--- a/paddle/fluid/ir_adaptor/translator/translate.cc
+++ b/paddle/fluid/ir_adaptor/translator/translate.cc
@@ -17,20 +17,20 @@
 #include

 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h"
 #include "paddle/fluid/ir_adaptor/translator/program_translator.h"
-#include "paddle/ir/core/builtin_dialect.h"
-#include "paddle/ir/core/program.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/pir/core/builtin_dialect.h"
+#include "paddle/pir/core/program.h"

 namespace paddle {

 using LegacyProgramDesc = ::paddle::framework::ProgramDesc;
-using Program = ::ir::Program;
+using Program = pir::Program;

 std::unique_ptr<Program> TranslateLegacyProgramToProgram(
     const LegacyProgramDesc& legacy_program) {
-  ir::IrContext* ctx = ir::IrContext::Instance();
-  ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
+  pir::IrContext* ctx = pir::IrContext::Instance();
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
   auto program = std::make_unique<Program>(ctx);
   translator::ProgramTranslator program_translator(&legacy_program,
                                                    program.get());
diff --git a/paddle/fluid/ir_adaptor/translator/translate.h b/paddle/fluid/ir_adaptor/translator/translate.h
index 8f604a47761fc..47ad12003f807 100644
--- a/paddle/fluid/ir_adaptor/translator/translate.h
+++ b/paddle/fluid/ir_adaptor/translator/translate.h
@@ -17,12 +17,12 @@
 #include

 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h"
-#include "paddle/ir/core/program.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/pir/core/program.h"

 namespace paddle {

-std::unique_ptr<::ir::Program> TranslateLegacyProgramToProgram(
+std::unique_ptr<::pir::Program> TranslateLegacyProgramToProgram(
     const ::paddle::framework::ProgramDesc& legacy_program);

 }  // namespace paddle
diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.cc b/paddle/fluid/ir_adaptor/translator/type_translator.cc
index 5c3cbdbc240ce..ef1dbf543c671 100644
--- a/paddle/fluid/ir_adaptor/translator/type_translator.cc
+++ b/paddle/fluid/ir_adaptor/translator/type_translator.cc
@@ -15,9 +15,9 @@
 #include "paddle/fluid/ir_adaptor/translator/type_translator.h"

 #include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h"
-#include "paddle/ir/core/builtin_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
+#include "paddle/fluid/pir/dialect/operator/ir/type_storage.h"
+#include "paddle/pir/core/builtin_type.h"

 namespace paddle {
 namespace translator {
@@ -34,59 +34,59 @@ using SelectedRowsTypeStorage = paddle::dialect::SelectedRowsTypeStorage;
 TypeTranslator::TypeTranslator() {
   handlers = {
       {VarType::BOOL,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::BoolType::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::BoolType::get(ctx);
        }},
       {VarType::UINT8,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::UInt8Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::UInt8Type::get(ctx);
        }},
       {VarType::INT8,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Int8Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Int8Type::get(ctx);
        }},
       {VarType::INT16,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Int16Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Int16Type::get(ctx);
        }},
       {VarType::INT32,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Int32Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Int32Type::get(ctx);
        }},
       {VarType::INT64,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Int64Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Int64Type::get(ctx);
        }},
       {VarType::FP16,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Float16Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Float16Type::get(ctx);
        }},
       {VarType::FP32,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Float32Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Float32Type::get(ctx);
        }},
       {VarType::FP64,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Float64Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Float64Type::get(ctx);
        }},
       {VarType::BF16,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::BFloat16Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::BFloat16Type::get(ctx);
        }},
       {VarType::COMPLEX64,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Complex64Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Complex64Type::get(ctx);
        }},
       {VarType::COMPLEX128,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
-         return ir::Complex128Type::get(ctx);
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
+         return pir::Complex128Type::get(ctx);
        }},
       {VarType::LOD_TENSOR,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
          VLOG(10) << "[vartype translating]"
                   << "[" << var_desc.Name() << "] from LOD_TENSOR";
-         ir::Type dtype =
+         pir::Type dtype =
              this->operator[](var_desc.GetDataType())(ctx, var_desc);
          DenseTensorTypeStorage::Dim dim = phi::make_ddim(var_desc.GetShape());
          DenseTensorTypeStorage::DataLayout layout =
@@ -96,18 +96,18 @@ TypeTranslator::TypeTranslator() {
          return DenseTensorType::get(ctx, dtype, dim, layout, lod, offset);
        }},
       {VarType::LOD_TENSOR_ARRAY,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
          VLOG(10) << "[vartype translating]"
                   << "[" << var_desc.Name() << "] from LOD_TENSOR_ARRAY";

-         return ir::VectorType::get(ctx, std::vector<ir::Type>{});
+         return pir::VectorType::get(ctx, std::vector<pir::Type>{});
        }},
       {VarType::SELECTED_ROWS,
-       [&](ir::IrContext* ctx, const VarDesc& var_desc) -> ir::Type {
+       [&](pir::IrContext* ctx, const VarDesc& var_desc) -> pir::Type {
          VLOG(10) << "[vartype translating]"
                   << "[" << var_desc.Name() << "] from SELECTED_ROWS";

-         ir::Type dtype =
+         pir::Type dtype =
              this->operator[](var_desc.GetDataType())(ctx, var_desc);
          SelectedRowsTypeStorage::Dim dim = phi::make_ddim(var_desc.GetShape());
@@ -115,7 +115,7 @@ TypeTranslator::TypeTranslator() {
              SelectedRowsTypeStorage::DataLayout::UNDEFINED;
          SelectedRowsTypeStorage::LoD lod = {};
          size_t offset = 0;
-         ir::Type SelectedRows =
+         pir::Type SelectedRows =
              SelectedRowsType::get(ctx, dtype, dim, layout, lod, offset);
          return SelectedRows;
        }},
diff --git a/paddle/fluid/ir_adaptor/translator/type_translator.h b/paddle/fluid/ir_adaptor/translator/type_translator.h
index d93be9a9db371..255795c92d807 100644
--- a/paddle/fluid/ir_adaptor/translator/type_translator.h
+++ b/paddle/fluid/ir_adaptor/translator/type_translator.h
@@ -20,15 +20,15 @@
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/ir_adaptor/translator/program_translator.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/dialect.h"
-#include "paddle/ir/core/ir_context.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/dialect.h"
+#include "paddle/pir/core/ir_context.h"

 namespace paddle {
 namespace translator {

 using TypeTranslateFn =
-    std::function<ir::Type(ir::IrContext*, const VarDesc&)>;
+    std::function<pir::Type(pir::IrContext*, const VarDesc&)>;

 class TypeTranslator {
 public:
diff --git a/paddle/fluid/ir_adaptor/translator/utils.cc b/paddle/fluid/ir_adaptor/translator/utils.cc
index 38f3f5fd8c90b..4a591eeedf083 100644
--- a/paddle/fluid/ir_adaptor/translator/utils.cc
+++ b/paddle/fluid/ir_adaptor/translator/utils.cc
@@ -16,43 +16,43 @@

 #include

-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h"
 #include "paddle/fluid/ir_adaptor/translator/op_translator.h"
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/enforce.h"
-#include "paddle/ir/core/utils.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/utils.h"

 namespace paddle {
 namespace translator {

-ir::Operation* InsertSliceOperationForTarget(
-    ir::IrContext* ctx,
+pir::Operation* InsertSliceOperationForTarget(
+    pir::IrContext* ctx,
     TranslationContext* param_map,
-    ir::Program* program,
+    pir::Program* program,
     const VariableDefiningInfo& defining_info,
     const std::string& arg_name) {
-  std::string slice_op_name(ir::SliceOp::name());
-  ir::OpInfo op_info = ctx->GetRegisteredOpInfo(slice_op_name);
-  std::unordered_map<std::string, ir::Attribute> op_attribute_map = {
-      {"index", ir::Int32Attribute::get(ctx, defining_info.idx_in_vector)},
+  std::string slice_op_name(pir::SliceOp::name());
+  pir::OpInfo op_info = ctx->GetRegisteredOpInfo(slice_op_name);
+  std::unordered_map<std::string, pir::Attribute> op_attribute_map = {
+      {"index", pir::Int32Attribute::get(ctx, defining_info.idx_in_vector)},
   };
-  ir::VectorType src_vec_type =
-      defining_info.value.type().dyn_cast<ir::VectorType>();
-  ir::Operation* operation =
-      ir::Operation::Create({defining_info.value},
-                            op_attribute_map,
-                            {src_vec_type[defining_info.idx_in_vector]},
-                            op_info);
+  pir::VectorType src_vec_type =
+      defining_info.value.type().dyn_cast<pir::VectorType>();
+  pir::Operation* operation =
+      pir::Operation::Create({defining_info.value},
+                             op_attribute_map,
+                             {src_vec_type[defining_info.idx_in_vector]},
+                             op_info);
   program->block()->push_back(operation);
-  ir::OpResult target_op_result = operation->result(0);
+  pir::OpResult target_op_result = operation->result(0);
   (*param_map)[arg_name] = VariableDefiningInfo(target_op_result);
   return operation;
 }

 std::ostream& operator<<(std::ostream& os,
                          const std::vector<std::string>& vec_str) {
-  ir::PrintInterleave(
+  pir::PrintInterleave(
       vec_str.begin(),
       vec_str.end(),
       [&os](std::string s) { os << s; },
@@ -61,7 +61,7 @@ std::ostream& operator<<(std::ostream& os,
 }

 std::vector<std::string> CheckUnregisteredOperationInBlock(
-    ir::IrContext* ctx, const framework::BlockDesc& block) {
+    pir::IrContext* ctx, const framework::BlockDesc& block) {
   auto& op_translator = OpTranslator::instance();
   std::vector<std::string> unregistered_ops;
   for (auto op : block.AllOps()) {
@@ -71,7 +71,7 @@ std::vector<std::string> CheckUnregisteredOperationInBlock(
     OpTranscriber general_handler;
     try {
       general_handler.LoopkUpOpInfo(ctx, *op);
-    } catch (ir::IrNotMetException& e) {
+    } catch (pir::IrNotMetException& e) {
       unregistered_ops.push_back(op->Type());
     }
   }
@@ -79,8 +79,8 @@ std::vector<std::string> CheckUnregisteredOperationInBlock(
 }

 std::vector<std::string> CheckUnregisteredOperation(
-    ir::IrContext* ctx, const framework::ProgramDesc& legacy_program) {
-  ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
+    pir::IrContext* ctx, const framework::ProgramDesc& legacy_program) {
+  ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
   std::vector<std::string> unregistered_ops;
   for (size_t block_idx = 0; block_idx < legacy_program.Size(); block_idx++) {
diff --git a/paddle/fluid/ir_adaptor/translator/utils.h b/paddle/fluid/ir_adaptor/translator/utils.h
index 20e462b5bbde1..63bbde06d2ec0 100644
--- a/paddle/fluid/ir_adaptor/translator/utils.h
+++ b/paddle/fluid/ir_adaptor/translator/utils.h
@@ -19,17 +19,17 @@

 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/ir_adaptor/translator/program_translator.h"
-#include "paddle/ir/core/ir_context.h"
-#include "paddle/ir/core/operation.h"
-#include "paddle/ir/core/program.h"
+#include "paddle/pir/core/ir_context.h"
+#include "paddle/pir/core/operation.h"
+#include "paddle/pir/core/program.h"

 namespace paddle {
 namespace translator {

-ir::Operation* InsertSliceOperationForTarget(
-    ir::IrContext* ctx,
+pir::Operation* InsertSliceOperationForTarget(
+    pir::IrContext* ctx,
     TranslationContext* param_map,
-    ir::Program* program,
+    pir::Program* program,
     const VariableDefiningInfo& defining_info,
     const std::string& arg_name);

@@ -37,7 +37,7 @@ std::ostream& operator<<(std::ostream& os,
                          const std::vector<std::string>& vec_str);

 std::vector<std::string> CheckUnregisteredOperation(
-    ir::IrContext* ctx, const framework::ProgramDesc& legacy_program);
+    pir::IrContext* ctx, const framework::ProgramDesc& legacy_program);

 }  // namespace translator
 }  // namespace paddle
diff --git a/paddle/fluid/jit/engine/interpreter_engine.cc b/paddle/fluid/jit/engine/interpreter_engine.cc
index 23cb3ee8b5a20..9c5f7b20d9fd6 100644
--- a/paddle/fluid/jit/engine/interpreter_engine.cc
+++ b/paddle/fluid/jit/engine/interpreter_engine.cc
@@ -20,9 +20,9 @@
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/new_executor/interpretercore.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/ir/core/program.h"
-#include "paddle/ir/core/value.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/pir/core/program.h"
+#include "paddle/pir/core/value.h"

 namespace paddle {
 namespace jit {
diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc
index fc23dbf88064c..0700028807fc0 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_context.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc
@@ -42,9 +42,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
-#include "paddle/ir/core/program.h"
-#include "paddle/ir/core/value.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/pir/core/program.h"
+#include "paddle/pir/core/value.h"
 #include "paddle/utils/string/string_helper.h"

 namespace paddle {
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h
index 2913da9bc5c39..02e70c549cfc2 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op.h
+++ b/paddle/fluid/operators/cinn/cinn_launch_op.h
@@ -29,9 +29,9 @@
 #include "paddle/fluid/operators/cinn/cinn_launch_context.h"
 #include "paddle/fluid/operators/cinn/cinn_op_helper.h"
 #include "paddle/fluid/platform/profiler.h"
-#include "paddle/ir/core/program.h"
-#include "paddle/ir/core/value.h"
 #include "paddle/phi/core/flags.h"
+#include "paddle/pir/core/program.h"
+#include "paddle/pir/core/value.h"

 PHI_DECLARE_bool(enable_pe_launch_cinn);
 PHI_DECLARE_bool(enable_interpretercore_launch_cinn);
diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu
index f63baadbde526..efac6332c6d29 100644
--- a/paddle/fluid/operators/class_center_sample_op.cu
+++ b/paddle/fluid/operators/class_center_sample_op.cu
@@ -30,6 +30,7 @@ namespace cub = hipcub;
 #include

 #include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/tensor_utils.h"

@@ -37,6 +38,9 @@ namespace cub = hipcub;
 #include "paddle/fluid/distributed/collective/process_group.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -364,21 +368,47 @@ void ClassCenterSampleKernel(const Context& dev_ctx,
         auto task = pg->AllReduce(in_tensor, out_tensor, opts);
         task->Wait();
       } else {
-        const auto& comm = paddle::platform::NCCLCommContext::Instance().Get(
-            ring_id, dev_ctx.GetPlace());
+        paddle::platform::NCCLComm* comm = nullptr;
+        phi::distributed::NCCLCommContext* comm_ctx = nullptr;
         // use global calculate stream
-        const auto calcu_stream =
+        auto stream =
             static_cast<GPUContext*>(
                 phi::DeviceContextPool::Instance().Get(dev_ctx.GetPlace()))
                 ->stream();
-        PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
-            num_classes_per_device_ptr,
-            num_classes_per_device_ptr,
-            num_classes_per_device.numel(),
-            phi::ToNCCLDataType(num_classes_per_device.dtype()),
-            ncclSum,
-            comm->comm(),
-            calcu_stream));
+        const auto& comm_context_manager =
+            phi::distributed::CommContextManager::GetInstance();
+        if (FLAGS_dynamic_static_unified_comm) {
+          PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                            true,
+                            errors::InvalidArgument(
+                                "You choose to use new communication library by "
+                                "setting environment "
+                                "variable FLAGS_dynamic_static_unified_comm "
+                                "True. But ring_id(%d) is "
+                                "not found in comm_context_manager.",
+                                std::to_string(ring_id)));
+          comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+              comm_context_manager.Get(std::to_string(ring_id)));
+          stream = comm_ctx->GetStream();
+        } else {
+          comm = paddle::platform::NCCLCommContext::Instance().Get(
+              ring_id, dev_ctx.GetPlace());
+        }
+
+        if (comm_ctx) {
+          comm_ctx->AllReduce(
+              &num_classes_per_device, num_classes_per_device, ncclSum, stream);
+          paddle::platform::GpuStreamSync(stream);
+        } else {
+          PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(
+              num_classes_per_device_ptr,
+              num_classes_per_device_ptr,
+              num_classes_per_device.numel(),
+              phi::ToNCCLDataType(num_classes_per_device.dtype()),
+              ncclSum,
+              comm->comm(),
+              stream));
+        }
       }
     }
 #endif
diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
index 3c70b997a7fd8..344dcd36e5235 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"

 #include "paddle/fluid/framework/eigen.h"
@@ -26,6 +27,12 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/softmax_impl.h"

+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
+#endif
+
 namespace paddle {
 namespace operators {

@@ -136,13 +143,41 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     const int rank = ctx.Attr<int>("rank");

     const auto& place = ctx.GetPlace();
-    const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();

-    // use global calculate stream
-    const auto stream = static_cast<phi::GPUContext*>(
-                            platform::DeviceContextPool::Instance().Get(place))
-                            ->stream();
+    gpuStream_t stream = nullptr;
+    platform::NCCLComm* comm = nullptr;
+    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+
+    const auto& comm_context_manager =
+        phi::distributed::CommContextManager::GetInstance();
+
+    if (FLAGS_dynamic_static_unified_comm) {
+      PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "You choose to use new communication library by "
+                            "setting environment "
+                            "variable FLAGS_dynamic_static_unified_comm True. "
+                            "But ring_id(%d) is "
+                            "not found in comm_context_manager.",
+                            std::to_string(rid)));
+      comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+          comm_context_manager.Get(std::to_string(rid)));
+      PADDLE_ENFORCE_NE(comm_ctx,
+                        nullptr,
+                        platform::errors::Unavailable(
+                            "NCCLCommContext is nullptr, collective op should "
+                            "have ring_id attr."));
+
+      stream = comm_ctx->GetStream();
+      VLOG(3) << "new comm_context_manager has ring_id " << rid;
+    } else {  // old comm_context
+      comm = platform::NCCLCommContext::Instance().Get(rid, place);
+
+      stream = comm->stream();
+      VLOG(3) << "old NCCLCommContext has ring_id " << rid;
+    }

     // allocate memory on device.
     softmax->mutable_data<T>(place);
@@ -166,21 +201,27 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {

     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
-    void* logits_max_buff = logits_max.mutable_data<T>(place);

     auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-        logits_max_buff,
-        logits_max_buff,
-        logits_max.numel(),
-        platform::ToNCCLDataType(
-            framework::TransToProtoVarType(logits_max.dtype())),
-        ncclMax,
-        comm->comm(),
-        stream));
+
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream);
+    } else {
+      void* logits_max_buff = logits_max.mutable_data<T>(place);
+
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+          logits_max_buff,
+          logits_max_buff,
+          logits_max.numel(),
+          platform::ToNCCLDataType(
+              framework::TransToProtoVarType(logits_max.dtype())),
+          ncclMax,
+          comm->comm(),
+          stream));
+    }

     // step 2, obtain logit - logit_max
     Eigen::DSizes<int, 2> batch_by_one(N, 1);
@@ -230,39 +271,47 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
           nranks);
     }

-    void* predict_logits_buff = predicted_logits.mutable_data<T>(place);
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-        predict_logits_buff,
-        predict_logits_buff,
-        predicted_logits.numel(),
-        platform::ToNCCLDataType(
-            framework::TransToProtoVarType(predicted_logits.dtype())),
-        ncclSum,
-        comm->comm(),
-        stream));
-
-    // step 4, obtain exp(logit)
+    predicted_logits.mutable_data<T>(place);
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream);
+    } else {
+      void* predict_logits_buff = predicted_logits.data<T>();
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+          predict_logits_buff,
+          predict_logits_buff,
+          predicted_logits.numel(),
+          platform::ToNCCLDataType(
+              framework::TransToProtoVarType(predicted_logits.dtype())),
+          ncclSum,
+          comm->comm(),
+          stream));
+    }
     eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();

     // step 5, obtain sum_exp_logits
     phi::DenseTensor sum_exp_logits;
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
-    void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
+    sum_exp_logits.mutable_data<T>(place);

     auto eigen_sum_exp_logits =
         phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);
-    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
-        sum_exp_logits_buff,
-        sum_exp_logits_buff,
-        sum_exp_logits.numel(),
-        platform::ToNCCLDataType(
-            framework::TransToProtoVarType(sum_exp_logits.dtype())),
-        ncclSum,
-        comm->comm(),
-        stream));
+    if (comm_ctx) {
+      comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream);
+    } else {
+      void* sum_exp_logits_buff = sum_exp_logits.data<T>();
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+          sum_exp_logits_buff,
+          sum_exp_logits_buff,
+          sum_exp_logits.numel(),
+          platform::ToNCCLDataType(
+              framework::TransToProtoVarType(sum_exp_logits.dtype())),
+          ncclSum,
+          comm->comm(),
+          stream));
+    }

     if (label_type == framework::proto::VarType::INT32) {
       CaculateLoss
diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc
index 3136ac21ab764..45d91dc724108 100644
--- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc
+++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc
@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/global_scatter_op.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+
+#include "paddle/fluid/distributed/collective/utils.h"
+#include "paddle/fluid/framework/convert_utils.h"

 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#include "paddle/phi/core/distributed/nccl_comm_context.h"
+#include "paddle/phi/core/flags.h"
+PHI_DECLARE_bool(dynamic_static_unified_comm);
 #endif
-#include "paddle/fluid/framework/convert_utils.h"

 namespace paddle {
 namespace operators {
@@ -78,15 +84,48 @@ struct GlobalScatterFunctor<phi::GPUContext, T> {
                           ring_id));
     auto place = ctx.GetPlace();
-    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
     gpuStream_t stream = nullptr;
+    platform::NCCLComm* comm = nullptr;
+    phi::distributed::NCCLCommContext* comm_ctx = nullptr;
+    int nranks = 0;
+
+    const auto& comm_context_manager =
+        phi::distributed::CommContextManager::GetInstance();
+
+    if (FLAGS_dynamic_static_unified_comm) {
+      PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(ring_id)),
+                        true,
+                        platform::errors::InvalidArgument(
+                            "You choose to use new communication library by "
+                            "setting environment "
+                            "variable FLAGS_dynamic_static_unified_comm True. "
+                            "But ring_id(%d) is "
+                            "not found in comm_context_manager.",
+                            std::to_string(ring_id)));
+      comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
+          comm_context_manager.Get(std::to_string(ring_id)));
+      PADDLE_ENFORCE_NE(comm_ctx,
+                        nullptr,
+                        platform::errors::Unavailable(
+                            "NCCLCommContext is nullptr, collective op should "
+                            "have ring_id attr."));
+
+      stream = comm_ctx->GetStream();
+      nranks = comm_ctx->GetSize();
+      VLOG(3) << "new comm_context_manager has ring_id " << ring_id;
+    } else {  // old comm_context
+      comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+
+      stream = comm->stream();
+      nranks = comm->nranks();
+      VLOG(3) << "old NCCLCommContext has ring_id " << ring_id;
+    }
+
     if (ctx.Attr<bool>("use_calc_stream")) {
       // should use the calc stream from the ExecutionContext
stream = ctx.cuda_device_context().stream(); - } else { - stream = comm->stream(); } - int nranks = comm->nranks(); + auto in_feat = x->dims()[1]; auto n_expert = local_count->dims()[0] / nranks; int64_t fwd_count = 0; @@ -103,34 +142,62 @@ struct GlobalScatterFunctor { } auto recv_ptr = 0; - auto send_buf = x->data(); - auto recv_buf = out->mutable_data(out_dims, place); + out->mutable_data(out_dims, place); - for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); - for (auto j = 0; j < nranks; ++j) { - int idx = i + j * n_expert; - if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclSend(send_buf + expert_ptr[idx] * in_feat, - cpu_local_count_data[idx] * in_feat, - dtype, - j, - comm->comm(), - stream)); + if (comm_ctx) { + for (auto i = 0; i < n_expert; ++i) { + comm_ctx->GroupStart(); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_local_count_data[idx]) { + auto send_buf = distributed::GetPartialTensor( + *x, + expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat); + + comm_ctx->Send( + send_buf, cpu_local_count_data[idx] * in_feat, j, stream); + } + if (cpu_global_count_data[idx]) { + auto recv_buf = distributed::GetPartialTensor( + *out, recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat); + comm_ctx->Recv( + &recv_buf, cpu_global_count_data[idx] * in_feat, j, stream); + recv_ptr += cpu_global_count_data[idx]; + } } - if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclRecv(recv_buf + recv_ptr * in_feat, - cpu_global_count_data[idx] * in_feat, - dtype, - j, - comm->comm(), - stream)); - recv_ptr += cpu_global_count_data[idx]; + comm_ctx->GroupEnd(); + } + } else { + auto send_buf = x->data(); + auto recv_buf = out->data(); + + for (auto i = 0; i < n_expert; ++i) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto j = 0; j < nranks; ++j) { + int idx = i + j * n_expert; + if (cpu_local_count_data[idx]) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + send_buf + expert_ptr[idx] * in_feat, + cpu_local_count_data[idx] * in_feat, + dtype, + j, + comm->comm(), + stream)); + } + if (cpu_global_count_data[idx]) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + recv_buf + recv_ptr * in_feat, + cpu_global_count_data[idx] * in_feat, + dtype, + j, + comm->comm(), + stream)); + recv_ptr += cpu_global_count_data[idx]; + } } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } #else diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index d22fd70bd0f61..cf353c12ffa49 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -18,8 +18,14 @@ limitations under the License. 
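Every collective kernel touched by this patch selects between the two communication stacks the same way before doing any transport work. The shape of that selection, distilled into an illustrative helper (the flag, manager, and context types are the real APIs used above; the helper itself and its name are not part of the patch, and error handling is trimmed):

    // Sketch only: centralizes the new-vs-legacy comm selection shown in
    // global_scatter_op.cu.cc above.
    struct CommHandles {
      platform::NCCLComm* comm = nullptr;                     // legacy stack
      phi::distributed::NCCLCommContext* comm_ctx = nullptr;  // unified stack
      gpuStream_t stream = nullptr;
      int nranks = 0;
    };

    CommHandles SelectComm(int ring_id, const phi::Place& place) {
      CommHandles h;
      const auto& mgr = phi::distributed::CommContextManager::GetInstance();
      if (FLAGS_dynamic_static_unified_comm) {
        // New path: the manager owns one context per ring id.
        h.comm_ctx = static_cast<phi::distributed::NCCLCommContext*>(
            mgr.Get(std::to_string(ring_id)));
        h.stream = h.comm_ctx->GetStream();
        h.nranks = h.comm_ctx->GetSize();
      } else {
        // Legacy path: the singleton keyed by (ring_id, place).
        h.comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
        h.stream = h.comm->stream();
        h.nranks = h.comm->nranks();
      }
      return h;
    }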
*/ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif +#include "paddle/fluid/distributed/collective/utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" + namespace paddle { namespace operators { @@ -38,17 +44,57 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { int rank = ctx.Attr("rank"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + gpuStream_t stream = nullptr; + + platform::NCCLComm* comm = nullptr; + phi::distributed::NCCLCommContext* comm_ctx = nullptr; + + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + int real_nranks = 0; + int real_rank = 0; + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ(comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You choose to use new communication library by " + "setting environment " + "variable FLAGS_dynamic_static_unified_comm True. " + "But ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE(comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr, collective op should " + "has ring_id attr.")); + + stream = comm_ctx->GetStream(); + real_nranks = comm_ctx->GetSize(); + real_rank = comm_ctx->GetRank(); + VLOG(3) << "new comm_context_manager has ring_id " << rid; + } else { // old comm_context + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + stream = comm->stream(); + real_nranks = comm->nranks(); + real_rank = comm->rank(); + VLOG(3) << "old NCCLCommContext has ring_id " << rid; + } PADDLE_ENFORCE_EQ( nranks, - comm->nranks(), + real_nranks, platform::errors::InvalidArgument( - "nranks: %s should equal to %s", nranks, comm->nranks())); + "nranks: %s should equal to %s", nranks, real_nranks)); PADDLE_ENFORCE_EQ(rank, - comm->rank(), + real_rank, platform::errors::InvalidArgument( - "rank: %s should equal to %s", rank, comm->rank())); + "rank: %s should equal to %s", rank, real_rank)); + PADDLE_ENFORCE_EQ( (numel % nranks), 0, @@ -70,24 +116,26 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { auto task = pg->AllGather(out, *in, offset, send_numel, /*sync_op*/ true); task->Wait(); } else { - const T* send_buff = in->data() + offset; - T* recv_buff = out->data(); - - gpuStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { // should ExecutionContext for calc stream. 
stream = ctx.cuda_device_context().stream(); - } else { - stream = comm->stream(); } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclAllGather(send_buff, - recv_buff, - send_numel, - static_cast<ncclDataType_t>(dtype), - comm->comm(), - stream)); + if (comm_ctx) { + auto send_buf = distributed::GetPartialTensor(*in, offset, send_numel); + + comm_ctx->AllGather(out, send_buf, stream); + } else { + const T* send_buff = in->data<T>() + offset; + T* recv_buff = out->data<T>(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclAllGather(send_buff, + recv_buff, + send_numel, + static_cast<ncclDataType_t>(dtype), + comm->comm(), + stream)); + } } #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index 4f9fc41bc4e16..67089a18c8e4f 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -18,8 +18,14 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#include "paddle/phi/core/distributed/nccl_comm_context.h" +#include "paddle/phi/core/flags.h" +PHI_DECLARE_bool(dynamic_static_unified_comm); #endif + +#include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/core/distributed/comm_context_manager.h" namespace paddle { namespace operators { @@ -75,33 +81,82 @@ class PartialSendCUDAKernel : public framework::OpKernel<T> { } else { gpuStream_t stream = nullptr; auto place = ctx.GetPlace(); - auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + + platform::NCCLComm* comm = nullptr; + phi::distributed::NCCLCommContext* comm_ctx = nullptr; + int nranks = 0; + int rank = 0; + + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + + if (FLAGS_dynamic_static_unified_comm) { + PADDLE_ENFORCE_EQ( + comm_context_manager.Has(std::to_string(rid)), + true, + platform::errors::InvalidArgument( + "You have chosen to use the new communication " + "library by setting the environment variable " + "FLAGS_dynamic_static_unified_comm to True, " + "but ring_id(%d) is " + "not found in comm_context_manager.", + std::to_string(rid))); + comm_ctx = static_cast<phi::distributed::NCCLCommContext*>( + comm_context_manager.Get(std::to_string(rid))); + PADDLE_ENFORCE_NE( + comm_ctx, + nullptr, + platform::errors::Unavailable( + "NCCLCommContext is nullptr; a collective op " + "should have a ring_id attr.")); + + stream = comm_ctx->GetStream(); + nranks = comm_ctx->GetSize(); + rank = comm_ctx->GetRank(); + + VLOG(3) << "new comm_context_manager has ring_id " << rid; + } else { // old comm_context + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + stream = comm->stream(); + nranks = comm->nranks(); + rank = comm->rank(); + + VLOG(3) << "old NCCLCommContext has ring_id " << rid; + } + if (ctx.Attr<bool>("use_calc_stream")) { // should use the calc stream from the ExecutionContext.
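With the unified context, the offset arithmetic moves out of the NCCL call: distributed::GetPartialTensor builds a tensor view over [offset, offset + numel) and the context API consumes it directly. The allgather case reduces to two lines, as a sketch reusing only calls present in this diff (in, out, offset, send_numel, comm_ctx, and stream as prepared in the kernel above):

    // Sketch only: gather send_numel elements starting at `offset` from every
    // rank into `out`, replacing the raw ncclAllGather pointer arithmetic.
    auto send_buf = distributed::GetPartialTensor(*in, offset, send_numel);
    comm_ctx->AllGather(out, send_buf, stream);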
stream = ctx.cuda_device_context().stream(); - } else { - stream = comm->stream(); } + PADDLE_ENFORCE_LT(peer, - comm->nranks(), + nranks, platform::errors::InvalidArgument( "The value of peer (%d) you set must " - "be less than comm->nranks (%d).", + "be less than ranks (%d).", peer, - comm->nranks())); + nranks)); ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::ncclSend(x->data() + offset, - send_numel, - dtype, - peer, - comm->comm(), - stream)); - VLOG(3) << "rank " << comm->rank() << " send " << send_numel - << " from offset[" << offset << "] to " << peer; + if (comm_ctx) { + auto send_buf = distributed::GetPartialTensor(*x, offset, send_numel); + + comm_ctx->Send(send_buf, send_numel, peer, stream); + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::ncclSend(x->data() + offset, + send_numel, + dtype, + peer, + comm->comm(), + stream)); + } + + VLOG(3) << "rank " << rank << " send " << send_numel << " from offset[" + << offset << "] to " << peer; } #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 03e6527a58a73..b08f26a75f68c 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -3,13 +3,17 @@ if(WITH_UNITY_BUILD) # Load Unity Build rules for operators in paddle/fluid/operators/controlflow. include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES conditional_block_op DEPS naive_executor +register_operators(EXCLUDES conditional_block_op pylayer_op DEPS naive_executor standalone_executor) cc_library( conditional_block_op SRCS conditional_block_op.cc DEPS executor standalone_executor) +cc_library( + pylayer_op + SRCS pylayer_op.cc + DEPS standalone_executor) cc_library( op_variant SRCS op_variant.cc @@ -18,6 +22,10 @@ cc_library( conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS op_variant operator conditional_block_op) +cc_library( + pylayer_op_helper + SRCS pylayer_op_helper.cc + DEPS op_variant operator pylayer_op) cc_library( recurrent_op_helper SRCS recurrent_op_helper.cc @@ -28,7 +36,8 @@ cc_library( DEPS op_variant operator) if(WITH_UNITY_BUILD) - target_link_libraries(paddle_operators_controlflow_unity conditional_block_op) + target_link_libraries(paddle_operators_controlflow_unity conditional_block_op + pylayer_op) else() target_link_libraries(conditional_block_infer_op conditional_block_op) endif() diff --git a/paddle/fluid/operators/controlflow/pylayer_op.cc b/paddle/fluid/operators/controlflow/pylayer_op.cc index eef62289d76f5..fe05f47707445 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op.cc +++ b/paddle/fluid/operators/controlflow/pylayer_op.cc @@ -51,14 +51,35 @@ void PyLayerOp::CreateInterpreter( dev_place, block, cur_scope, execution_config)); VLOG(10) << "[interpreterCore] created:" << core_; } else { - // NOTE: Borrowed from - // `paddle/fluid/operators/controlflow/control_flow_op_helper.h` - // TODO(MarioLulab): Add PyLayer Helper ? 
BuildScopeForControlFlowOp(*core_, block, cur_scope); core_->reset_scope(cur_scope); } } +class PyLayerForwardOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput(PyLayerOp::kInputs, "The input variables of the sub-block.") + .AsDuplicable(); + AddOutput(PyLayerOp::kOutputs, "The output variables of the sub-block.") + .AsDuplicable(); + AddOutput( + PyLayerOp::kScope, + "(std::vector) The scope of static pylayer block, used for " + "passing intermediate variables between forward and backward."); + AddAttr>( + "blocks", + "The blocks of PyLayer operator where blocks[0] indicates the forward " + "block and blocks[1] indicates the backward block."); + AddComment(R"DOC(PyLayer operator + +The PyLayer Operator is designed to support `@to_static` for `PyLayer in Dynamic Graph`. + + +)DOC"); + } +}; + class PyLayerForwardOp : public PyLayerOp { public: PyLayerForwardOp(const std::string &type, @@ -109,7 +130,7 @@ class PyLayerForwardOp : public PyLayerOp { class PyLayerForwardInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { - // TODO(MarioLulab): do nothing. + // NOTE(MarioLulab): do nothing. } }; diff --git a/paddle/fluid/operators/controlflow/pylayer_op.h b/paddle/fluid/operators/controlflow/pylayer_op.h index e06daad78041d..afbb2fd151a40 100644 --- a/paddle/fluid/operators/controlflow/pylayer_op.h +++ b/paddle/fluid/operators/controlflow/pylayer_op.h @@ -49,27 +49,5 @@ class PyLayerOp : public framework::OperatorBase { protected: mutable std::shared_ptr core_{nullptr}; }; - -class PyLayerForwardOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(PyLayerOp::kInputs, "The input variables of the sub-block.") - .AsDuplicable(); - AddOutput(PyLayerOp::kOutputs, "The output variables of the sub-block.") - .AsDuplicable(); - // TODO(MarioLulab): Must Use std::vector here ? - AddOutput(PyLayerOp::kScope, - "(std::vector) The scope of static pylayer block."); - AddAttr>( - "blocks", "The blocks of PyLayer operator"); - AddComment(R"DOC(PyLayer operator - -TO-DO: added by luqi - - -)DOC"); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/pylayer_op_helper.cc b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc new file mode 100644 index 0000000000000..dabe561eea3e7 --- /dev/null +++ b/paddle/fluid/operators/controlflow/pylayer_op_helper.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
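Packing both sub-blocks into one "blocks" attribute means consumers index into it instead of carrying separate forward/backward attributes. A hedged sketch of reading it back from an OpDesc follows (the helper and its name are illustrative and not part of the patch; PADDLE_GET_CONST is assumed to be the stock variant accessor):

    // Sketch only: fetch the PyLayer sub-blocks, where blocks[0] is the
    // forward block and blocks[1] is the backward block.
    const framework::BlockDesc* PyLayerBackwardBlock(
        const framework::OpDesc& op) {
      auto blocks = PADDLE_GET_CONST(std::vector<framework::BlockDesc*>,
                                     op.GetAttr("blocks"));
      PADDLE_ENFORCE_EQ(blocks.size(),
                        2UL,
                        platform::errors::InvalidArgument(
                            "PyLayer expects 2 blocks, got %d.", blocks.size()));
      return blocks[1];
    }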
+ +#include "paddle/fluid/operators/controlflow/pylayer_op_helper.h" + +#include + +namespace paddle { +namespace framework { +class ProgramDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +static bool IsMatchedPyLayerOpAndPyLayerGradOp(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + return fwd_op.Outputs().at(PyLayerOp::kScope) == + bwd_op.Inputs().at(PyLayerOp::kScope); +} + +static void FindAllPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + std::vector *fwd_ops, + std::vector *bwd_ops) { + PADDLE_ENFORCE_GE( + fwd_ops->size(), + bwd_ops->size(), + platform::errors::InvalidArgument( + "Size of forward ops must be greater or equal to backward ops. The " + "number of forward ops is %d and the number of backward ops is %d", + fwd_ops->size(), + bwd_ops->size())); + + for (size_t i = 1; i < program.Size(); ++i) { + auto &block = program.Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "pylayer") { + fwd_ops->emplace_back(op); + } else if (op->Type() == "pylayer_grad") { + bwd_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE( + fwd_ops->size(), + bwd_ops->size(), + platform::errors::InvalidArgument( + "There are more pylayer_grad ops than " + "pylayer ops in the graph or program. The number of " + "forward ops is %d and the number of backward ops is %d", + fwd_ops->size(), + bwd_ops->size())); +} + +static void SetSkipVarsForPyLayerOp(OpVariant *fwd_op, OpVariant *bwd_op) { + auto *grad_block = bwd_op->Attr("backward_block"); + auto is_skippable_in_fwd = [grad_block](const std::string &var_name) { + return var_name != framework::kEmptyVarName && + !grad_block->HasVar(var_name); + }; + + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (is_skippable_in_fwd(in_arg_name)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (is_skippable_in_fwd(out_arg_name)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + auto &fwd_attrs = const_cast(fwd_op->Attrs()); + std::vector skip_vars_vec(forward_skip_vars.begin(), + forward_skip_vars.end()); + VLOG(2) << "Prepare to skip " << skip_vars_vec.size() + << " var(s): " << string::join_strings(skip_vars_vec, ' '); + fwd_attrs[PyLayerOp::kSkipEagerDeletionVars] = std::move(skip_vars_vec); +} + +static void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + std::vector *pylayer_ops, + std::vector *pylayer_grad_ops) { + FindAllPyLayerOpAndPyLayerGradOp(program, pylayer_ops, pylayer_grad_ops); + + VLOG(2) << "Found pylayer op num: " << pylayer_ops->size() + << ", pylayer_grad op num: " << pylayer_grad_ops->size(); + + if (pylayer_grad_ops->empty()) { + return; + } + + std::unordered_set pylayer_op_set( + pylayer_ops->begin(), pylayer_ops->end()); + + for (auto &bwd_op : *pylayer_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : pylayer_op_set) { + if (IsMatchedPyLayerOpAndPyLayerGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE_EQ(matched_fwd_op, + nullptr, + platform::errors::PreconditionNotMet( + "Found multiple matched pylayer ops.")); + matched_fwd_op = &fwd_op; + } + } + + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + platform::errors::PreconditionNotMet( + "Cannot find matched forward pylayer op.")); + + SetSkipVarsForPyLayerOp(const_cast(matched_fwd_op), &bwd_op); + 
pylayer_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + int block_id, + const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops) { + // If block_id is not 0, return immediately. + // This is because all pylayer_ops and pylayer_grad_ops + // in the whole program are processed when block_id is 0 (i.e., + // when Executor::Run() is called or a ParallelExecutor is constructed). + + // What's more, all pylayer_ops and pylayer_grad_ops + // must be processed when block_id is zero. If not, pylayer_op + // may run first and erase variables used in pylayer_grad_op + // at a moment when the pylayer_grad_ops may not have been constructed yet. + if (block_id != 0) return; + + std::vector<OpVariant> fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "pylayer") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "pylayer_grad") { + bwd_ops.emplace_back(op.get()); + } + } + + PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + program, &fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + const std::vector<OpVariant> &pylayer_ops, + const std::vector<OpVariant> &pylayer_grad_ops) { + std::vector<OpVariant> fwd_ops = pylayer_ops; + std::vector<OpVariant> bwd_ops = pylayer_grad_ops; + + PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + program, &fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/pylayer_op_helper.h b/paddle/fluid/operators/controlflow/pylayer_op_helper.h new file mode 100644 index 0000000000000..1295a6cba60a0 --- /dev/null +++ b/paddle/fluid/operators/controlflow/pylayer_op_helper.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include <string> +#include <vector> + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" +#include "paddle/fluid/operators/controlflow/pylayer_op.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +class ProgramDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + int block_id, + const std::vector<std::unique_ptr<framework::OperatorBase>> &all_ops); + +void PrepareSafeEagerDeletionOnPyLayerOpAndPyLayerGradOp( + const framework::ProgramDesc &program, + const std::vector<OpVariant> &pylayer_ops, + const std::vector<OpVariant> &pylayer_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 6c949045ef212..778e6ed277fd7 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -242,14 +242,14 @@ For case 2 (assume that the shape of $Y$ is a continuous subsequence of $X$ ): For example: - ..
code-block:: python - - shape(X) = (2, 3, 4, 5), shape(Y) = (,) - shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 - shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 - shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 + .. code-block:: text + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 The inputs $X$ and $Y$ can carry the different LoD information. diff --git a/paddle/fluid/operators/fused/scaled_dp_attention.h b/paddle/fluid/operators/fused/scaled_dp_attention.h index c0f3d7fee0f30..016c3995d7383 100644 --- a/paddle/fluid/operators/fused/scaled_dp_attention.h +++ b/paddle/fluid/operators/fused/scaled_dp_attention.h @@ -227,7 +227,6 @@ void softmax_sum_max(float* AB, float refac, int m, int k) { - assert(k % 16 == 0); float max_val = std::numeric_limits::lowest(); __m512 vrefac = _mm512_set1_ps(refac); for (int i = 0; i < m; ++i) { @@ -290,7 +289,6 @@ void update_out_blk(float* output, float* max, int m, int n) { - assert(n % 16 == 0); for (int i = 0; i < m; ++i) { const float* buf = exp_ABC + i * n; float* outbuf = output + i * n; @@ -298,10 +296,12 @@ void update_out_blk(float* output, merr = vexp(merr); __m512 vfac = _mm512_set1_ps(pre_sum[i] / sum[i]); for (int off = 0; off < n; off += 16) { - __m512 vout = _mm512_loadu_ps(outbuf + off); - __m512 vabc = _mm512_loadu_ps(buf + off); + int remain = n - off; + __mmask16 mask = (remain >= 16 ? 
0xffff : (1 << remain) - 1); + __m512 vout = _mm512_maskz_loadu_ps(mask, outbuf + off); + __m512 vabc = _mm512_maskz_loadu_ps(mask, buf + off); __m512 vupt = vout * merr * vfac + vabc; - _mm512_storeu_ps(outbuf + off, vupt); + _mm512_mask_storeu_ps(outbuf + off, mask, vupt); } pre_sum[i] = sum[i]; pre_max[i] = max[i]; @@ -348,8 +348,6 @@ void scaled_dp_attention(const float* query, int iblk = std::min(512, itsize / 1); int oblk = std::min(512, otsize / 1); float refac = scale; - assert(itsize % iblk == 0); - assert(otsize % oblk == 0); #ifdef PADDLE_WITH_MKLML int nth = omp_get_max_threads(); diff --git a/paddle/fluid/operators/fused/self_dp_attention_op.cc b/paddle/fluid/operators/fused/self_dp_attention_op.cc index 04c7424a80dc5..bf0f59865c8ab 100644 --- a/paddle/fluid/operators/fused/self_dp_attention_op.cc +++ b/paddle/fluid/operators/fused/self_dp_attention_op.cc @@ -30,13 +30,6 @@ void SelfDPAttenOp::InferShape(framework::InferShapeContext* ctx) const { "[batchsize, tokensize, 3, nhead, headsize] " ", but now Input X dim is:[%s] ", dim_input)); - PADDLE_ENFORCE_EQ(dim_input[4] % 16, - 0, - platform::errors::InvalidArgument( - "The last dim of input X should be a multiple of 16, " - ", but now the dim is:[%d] " - "Please remove self_attention_fuse_pass from the lists", - dim_input[4])); framework::DDim out_dims( {dim_input[0], dim_input[1], dim_input[3], dim_input[4]}); ctx->SetOutputDim("Out", out_dims); diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt index af346e402bf83..dc88ea0b3a533 100644 --- a/paddle/fluid/operators/generator/CMakeLists.txt +++ b/paddle/fluid/operators/generator/CMakeLists.txt @@ -25,7 +25,7 @@ function(install_py_pyyaml) execute_process( COMMAND ${PYTHON_EXECUTABLE} "-c" - "import re, pyyaml; print(re.compile('/__init__.py.*').sub('',pyyaml.__file__))" + "import re, yaml; print(re.compile('/__init__.py.*').sub('',yaml.__file__))" RESULT_VARIABLE _pyyaml_status ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 5c381e31673ef..69c64de705645 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
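The masked AVX-512 forms are what let the patch drop the assert(n % 16 == 0) guards: a per-iteration mask covers whatever remainder is left, so one loop handles any n. The idiom in isolation, as a self-contained sketch (requires AVX-512F; the function and its name are illustrative, not from the patch):

    // Sketch only: scale n floats by `factor` without requiring 16 to divide n.
    #include <immintrin.h>

    void scale_masked(float* buf, int n, float factor) {
      __m512 vfac = _mm512_set1_ps(factor);
      for (int off = 0; off < n; off += 16) {
        int remain = n - off;
        __mmask16 mask = remain >= 16 ? 0xffff : (1 << remain) - 1;
        // Masked-off lanes load as zero and are left untouched on store.
        __m512 v = _mm512_maskz_loadu_ps(mask, buf + off);
        _mm512_mask_storeu_ps(buf + off, mask, _mm512_mul_ps(v, vfac));
      }
    }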
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/matmul_kernel_impl.h" namespace paddle { namespace operators { @@ -53,61 +54,90 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { return phi::make_ddim({y_dim[0], 1}); } -template -class MatMulKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto &x = GET_DATA_SAFELY( - context.Input("X"), "Input", "X", "MatMul"); - auto &y = GET_DATA_SAFELY( - context.Input("Y"), "Input", "Y", "MatMul"); - auto *out = context.Output("Out"); - - auto &dev_ctx = context.template device_context(); - dev_ctx.template Alloc(out, out->numel() * sizeof(T)); - - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x.dims()), 0, context.Attr("transpose_X")); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y.dims()), 0, context.Attr("transpose_Y")); - auto scale = static_cast(context.Attr("alpha")); +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 +template +typename std::enable_if::value, void>::type +ComputeMatmulImpl(const framework::ExecutionContext &context) { + auto &dev_ctx = context.template device_context(); + + auto &x = GET_DATA_SAFELY( + context.Input("X"), "Input", "X", "MatMul"); + auto &y = GET_DATA_SAFELY( + context.Input("Y"), "Input", "Y", "MatMul"); + auto *out = context.Output("Out"); + + dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + phi::MatmulKernel(dev_ctx, + x, + y, + context.Attr("transpose_X"), + context.Attr("transpose_Y"), + out); +} +#endif - int head_number = 1; +template +typename std::enable_if::value, void>::type +ComputeMatmulImpl(const framework::ExecutionContext &context) { + auto &x = GET_DATA_SAFELY( + context.Input("X"), "Input", "X", "MatMul"); + auto &y = GET_DATA_SAFELY( + context.Input("Y"), "Input", "Y", "MatMul"); + auto *out = context.Output("Out"); + + auto &dev_ctx = context.template device_context(); + dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + auto blas = phi::funcs::GetBlas(dev_ctx); + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( + RowMatrixFromVector(x.dims()), 0, context.Attr("transpose_X")); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( + ColumnMatrixFromVector(y.dims()), 0, context.Attr("transpose_Y")); + auto scale = static_cast(context.Attr("alpha")); + + int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) - head_number = context.Attr("head_number"); + head_number = context.Attr("head_number"); #endif - const auto &x_dims = x.dims(); - const auto &y_dims = y.dims(); - if (head_number <= 1 && x_dims.size() == 3 && y_dims.size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!context.Attr("transpose_X")) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } + const auto &x_dims = x.dims(); + const auto &y_dims = y.dims(); + if (head_number <= 1 && x_dims.size() == 3 && y_dims.size() <= 2) { + // the transpose_X must be false, if is true, the transpose cost much time + if (!context.Attr("transpose_X")) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; } + } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) - bool 
split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); - - if (head_number > 1) { - blas.MatMulWithHead(x, - mat_dim_a, - y, - mat_dim_b, - scale, - head_number, - out, - T(0), - split_vertical_y); - } else { - blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); - } -#else + bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); + + if (head_number > 1) { + blas.MatMulWithHead(x, + mat_dim_a, + y, + mat_dim_b, + scale, + head_number, + out, + T(0), + split_vertical_y); + } else { blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); + } +#else + blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); #endif +} + +template +class MatMulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + ComputeMatmulImpl(context); } }; @@ -926,12 +956,31 @@ REGISTER_OP_CPU_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( matmul, ops::MatMulKernel, ops::MatMulKernel, ops::MatMulKernel); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 11060 +REGISTER_OP_CUDA_KERNEL( + matmul, + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); +#else +REGISTER_OP_CUDA_KERNEL( + matmul, + ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); +#endif +#endif + REGISTER_OP_CUDA_KERNEL( matmul_grad, ops::MatMulGradKernel, @@ -940,7 +989,6 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(matmul_grad_grad, ops::MatMulDoubleGradKernel, ops::MatMulDoubleGradKernel); -#endif REGISTER_OP_VERSION(matmul).AddCheckpoint( R"ROC(Register matmul for adding the attribute of diff --git a/paddle/fluid/ir/CMakeLists.txt b/paddle/fluid/pir/CMakeLists.txt similarity index 100% rename from paddle/fluid/ir/CMakeLists.txt rename to paddle/fluid/pir/CMakeLists.txt diff --git a/paddle/fluid/pir/dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/CMakeLists.txt new file mode 100644 index 0000000000000..17a73237c5fdb --- /dev/null +++ b/paddle/fluid/pir/dialect/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(operator) +add_subdirectory(kernel) diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/kernel/CMakeLists.txt similarity index 100% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/CMakeLists.txt rename to paddle/fluid/pir/dialect/kernel/CMakeLists.txt diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/kernel/ir/CMakeLists.txt similarity index 80% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/CMakeLists.txt rename to paddle/fluid/pir/dialect/kernel/ir/CMakeLists.txt index af5e5c4fc9016..bdfdb75410524 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/kernel/ir/CMakeLists.txt @@ -2,4 +2,4 @@ file(GLOB PADDLE_KERNEL_DIALECT_SRCS "*.cc") cc_library( pd_kernel_dialect SRCS ${PADDLE_KERNEL_DIALECT_SRCS} - DEPS pd_dialect_core) + DEPS pd_op_dialect_core) diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute_storage.h b/paddle/fluid/pir/dialect/kernel/ir/attribute_storage.h similarity index 88% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute_storage.h rename to paddle/fluid/pir/dialect/kernel/ir/attribute_storage.h index 18312b88b8ae2..1c8b4f9150b25 100644 --- 
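The matmul split above is ordinary SFINAE dispatch: the overload set is partitioned on the device context, so the GPU instantiation (compiled only when CUDA_VERSION >= 11060) forwards to phi::MatmulKernel while every other context keeps the legacy BLAS path. A self-contained toy showing the same mechanism (stand-in types; this is not Paddle code):

    // Sketch only: enable_if partitions ComputeMatmulImpl by context type.
    #include <iostream>
    #include <type_traits>

    struct GPUContext {};
    struct CPUContext {};

    template <typename DeviceContext>
    typename std::enable_if<std::is_same<DeviceContext, GPUContext>::value>::type
    ComputeMatmulImpl() {
      std::cout << "phi::MatmulKernel path (cuBLASLt-capable)\n";
    }

    template <typename DeviceContext>
    typename std::enable_if<!std::is_same<DeviceContext, GPUContext>::value>::type
    ComputeMatmulImpl() {
      std::cout << "legacy BLAS path\n";
    }

    int main() {
      ComputeMatmulImpl<GPUContext>();  // exactly one overload is viable
      ComputeMatmulImpl<CPUContext>();
    }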
a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute_storage.h +++ b/paddle/fluid/pir/dialect/kernel/ir/attribute_storage.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/attribute_base.h" -#include "paddle/ir/core/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/attribute_base.h" +#include "paddle/pir/core/utils.h" namespace paddle { namespace dialect { -struct KernelAttributeStorage : public ir::AttributeStorage { +struct KernelAttributeStorage : public pir::AttributeStorage { using ParamKey = phi::KernelKey; explicit KernelAttributeStorage(const ParamKey &key) { kernel_key_ = key; } diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.cc similarity index 89% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.cc rename to paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.cc index 43ed52ffc6701..f8c23f993ca2d 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.cc @@ -12,6 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::KernelAttribute) diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h similarity index 86% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h rename to paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h index fa17b823f0278..7b6bc2336813a 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h @@ -14,14 +14,14 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute_storage.h" -#include "paddle/ir/core/attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/attribute_storage.h" #include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/attribute.h" namespace paddle { namespace dialect { -class KernelAttribute : public ir::Attribute { +class KernelAttribute : public pir::Attribute { public: using Attribute::Attribute; diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc similarity index 76% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc rename to paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc index c2f4dfefb4d2b..592319dcfd36e 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.cc @@ -12,26 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" #include "paddle/fluid/platform/init_phi.h" -#include "paddle/ir/core/ir_printer.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/pir/core/ir_printer.h" REGISTER_FILE_SYMBOLS(kernel_dialect); namespace paddle { namespace dialect { -PaddleKernelDialect::PaddleKernelDialect(ir::IrContext *context) - : ir::Dialect(name(), context, ir::TypeId::get()) { +KernelDialect::KernelDialect(pir::IrContext *context) + : pir::Dialect(name(), context, pir::TypeId::get()) { initialize(); } -void PaddleKernelDialect::initialize() { +void KernelDialect::initialize() { RegisterTypes(); RegisterTypes(); @@ -39,7 +39,7 @@ void PaddleKernelDialect::initialize() { RegisterAttributes(); } -void PaddleKernelDialect::PrintType(ir::Type type, std::ostream &os) const { +void KernelDialect::PrintType(pir::Type type, std::ostream &os) const { if (type.isa()) { AllocatedDenseTensorType tensor_type = type.dyn_cast(); @@ -67,16 +67,16 @@ void PaddleKernelDialect::PrintType(ir::Type type, std::ostream &os) const { } } -void PaddleKernelDialect::PrintAttribute(ir::Attribute attr, - std::ostream &os) const { +void KernelDialect::PrintAttribute(pir::Attribute attr, + std::ostream &os) const { phi::KernelKey kernel = attr.dyn_cast().data(); os << ""; } -void PaddleKernelDialect::PrintOperation(ir::Operation *op, - ir::IrPrinter &printer) const { +void KernelDialect::PrintOperation(pir::Operation *op, + pir::IrPrinter &printer) const { if (op->dyn_cast() || op->dyn_cast()) { auto &os = printer.os; printer.PrintOpResult(op); @@ -86,7 +86,7 @@ void PaddleKernelDialect::PrintOperation(ir::Operation *op, if (op->attributes().count("is_inplace") != 0 && op->attributes() .at("is_inplace") - .dyn_cast() + .dyn_cast() .data()) { kernel_name = kernel_name + "_"; } @@ -97,7 +97,7 @@ void PaddleKernelDialect::PrintOperation(ir::Operation *op, if (op->attributes().count("is_inplace") != 0 && op->attributes() .at("is_inplace") - .dyn_cast() + .dyn_cast() .data()) { kernel_name = kernel_name + "_"; } @@ -117,4 +117,4 @@ void PaddleKernelDialect::PrintOperation(ir::Operation *op, } // namespace dialect } // namespace paddle -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::PaddleKernelDialect) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::KernelDialect) diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h similarity index 64% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h rename to paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h index 8099e1d1da093..d2fbcadaf8cf2 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h @@ -14,23 +14,23 @@ #pragma once -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/dialect.h" namespace paddle { namespace dialect { -class PaddleKernelDialect : public ir::Dialect { +class KernelDialect : public pir::Dialect { public: - 
explicit PaddleKernelDialect(ir::IrContext* context); + explicit KernelDialect(pir::IrContext* context); static const char* name() { return "pd_kernel"; } - void PrintType(ir::Type type, std::ostream& os) const override; + void PrintType(pir::Type type, std::ostream& os) const override; - void PrintAttribute(ir::Attribute attr, std::ostream& os) const override; + void PrintAttribute(pir::Attribute attr, std::ostream& os) const override; - void PrintOperation(ir::Operation* op, - ir::IrPrinter& printer) const override; // NOLINT + void PrintOperation(pir::Operation* op, + pir::IrPrinter& printer) const override; // NOLINT private: void initialize(); @@ -39,4 +39,4 @@ class PaddleKernelDialect : public ir::Dialect { } // namespace dialect } // namespace paddle -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::PaddleKernelDialect) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::KernelDialect) diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc similarity index 78% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.cc rename to paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc index 4a934505aad55..62c1129f84620 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" #include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/builtin_attribute.h" namespace paddle { namespace dialect { @@ -31,12 +31,12 @@ void PhiKernelOp::Verify() { auto& attributes = this->attributes(); PADDLE_ENFORCE(attributes.count("op_name") > 0 && - attributes.at("op_name").isa(), + attributes.at("op_name").isa(), phi::errors::PreconditionNotMet( "Type of attribute: op_name is not right.")); PADDLE_ENFORCE(attributes.count("kernel_name") > 0 && - attributes.at("kernel_name").isa(), + attributes.at("kernel_name").isa(), phi::errors::PreconditionNotMet( "Type of attribute: kernel_name is not right.")); @@ -47,10 +47,13 @@ void PhiKernelOp::Verify() { } std::string PhiKernelOp::op_name() { - return attributes().at("op_name").dyn_cast().AsString(); + return attributes().at("op_name").dyn_cast().AsString(); } std::string PhiKernelOp::kernel_name() { - return attributes().at("kernel_name").dyn_cast().AsString(); + return attributes() + .at("kernel_name") + .dyn_cast() + .AsString(); } phi::KernelKey PhiKernelOp::kernel_key() { return attributes().at("kernel_key").dyn_cast().data(); @@ -67,12 +70,12 @@ void LegacyKernelOp::Verify() { auto& attributes = this->attributes(); PADDLE_ENFORCE(attributes.count("op_name") > 0 && - attributes.at("op_name").isa(), + attributes.at("op_name").isa(), phi::errors::PreconditionNotMet( "Type of attribute: op_name is not right.")); PADDLE_ENFORCE(attributes.count("kernel_name") > 0 && - attributes.at("kernel_name").isa(), + attributes.at("kernel_name").isa(), phi::errors::PreconditionNotMet( "Type of attribute: kernel_name is not right.")); @@ -83,10 +86,13 @@ void LegacyKernelOp::Verify() { } std::string LegacyKernelOp::op_name() { - return attributes().at("op_name").dyn_cast().AsString(); + 
return attributes().at("op_name").dyn_cast().AsString(); } std::string LegacyKernelOp::kernel_name() { - return attributes().at("kernel_name").dyn_cast().AsString(); + return attributes() + .at("kernel_name") + .dyn_cast() + .AsString(); } phi::KernelKey LegacyKernelOp::kernel_key() { return attributes().at("kernel_key").dyn_cast().data(); diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h similarity index 89% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h rename to paddle/fluid/pir/dialect/kernel/ir/kernel_op.h index 0a574bc60b218..8a18959665e0c 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_op.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/op_base.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/op_base.h" namespace paddle { namespace dialect { -class PhiKernelOp : public ir::Op { +class PhiKernelOp : public pir::Op { public: using Op::Op; static const char *name() { return "pd_kernel.phi_kernel"; } @@ -32,7 +32,7 @@ class PhiKernelOp : public ir::Op { void Verify(); }; -class LegacyKernelOp : public ir::Op { +class LegacyKernelOp : public pir::Op { public: using Op::Op; static const char *name() { return "pd_kernel.legacy_kernel"; } diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.cc b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc similarity index 91% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.cc rename to paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc index 9740f1296a51b..60a722f13dab5 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.cc +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
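Apart from the ir:: to pir:: namespace move, the attribute plumbing is unchanged: required attributes are checked with isa<> and unwrapped with dyn_cast<>().AsString(). The access pattern shared by the two accessors above, factored out as a hedged sketch (the helper and its name are illustrative, not part of the patch):

    // Sketch only: fetch a required string attribute from a pir operation,
    // mirroring PhiKernelOp::op_name()/kernel_name() above.
    std::string GetStrAttr(pir::Operation* op, const std::string& name) {
      const auto& attrs = op->attributes();
      PADDLE_ENFORCE(
          attrs.count(name) > 0 && attrs.at(name).isa<pir::StrAttribute>(),
          phi::errors::PreconditionNotMet(
              "Type of attribute: %s is not right.", name));
      return attrs.at(name).dyn_cast<pir::StrAttribute>().AsString();
    }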
-#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" namespace paddle { namespace dialect { @@ -21,7 +21,7 @@ const phi::Place& AllocatedDenseTensorType::place() const { return storage()->place_; } -const ir::Type& AllocatedDenseTensorType::dtype() const { +const pir::Type& AllocatedDenseTensorType::dtype() const { return storage()->dense_tensor_type_.dtype(); } @@ -45,7 +45,7 @@ const phi::Place& AllocatedSelectedRowsType::place() const { return storage()->place_; } -const ir::Type& AllocatedSelectedRowsType::dtype() const { +const pir::Type& AllocatedSelectedRowsType::dtype() const { return storage()->selected_rows_type_.dtype(); } diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h similarity index 66% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h rename to paddle/fluid/pir/dialect/kernel/ir/kernel_type.h index b00f2e5320dde..adb78639d65c0 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h +++ b/paddle/fluid/pir/dialect/kernel/ir/kernel_type.h @@ -14,30 +14,30 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type_storage.h" -#include "paddle/ir/core/type.h" +#include "paddle/fluid/pir/dialect/kernel/ir/type_storage.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/type.h" namespace paddle { namespace dialect { -class AllocatedDenseTensorType : public ir::Type { +class AllocatedDenseTensorType + : public pir::Type::TypeBase { public: - using Type::Type; + using Base::Base; - DECLARE_TYPE_UTILITY_FUNCTOR(AllocatedDenseTensorType, - AllocatedDenseTensorTypeStorage); - - static AllocatedDenseTensorType get(ir::IrContext *ctx, + static AllocatedDenseTensorType get(pir::IrContext *ctx, const phi::Place &place, dialect::DenseTensorType type) { - return ir::TypeManager::template get( + return pir::TypeManager::template get( ctx, place, type); } - static AllocatedDenseTensorType get(ir::IrContext *ctx, + static AllocatedDenseTensorType get(pir::IrContext *ctx, const phi::Place &place, - const ir::Type &dtype, + const pir::Type &dtype, const phi::DDim &dims, const phi::DataLayout &layout, const phi::LoD &lod, @@ -45,13 +45,13 @@ class AllocatedDenseTensorType : public ir::Type { dialect::DenseTensorType dense_tensor_type = dialect::DenseTensorType::get(ctx, dtype, dims, layout, lod, offset); - return ir::TypeManager::template get( + return pir::TypeManager::template get( ctx, place, dense_tensor_type); } const phi::Place &place() const; - const ir::Type &dtype() const; + const pir::Type &dtype() const; const phi::DDim &dims() const; @@ -62,23 +62,23 @@ class AllocatedDenseTensorType : public ir::Type { const size_t &offset() const; }; -class AllocatedSelectedRowsType : public ir::Type { +class AllocatedSelectedRowsType + : public pir::Type::TypeBase { public: - using Type::Type; - - DECLARE_TYPE_UTILITY_FUNCTOR(AllocatedSelectedRowsType, - AllocatedSelectedRowsTypeStorage); + using Base::Base; - static AllocatedSelectedRowsType get(ir::IrContext *ctx, + static AllocatedSelectedRowsType get(pir::IrContext *ctx, const phi::Place &place, dialect::SelectedRowsType type) { - return ir::TypeManager::template get( + return pir::TypeManager::template get( ctx, place, type); } - static AllocatedSelectedRowsType get(ir::IrContext *ctx, + static 
AllocatedSelectedRowsType get(pir::IrContext *ctx, const phi::Place &place, - const ir::Type &dtype, + const pir::Type &dtype, const phi::DDim &dims, const phi::DataLayout &layout, const phi::LoD &lod, @@ -86,13 +86,13 @@ class AllocatedSelectedRowsType : public ir::Type { dialect::SelectedRowsType type = dialect::SelectedRowsType::get(ctx, dtype, dims, layout, lod, offset); - return ir::TypeManager::template get( + return pir::TypeManager::template get( ctx, place, type); } const phi::Place &place() const; - const ir::Type &dtype() const; + const pir::Type &dtype() const; const phi::DDim &dims() const; diff --git a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type_storage.h b/paddle/fluid/pir/dialect/kernel/ir/type_storage.h similarity index 72% rename from paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type_storage.h rename to paddle/fluid/pir/dialect/kernel/ir/type_storage.h index 1913dd6e6346c..46622587e51f5 100644 --- a/paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type_storage.h +++ b/paddle/fluid/pir/dialect/kernel/ir/type_storage.h @@ -16,10 +16,10 @@ #include -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/utils.h" namespace paddle { namespace dialect { @@ -30,7 +30,7 @@ namespace dialect { /// following methods: (1)declare ParamKey, (2)define Construction method, /// (3)define HashValue method, (4)overload operator==. /// -struct AllocatedDenseTensorTypeStorage : public ir::TypeStorage { +struct AllocatedDenseTensorTypeStorage : public pir::TypeStorage { using Place = phi::Place; /// /// \brief Declare ParamKey according to parameter type. @@ -56,18 +56,19 @@ struct AllocatedDenseTensorTypeStorage : public ir::TypeStorage { static std::size_t HashValue(const ParamKey& key) { std::size_t hash_value = 0; // hash place - hash_value = ir::hash_combine(hash_value, std::get<0>(key).HashValue()); + hash_value = pir::hash_combine(hash_value, std::get<0>(key).HashValue()); // hash dtype auto dense_tensor_type = std::get<1>(key); - hash_value = ir::hash_combine(hash_value, - dialect::DenseTensorTypeStorage::HashValue( - dialect::DenseTensorTypeStorage::ParamKey( - dense_tensor_type.dtype(), - dense_tensor_type.dims(), - dense_tensor_type.data_layout(), - dense_tensor_type.lod(), - dense_tensor_type.offset()))); + hash_value = + pir::hash_combine(hash_value, + dialect::DenseTensorTypeStorage::HashValue( + dialect::DenseTensorTypeStorage::ParamKey( + dense_tensor_type.dtype(), + dense_tensor_type.dims(), + dense_tensor_type.data_layout(), + dense_tensor_type.lod(), + dense_tensor_type.offset()))); return hash_value; } @@ -92,7 +93,7 @@ struct AllocatedDenseTensorTypeStorage : public ir::TypeStorage { /// \brief Define Parametric TypeStorage for AllocatedSelectedRowsTypeStorage. /// /// -struct AllocatedSelectedRowsTypeStorage : public ir::TypeStorage { +struct AllocatedSelectedRowsTypeStorage : public pir::TypeStorage { using Place = phi::Place; /// /// \brief Declare ParamKey according to parameter type. 
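Both storages hash identically: fold the place's hash, then the hash of the wrapped tensor type's full parameter tuple, into a single seed via pir::hash_combine. The folding step is the usual seed-mixing idiom; a self-contained equivalent for reference (the exact mixing constants inside pir may differ, so treat this as an assumption):

    // Sketch only: hash_combine-style folding as used by HashValue above.
    #include <cstddef>

    inline std::size_t hash_combine(std::size_t seed, std::size_t value) {
      return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
    }

    std::size_t HashKey(std::size_t place_hash, std::size_t type_hash) {
      std::size_t h = 0;
      h = hash_combine(h, place_hash);  // fold the place first
      h = hash_combine(h, type_hash);   // then the DenseTensorType params
      return h;
    }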
@@ -118,18 +119,19 @@ struct AllocatedSelectedRowsTypeStorage : public ir::TypeStorage { static std::size_t HashValue(const ParamKey& key) { std::size_t hash_value = 791; // hash place - hash_value = ir::hash_combine(hash_value, std::get<0>(key).HashValue()); + hash_value = pir::hash_combine(hash_value, std::get<0>(key).HashValue()); // hash dtype auto selected_rows_type = std::get<1>(key); - hash_value = ir::hash_combine(hash_value, - dialect::DenseTensorTypeStorage::HashValue( - dialect::DenseTensorTypeStorage::ParamKey( - selected_rows_type.dtype(), - selected_rows_type.dims(), - selected_rows_type.data_layout(), - selected_rows_type.lod(), - selected_rows_type.offset()))); + hash_value = + pir::hash_combine(hash_value, + dialect::DenseTensorTypeStorage::HashValue( + dialect::DenseTensorTypeStorage::ParamKey( + selected_rows_type.dtype(), + selected_rows_type.dims(), + selected_rows_type.data_layout(), + selected_rows_type.lod(), + selected_rows_type.offset()))); return hash_value; } diff --git a/paddle/fluid/ir/dialect/op_generator/api_gen.py b/paddle/fluid/pir/dialect/op_generator/api_gen.py similarity index 71% rename from paddle/fluid/ir/dialect/op_generator/api_gen.py rename to paddle/fluid/pir/dialect/op_generator/api_gen.py index cae035c657b69..66f1af1ed69e7 100644 --- a/paddle/fluid/ir/dialect/op_generator/api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/api_gen.py @@ -17,7 +17,12 @@ import re import yaml -from op_gen import OpCompatParser, OpInfoParser, to_pascal_case +from op_gen import ( + PD_MANUAL_OP_LIST, + OpCompatParser, + OpInfoParser, + to_pascal_case, +) H_FILE_TEMPLATE = """ @@ -25,11 +30,11 @@ #include -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/value.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_api.h" {body} @@ -37,11 +42,11 @@ CPP_FILE_TEMPLATE = """ -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/builtin_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_op.h" {body} @@ -71,17 +76,16 @@ """ COMBINE_OP_TEMPLATE = """ - auto {op_name} = APIBuilder::Instance().GetBuilder()->Build({in_name});""" + auto {op_name} = APIBuilder::Instance().GetBuilder()->Build({in_name});""" SPLIT_OP_TEMPLATE = """ - auto {op_name} = APIBuilder::Instance().GetBuilder()->Build({in_name});""" + auto {op_name} = APIBuilder::Instance().GetBuilder()->Build({in_name});""" COMPUTE_OP_TEMPLATE = """ paddle::dialect::{op_class_name} {op_inst_name} = APIBuilder::Instance().GetBuilder()->Build({args});""" -OP_RESULT = 'ir::OpResult' -VECTOR_TYPE = 'ir::VectorType' -PD_MANUAL_OP_LIST = ['add_n'] +OP_RESULT = 'pir::OpResult' +VECTOR_TYPE = 'pir::VectorType' def get_op_class_name(op_name): @@ -91,9 +95,9 @@ def get_op_class_name(op_name): class CodeGen: def __init__(self) -> None: self._type_map = { - 'paddle::dialect::DenseTensorType': 'ir::OpResult', - 'paddle::dialect::SelectedRowsType': 'ir::OpResult', - 'ir::VectorType': 'std::vector', + 
'paddle::dialect::DenseTensorType': 'pir::OpResult', + 'paddle::dialect::SelectedRowsType': 'pir::OpResult', + 'pir::VectorType': 'std::vector', } def _parse_yaml(self, op_yaml_files, op_compat_yaml_file): @@ -111,6 +115,11 @@ def _parse_yaml(self, op_yaml_files, op_compat_yaml_file): ) return op_info_items + def _need_skip(self, op_info, op_name): + return ( + op_info.infer_meta_func is None and op_name not in PD_MANUAL_OP_LIST + ) + # ===================================== # Gen declare functions # ===================================== @@ -123,11 +132,14 @@ def _gen_api_inputs(self, op_info): ret.append(f'{self._type_map[type]} {name}') return ', '.join(ret) - def _gen_api_attrs(self, op_info, with_default, is_mutable_attr): + def _gen_api_attrs( + self, op_info, with_default, is_mutable_attr, is_vector_mutable_sttr + ): name_list = op_info.attribute_name_list type_list = op_info.attribute_build_arg_type_list default_value_list = op_info.attribute_default_value_list mutable_name_list = op_info.mutable_attribute_name_list + mutable_type_list = op_info.mutable_attribute_type_list assert len(name_list) == len(type_list) == len(default_value_list) no_mutable_attr = [] mutable_attr = [] @@ -135,7 +147,14 @@ def _gen_api_attrs(self, op_info, with_default, is_mutable_attr): name_list, type_list, default_value_list ): if is_mutable_attr and name in mutable_name_list: - mutable_attr.append(f'{OP_RESULT} {name}') + if ( + mutable_type_list[mutable_name_list.index(name)][0] + == "paddle::dialect::IntArrayAttribute" + and is_vector_mutable_sttr + ): + mutable_attr.append(f'std::vector<{OP_RESULT}> {name}') + else: + mutable_attr.append(f'{OP_RESULT} {name}') continue if with_default and default_value is not None: if type in ['float', 'double']: @@ -149,9 +168,17 @@ def _gen_api_attrs(self, op_info, with_default, is_mutable_attr): no_mutable_attr.append(f'{type} {name}') return ', '.join(mutable_attr + no_mutable_attr) - def _gen_api_args(self, op_info, with_default_attr, is_mutable_attr): + def _gen_api_args( + self, + op_info, + with_default_attr, + is_mutable_attr, + is_vector_mutable_attr, + ): inputs = self._gen_api_inputs(op_info) - attrs = self._gen_api_attrs(op_info, with_default_attr, is_mutable_attr) + attrs = self._gen_api_attrs( + op_info, with_default_attr, is_mutable_attr, is_vector_mutable_attr + ) return (inputs + ', ' + attrs).strip(', ') def _gen_ret_type(self, op_info): @@ -178,11 +205,15 @@ def _gen_ret_type(self, op_info): elif output_num == 0: return 'void' - def _gen_one_declare(self, op_info, op_name, is_mutable_attr): + def _gen_one_declare( + self, op_info, op_name, is_mutable_attr, is_vector_mutable_attr + ): return API_DECLARE_TEMPLATE.format( ret_type=self._gen_ret_type(op_info), api_name=op_name, - args=self._gen_api_args(op_info, True, is_mutable_attr), + args=self._gen_api_args( + op_info, True, is_mutable_attr, is_vector_mutable_attr + ), ) def _gen_h_file(self, op_info_items, namespaces, h_file_path): @@ -191,15 +222,21 @@ def _gen_h_file(self, op_info_items, namespaces, h_file_path): for op_name in op_info.op_phi_name: # NOTE:When infer_meta_func is None, the Build() function generated in pd_op # is wrong, so temporarily skip the automatic generation of these APIs - if ( - op_info.infer_meta_func is None - and op_name not in PD_MANUAL_OP_LIST - ): + if self._need_skip(op_info, op_name): continue - declare_str += self._gen_one_declare(op_info, op_name, False) + declare_str += self._gen_one_declare( + op_info, op_name, False, False + ) if 
len(op_info.mutable_attribute_name_list) > 0: - declare_str += self._gen_one_declare(op_info, op_name, True) - + declare_str += self._gen_one_declare( + op_info, op_name, True, False + ) + if "paddle::dialect::IntArrayAttribute" in { + type[0] for type in op_info.mutable_attribute_type_list + }: + declare_str += self._gen_one_declare( + op_info, op_name, True, True + ) body = declare_str for namespace in reversed(namespaces): body = NAMESPACE_TEMPLATE.format(namespace=namespace, body=body) @@ -209,7 +246,7 @@ def _gen_h_file(self, op_info_items, namespaces, h_file_path): # ===================================== # Gen impl functions # ===================================== - def _gen_in_combine(self, op_info): + def _gen_in_combine(self, op_info, is_mutable_attr, is_vector_mutable_attr): name_list = op_info.input_name_list type_list = op_info.input_type_list assert len(name_list) == len(type_list) @@ -224,6 +261,24 @@ def _gen_in_combine(self, op_info): combine_op_list.append(op_name) else: combine_op_list.append(None) + + if is_mutable_attr: + name_list = op_info.mutable_attribute_name_list + type_list = op_info.mutable_attribute_type_list + assert len(name_list) == len(type_list) + for name, type in zip(name_list, type_list): + if ( + type[0] == "paddle::dialect::IntArrayAttribute" + and is_vector_mutable_attr + ): + op_name = f'{name}_combine_op' + combine_op += COMBINE_OP_TEMPLATE.format( + op_name=op_name, in_name=name + ) + combine_op_list.append(op_name) + else: + combine_op_list.append(None) + return combine_op, combine_op_list def _gen_compute_op_args( @@ -233,15 +288,22 @@ def _gen_compute_op_args( all_attr_list = op_info.attribute_name_list no_mutable_attr_list = op_info.non_mutable_attribute_name_list mutable_attr_list = op_info.mutable_attribute_name_list - assert len(input_name_list) == len(in_combine_op_list) + assert len(input_name_list) + len(mutable_attr_list) == len( + in_combine_op_list + ) or len(input_name_list) == len(in_combine_op_list) ret = [] - for input_name, combine_op in zip(input_name_list, in_combine_op_list): + if is_mutable_attr: + name_list = input_name_list + mutable_attr_list + else: + name_list = input_name_list + + for input_name, combine_op in zip(name_list, in_combine_op_list): if combine_op is None: ret.append(input_name) else: ret.append(f'{combine_op}.out()') if is_mutable_attr: - ret += list(mutable_attr_list + no_mutable_attr_list) + ret += list(no_mutable_attr_list) else: ret += list(all_attr_list) return ', '.join(ret) @@ -293,9 +355,13 @@ def _gen_return_result(self, ret_list): elif len(ret_list) == 0: return 'return;' - def _gen_one_impl(self, op_info, op_name, is_mutable_attr): + def _gen_one_impl( + self, op_info, op_name, is_mutable_attr, is_vector_mutable_attr + ): ret_type = self._gen_ret_type(op_info) - in_combine, in_combine_op_list = self._gen_in_combine(op_info) + in_combine, in_combine_op_list = self._gen_in_combine( + op_info, is_mutable_attr, is_vector_mutable_attr + ) compute_op, op_inst_name = self._gen_compute_op( op_info, op_name, in_combine_op_list, is_mutable_attr ) @@ -309,7 +375,9 @@ def _gen_one_impl(self, op_info, op_name, is_mutable_attr): ret = API_IMPL_TEMPLATE.format( ret_type=ret_type, api_name=op_name, - args=self._gen_api_args(op_info, False, is_mutable_attr), + args=self._gen_api_args( + op_info, False, is_mutable_attr, is_vector_mutable_attr + ), in_combine=in_combine, compute_op=compute_op, out_split=out_split, @@ -325,14 +393,19 @@ def _gen_cpp_file(self, op_info_items, namespaces, cpp_file_path): for op_name 
in op_info.op_phi_name: # NOTE:When infer_meta_func is None, the Build() function generated in pd_op # is wrong, so temporarily skip the automatic generation of these APIs - if ( - op_info.infer_meta_func is None - and op_name not in PD_MANUAL_OP_LIST - ): + if self._need_skip(op_info, op_name): continue - impl_str += self._gen_one_impl(op_info, op_name, False) + impl_str += self._gen_one_impl(op_info, op_name, False, False) if len(op_info.mutable_attribute_name_list) > 0: - impl_str += self._gen_one_impl(op_info, op_name, True) + impl_str += self._gen_one_impl( + op_info, op_name, True, False + ) + if "paddle::dialect::IntArrayAttribute" in { + type[0] for type in op_info.mutable_attribute_type_list + }: + impl_str += self._gen_one_impl( + op_info, op_name, True, True + ) body = impl_str for namespace in reversed(namespaces): body = NAMESPACE_TEMPLATE.format(namespace=namespace, body=body) diff --git a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py similarity index 76% rename from paddle/fluid/ir/dialect/op_generator/op_build_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_build_gen.py index 66d1094c9e5fc..66a3d5fbdf311 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_build_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_build_gen.py @@ -13,9 +13,22 @@ # limitations under the License. # generator build function -_INFERMETA_NEED_META_CONFIG = {'SplitInferMeta'} +_INFERMETA_NEED_META_CONFIG = { + 'SplitInferMeta', + 'SumInferMeta', + 'SplitWithNumInferMeta', + 'ConcatInferMeta', + 'ReduceIntArrayAxisInferMeta', +} + +_PREPARE_DATA_WITH_UNKNOW_ATTRIBUTE = { + 'SplitOp', + 'SumOp', + 'SplitWithNumOp', + 'ConcatOp', + 'MeanOp', +} -_PREPARE_DATA_WITH_UNKNOW_ATTRIBUTE = {'SplitOp'} OP_BUILD_TEMPLATE = """ void {op_name}::Build({build_args}) {{ @@ -42,16 +55,16 @@ def GenBuildInputArgsStr( attr_args_is_map=False, ): ''' - Example: ir::Builder &builder, ir::OperationArgument &argument, ir::OpResult x_, phi::DataType dtype=phi::DataType::UNDEFINED, phi::Place place={} + Example: pir::Builder &builder, pir::OperationArgument &argument, pir::OpResult x_, phi::DataType dtype=phi::DataType::UNDEFINED, phi::Place place={} ''' # add inputs - build_args_str = "ir::Builder &builder, ir::OperationArgument &argument" + build_args_str = "pir::Builder &builder, pir::OperationArgument &argument" if len(op_input_name_list) > 0: for input_name in op_input_name_list: - build_args_str += ", ir::OpResult " + input_name + "_" + build_args_str += ", pir::OpResult " + input_name + "_" if attr_args_is_map: - build_args_str += ", ir::AttributeMap attributes" + build_args_str += ", pir::AttributeMap attributes" else: if not mutable_attr_is_input: # add attributes @@ -86,7 +99,7 @@ def GenBuildInputArgsStr( # add mutable attributes as inputs if len(op_mutable_attribute_name_list) > 0: for mutable_attr in op_mutable_attribute_name_list: - build_args_str += ", ir::OpResult " + mutable_attr + "_" + build_args_str += ", pir::OpResult " + mutable_attr + "_" # add non-mutable attributes for attr_idx in range(len(op_non_mutable_attribute_name_list)): @@ -146,11 +159,11 @@ def GenBuildInserFullForMutableAttribute( build_mutable_attribute = "" BUILD_INTARRAY_ATTRIBUTE_TEMPLATE = """ // Generate int_array mutable attribute: {attr_name} paddle::dialect::FullIntArrayOp full_{attr_name}_op = builder.Build({attr_name}, {phi_dtype}, phi::CPUPlace()); - ir::OpResult {attr_name}_ = full_{attr_name}_op->result(0); + pir::OpResult {attr_name}_ = 
full_{attr_name}_op->result(0); """ BUILD_SCALAR_ATTRIBUTE_TEMPLATE = """ // Generate scalar mutable attribute: {attr_name} paddle::dialect::FullOp full_{attr_name}_op = builder.Build(std::vector{{1}}, {attr_name}, {phi_dtype}, phi::CPUPlace()); - ir::OpResult {attr_name}_ = full_{attr_name}_op->result(0); + pir::OpResult {attr_name}_ = full_{attr_name}_op->result(0); """ for idx in range(len(op_mutable_attribute_name_list)): attr_name = op_mutable_attribute_name_list[idx] @@ -177,7 +190,7 @@ def GenBuildInserFullForMutableAttribute( def GenBuildInputs(op_input_name_list, op_mutable_attribute_name_list): - BUILD_INPUT_TEMPLATE = """ std::vector argument_inputs = {{{inputs_args}}}; + BUILD_INPUT_TEMPLATE = """ std::vector argument_inputs = {{{inputs_args}}}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); """ build_input_str = ' VLOG(4) << "Builder construction inputs";\n' @@ -194,24 +207,25 @@ def GenBuildInputs(op_input_name_list, op_mutable_attribute_name_list): def GenBuildAttributes( op_non_mutable_attribute_name_list, op_non_mutable_attribute_type_list ): - INTARRAY_STR_TEMPLATE = """ ir::Attribute attr_{attr_name} = {op_attribute_type}::get(ir::IrContext::Instance(), phi::IntArray({attr})); + INTARRAY_STR_TEMPLATE = """ pir::Attribute attr_{attr_name} = {op_attribute_type}::get(pir::IrContext::Instance(), phi::IntArray({attr})); """ - SCALAR_STR_TEMPLATE = """ ir::Attribute attr_{attr_name} = paddle::dialect::TransToIrAttribute({attr}, ir::IrContext::Instance()); + SCALAR_STR_TEMPLATE = """ pir::Attribute attr_{attr_name} = paddle::dialect::TransToIrAttribute({attr}, pir::IrContext::Instance()); """ - STR_TEMPLATE = """ ir::Attribute attr_{attr_name} = {op_attribute_type}::get(ir::IrContext::Instance(), {attr}); + STR_TEMPLATE = """ pir::Attribute attr_{attr_name} = {op_attribute_type}::get(pir::IrContext::Instance(), {attr}); """ - ARRAY_ATTRIBUTE_TEMPLATE = """ std::vector vec_{attr_name}; + ARRAY_ATTRIBUTE_TEMPLATE = """ std::vector vec_{attr_name}; for (size_t i = 0; i < static_cast({attr_size}); i++) {{ {create_attribute} vec_{attr_name}.push_back(attr_{attr_name}); }} - ir::Attribute attr_{attr_name} = ir::ArrayAttribute::get(ir::IrContext::Instance(), vec_{attr_name}); + pir::Attribute attr_{attr_name} = pir::ArrayAttribute::get(pir::IrContext::Instance(), vec_{attr_name}); """ attr_str = ' VLOG(4) << "Builder construction attributes";\n' + array_attr_type = "pir::ArrayAttribute<" for idx in range(len(op_non_mutable_attribute_name_list)): - if "ir::ArrayAttribute<" in op_non_mutable_attribute_type_list[idx]: + if array_attr_type in op_non_mutable_attribute_type_list[idx]: inner_attribute_type = op_non_mutable_attribute_type_list[idx][ - 19:-1 + len(array_attr_type) : -1 ] if inner_attribute_type == "paddle::dialect::IntArrayAttribute": attr_str += ARRAY_ATTRIBUTE_TEMPLATE.format( @@ -280,12 +294,15 @@ def GenBuildOutputs( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_mutable_attribute_name_list, op_mutable_attribute_type_list, op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, mutable_attr_is_input=False, ): build_output_str = ' VLOG(4) << "Builder construction outputs";\n' @@ -299,6 +316,23 @@ def GenBuildOutputs( VLOG(4) << "Builder construction meta_{name}"; phi::MetaTensor meta_{name}(&ir_meta_tensor_{name}); """ + + CREATE_OPTIONAL_INPUT_METATENSOR_TEMPLATE = """ + phi::MetaTensor meta_{name}; + if ({name}_.impl() != nullptr) {{ + 
paddle::dialect::DenseTensorType {name} = {name}_.type().dyn_cast(); + VLOG(4) << "Builder construction dense_{name}"; + paddle::dialect::IrMetaTensor ir_meta_tensor_{name}(paddle::dialect::TransToPhiDataType({name}.dtype()), + {name}.dims(), + {name}.data_layout(), + {name}.lod(), + {name}.offset()); + VLOG(4) << "Builder construction meta_{name}"; + meta_{name} = phi::MetaTensor(&ir_meta_tensor_{name}); + }} + +""" + CREATE_INPUT_VEC_METATENSOR_TEMPLATE = """ std::vector vec_ir_meta_tensor_{name}; for (size_t i=0; i < static_cast({name}.size()); i++) {{ vec_ir_meta_tensor_{name}.push_back(paddle::dialect::IrMetaTensor(paddle::dialect::TransToPhiDataType({name}[i].dyn_cast().dtype()), @@ -322,7 +356,7 @@ def GenBuildOutputs( CREATE_SCALAR_MUTABLE_ATTRIBUE_TEMPLATE = """ {dtype} {name} = {name}_.owner()->dyn_cast().attributes().at("value").dyn_cast().data().to<{dtype}>(); (void){name};\n""" CREATE_INTARRAY_MUTABLE_ATTRIBUE_WITH_UNKONW_DATA_TEMPLATE = """ phi::IntArray {name}; - if ({name}_.owner()->info().id() == ir::TypeId::get()) {{ + if ({name}_.owner()->info().id() == pir::TypeId::get()) {{ {name} = std::move(phi::IntArray({name}_.owner() ->dyn_cast() .attributes() @@ -330,8 +364,8 @@ def GenBuildOutputs( .dyn_cast() .data() .GetData())); - }} else if ({name}_.type().isa()) {{ - size_t {name}_size = {name}_.type().dyn_cast().size(); + }} else if ({name}_.type().isa()) {{ + size_t {name}_size = {name}_.type().dyn_cast().size(); {name} = std::move(phi::IntArray(std::vector({name}_size, -1))); {name}.SetFromTensor(true); }} else if ({name}_.type().isa()) {{ @@ -343,7 +377,7 @@ def GenBuildOutputs( }}\n""" CREATE_SCALAR_MUTABLE_ATTRIBUE_WITH_UNKONW_DATA_TEMPLATE = """ phi::Scalar {name}; - if ({name}_.owner()->info().id() == ir::TypeId::get()) {{ + if ({name}_.owner()->info().id() == pir::TypeId::get()) {{ {name} = std::move(phi::Scalar({name}_.owner() ->dyn_cast() .attributes() @@ -373,15 +407,16 @@ def GenBuildOutputs( # Prepar input type for idx in range(len(op_input_name_list)): # is a vector - if 'ir::VectorType' in op_input_type_list[idx]: - build_output_str += " ir::VectorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( + if 'pir::VectorType' in op_input_type_list[idx]: + build_output_str += " pir::VectorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( name=op_input_name_list[idx] ) # is a Tensor else: - build_output_str += " paddle::dialect::DenseTensorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( - name=op_input_name_list[idx] - ) + if op_input_optional_list[idx] == 'false': + build_output_str += " paddle::dialect::DenseTensorType {name} = {name}_.type().dyn_cast(); (void){name};\n".format( + name=op_input_name_list[idx] + ) # Prepare mutable attributes if mutable_attr_is_input: @@ -414,7 +449,7 @@ def GenBuildOutputs( ) ) # string - elif attr_dtype[0] == "ir::StrAttribute": + elif attr_dtype[0] == "pir::StrAttribute": build_output_str += "" else: assert "mutable attribtue type is not right." 
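The templates above let the generated Build() cope with mutable attributes whose values are not compile-time constants (the ops listed in _PREPARE_DATA_WITH_UNKNOW_ATTRIBUTE). A rough Python rendering of the dispatch the generated C++ performs for a mutable IntArray attribute follows; owner(), attribute(), and the type predicates are illustrative stand-ins for the corresponding pir calls, not generator code:

def resolve_int_array(value):
    # Constant case: the literal lives on the producing full_int_array op.
    producer = value.owner()
    if producer.name() == "pd_op.full_int_array":
        return {"data": producer.attribute("value"), "from_tensor": False}
    # Combined 1-D tensors: only the element count is known statically, so
    # emit -1 placeholders and defer the real values to execution time
    # (this mirrors SetFromTensor(true) in the template above).
    if value.type().is_vector():
        return {"data": [-1] * value.type().size(), "from_tensor": True}
    # A single dense-tensor input is handled by a further branch in the
    # generated code; this sketch stops here.
    raise NotImplementedError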
@@ -430,7 +465,7 @@ def GenBuildOutputs( ) not in infer_meta_args: # is a vector if ( - 'ir::VectorType' + 'pir::VectorType' in op_input_type_list[ op_input_name_list.index( op_infer_meta_map['param'][idx] @@ -444,9 +479,21 @@ def GenBuildOutputs( ) # is a Tensor else: - build_output_str += CREATE_INPUT_METATENSOR_TEMPLATE.format( - name=op_infer_meta_map['param'][idx] + input_index = op_input_name_list.index( + op_infer_meta_map['param'][idx] ) + if op_input_optional_list[input_index] == 'true': + build_output_str += ( + CREATE_OPTIONAL_INPUT_METATENSOR_TEMPLATE.format( + name=op_infer_meta_map['param'][idx] + ) + ) + else: + build_output_str += ( + CREATE_INPUT_METATENSOR_TEMPLATE.format( + name=op_infer_meta_map['param'][idx] + ) + ) infer_meta_args.append("meta_" + op_infer_meta_map['param'][idx]) # is attribute @@ -456,7 +503,7 @@ def GenBuildOutputs( # Prepare outputs_meta_tensor for infer meta for idx in range(len(op_output_name_list)): # is a vector - if 'ir::VectorType' in op_output_type_list[idx]: + if 'pir::VectorType' in op_output_type_list[idx]: build_output_str += CREATE_OUTPUT_VEC_METATENSOR_TEMPLATE.format( name=op_output_name_list[idx], output_size=op_output_size_list[idx], @@ -488,32 +535,58 @@ def GenBuildOutputs( ) # use dense_{name} or vec_dense_{name} to create Outputs type - build_output_str += "\n std::vector argument_outputs;" + build_output_str += "\n std::vector argument_outputs;" CREATE_OUTPUT_DENSE_TENSOR_TEMPLATE = """ - ir::Type {name}_dense_tensor_type = paddle::dialect::DenseTensorType::get(ir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_{name}.dtype()), dense_{name}.dims(), dense_{name}.layout(), dense_{name}.lod(), dense_{name}.offset()); + pir::Type {name}_dense_tensor_type = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_{name}.dtype()), dense_{name}.dims(), dense_{name}.layout(), dense_{name}.lod(), dense_{name}.offset()); argument_outputs.push_back({name}_dense_tensor_type); """ + + CREATE_OUTPUT_INPLACE_OPTIONAL_DENSE_TENSOR_TEMPLATE = """ + if ({input_name}_.impl() != nullptr) {{ + pir::Type {output_name}_dense_tensor_type = paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_{output_name}.dtype()), dense_{output_name}.dims(), dense_{output_name}.layout(), dense_{output_name}.lod(), dense_{output_name}.offset()); + argument_outputs.push_back({output_name}_dense_tensor_type); + }} else {{ + pir::Type {output_name}_type; + argument_outputs.push_back({output_name}_type); + }} + +""" + CREATE_OUTPUT_VEC_DENSE_TENSOR_TEMPLATE = """ - std::vector {name}_types; + std::vector {name}_types; for (size_t i=0; i < static_cast({output_size}); i++) {{ - {name}_types.push_back(paddle::dialect::DenseTensorType::get(ir::IrContext::Instance(), paddle::dialect::TransToIrDataType(vec_dense_{name}[i].dtype()), vec_dense_{name}[i].dims(), vec_dense_{name}[i].layout(), vec_dense_{name}[i].lod(), vec_dense_{name}[i].offset())); + {name}_types.push_back(paddle::dialect::DenseTensorType::get(pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(vec_dense_{name}[i].dtype()), vec_dense_{name}[i].dims(), vec_dense_{name}[i].layout(), vec_dense_{name}[i].lod(), vec_dense_{name}[i].offset())); }} - ir::Type {name}_vector_type = ir::VectorType::get(ir::IrContext::Instance(), {name}_types); + pir::Type {name}_vector_type = pir::VectorType::get(pir::IrContext::Instance(), {name}_types); argument_outputs.push_back({name}_vector_type); """ for 
idx in range(len(op_output_name_list)): # is a vector - if 'ir::VectorType' in op_output_type_list[idx]: + if 'pir::VectorType' in op_output_type_list[idx]: build_output_str += CREATE_OUTPUT_VEC_DENSE_TENSOR_TEMPLATE.format( name=op_output_name_list[idx], output_size=op_output_size_list[idx], ) # is a Tensor else: - build_output_str += CREATE_OUTPUT_DENSE_TENSOR_TEMPLATE.format( - name=op_output_name_list[idx] + output_name = op_output_name_list[idx] + has_input_inplace = ( + op_inplace_map is not None + and output_name in op_inplace_map.keys() ) + if op_output_optional_list[idx] == 'true' and has_input_inplace: + # is a inplace optional output + build_output_str += ( + CREATE_OUTPUT_INPLACE_OPTIONAL_DENSE_TENSOR_TEMPLATE.format( + input_name=op_inplace_map[output_name], + output_name=output_name, + ) + ) + else: + build_output_str += CREATE_OUTPUT_DENSE_TENSOR_TEMPLATE.format( + name=output_name + ) build_output_str += " argument.AddOutputs(argument_outputs.begin(), argument_outputs.end());\n" @@ -524,6 +597,7 @@ def gen_build_func_str( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_attribute_name_list, op_attribute_type_list, op_attribute_build_arg_type_list, @@ -537,7 +611,9 @@ def gen_build_func_str( op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, muta_attr_is_input=False, attr_args_is_map=False, ): @@ -593,35 +669,59 @@ def gen_build_func_str( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_mutable_attribute_name_list, op_mutable_attribute_type_list, op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, muta_attr_is_input, ) GET_ATTRIBUTES_FROM_MAP_TEMPLATE = """ + PADDLE_ENFORCE( + attributes.find("{attribute_name}") != attributes.end(), + phi::errors::NotFound( + "'{attribute_name}' Attribute is expected for {op_name}. ")); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast<{attr_ir_type}>().data(); """ GET_STR_ATTRIBUTES_FROM_MAP_TEMPLATE = """ - {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().AsString(); + PADDLE_ENFORCE( + attributes.find("{attribute_name}") != attributes.end(), + phi::errors::NotFound( + "'{attribute_name}' Attribute is expected for {op_name}. ")); + {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().AsString(); """ GET_ARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE = """ + PADDLE_ENFORCE( + attributes.find("{attribute_name}") != attributes.end(), + phi::errors::NotFound( + "'{attribute_name}' Attribute is expected for {op_name}. ")); {attr_type} {attribute_name}; - for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ - {attribute_name}.push_back(attributes.at("{attribute_name}").dyn_cast().at(i).dyn_cast<{inner_type}>().{data_name}()); + for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ + {attribute_name}.push_back(attributes.at("{attribute_name}").dyn_cast().at(i).dyn_cast<{inner_type}>().{data_name}()); }} """ GET_INTARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE = """ + PADDLE_ENFORCE( + attributes.find("{attribute_name}") != attributes.end(), + phi::errors::NotFound( + "'{attribute_name}' Attribute is expected for {op_name}. 
")); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().data().GetData(); """ GET_SCALAR_ATTRIBUTE_FROM_MAP_TEMPLATE = """ + PADDLE_ENFORCE( + attributes.find("{attribute_name}") != attributes.end(), + phi::errors::NotFound( + "'{attribute_name}' Attribute is expected for {op_name}. ")); {attr_type} {attribute_name} = attributes.at("{attribute_name}").dyn_cast().data().to<{attr_type}>(); """ get_attributes_str = "" + array_attr_str = "pir::ArrayAttribute" if attr_args_is_map: for idx in range(len(op_attribute_name_list)): attr_type = op_attribute_build_arg_type_list[idx] @@ -629,13 +729,17 @@ def gen_build_func_str( attr_type = attr_type.replace("&", "") # if op_attribute_build_arg_type_list[idx] == "const std::vector&": # attr_type = "std::vector" - if "ir::ArrayAttribute" in op_attribute_type_list[idx]: - inner_type = op_attribute_type_list[idx][19:-1] + + if array_attr_str in op_attribute_type_list[idx]: + inner_type = op_attribute_type_list[idx][ + len(array_attr_str) + 1 : -1 + ] data_name = "data" - if inner_type == "ir::StrAttribute": + if inner_type == "pir::StrAttribute": data_name = "AsString" get_attributes_str += ( GET_ARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE.format( + op_name=op_class_name, attr_type=attr_type, attribute_name=op_attribute_name_list[idx], inner_type=inner_type, @@ -648,6 +752,7 @@ def gen_build_func_str( ): get_attributes_str += ( GET_INTARRAY_ATTRIBUTE_FROM_MAP_TEMPLATE.format( + op_name=op_class_name, attr_type=attr_type, attribute_name=op_attribute_name_list[idx], ) @@ -658,13 +763,15 @@ def gen_build_func_str( ): get_attributes_str += ( GET_SCALAR_ATTRIBUTE_FROM_MAP_TEMPLATE.format( + op_name=op_class_name, attr_type=attr_type, attribute_name=op_attribute_name_list[idx], ) ) - elif "ir::StrAttribute" in op_attribute_type_list[idx]: + elif "pir::StrAttribute" in op_attribute_type_list[idx]: get_attributes_str += ( GET_STR_ATTRIBUTES_FROM_MAP_TEMPLATE.format( + op_name=op_class_name, attr_type=attr_type, attribute_name=op_attribute_name_list[idx], attr_ir_type=op_attribute_type_list[idx], @@ -672,6 +779,7 @@ def gen_build_func_str( ) else: get_attributes_str += GET_ATTRIBUTES_FROM_MAP_TEMPLATE.format( + op_name=op_class_name, attr_type=attr_type, attribute_name=op_attribute_name_list[idx], attr_ir_type=op_attribute_type_list[idx], diff --git a/paddle/fluid/ir/dialect/op_generator/op_gen.py b/paddle/fluid/pir/dialect/op_generator/op_gen.py similarity index 92% rename from paddle/fluid/ir/dialect/op_generator/op_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_gen.py index 8663d23059d45..d52bf901ae17f 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_gen.py @@ -15,6 +15,8 @@ import argparse import logging import os +import pathlib +import sys import yaml from op_build_gen import gen_build_func_str @@ -30,6 +32,12 @@ vjp_interface_implementation_gen_op_list, ) +# import from paddle/fluid/primitive/code_gen/gen.py +sys.path.append( + str(pathlib.Path(__file__).resolve().parents[3] / 'primitive/codegen') +) +import gen as vjp_gen + # ===================================== # String Template for h file code gen # ===================================== @@ -41,22 +49,23 @@ #undef GET_OP_LIST {op_declare} #else -// This file is generated by "paddle/fluid/ir/dialect/op_generator/op_gen.py" +// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" #include -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/operation_utils.h" -#include 
"paddle/ir/core/op_base.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/operation_utils.h" +#include "paddle/pir/core/op_base.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/vjp.h" +#include "paddle/fluid/pir/dialect/operator/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/phi/core/infermeta_utils.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" {input} @@ -72,7 +81,7 @@ """ OP_DECLARE_TEMPLATE = """ -class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{ +class {op_name} : public pir::Op<{op_name}{interfaces}{traits}> {{ public: using Op::Op; static const char *name() {{ return "{dialect_op_name}"; }} @@ -97,15 +106,15 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{ # ===================================== # String Template for cc file code gen # ===================================== -CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/ir/dialect/op_generator/op_gen.py" +CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" #include "{h_file}" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/ir_context.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/binary.h" @@ -117,7 +126,7 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{ #include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/fluid/primitive/rule/vjp/vjp.h" -#include "paddle/ir/core/op_base.h" +#include "paddle/pir/core/op_base.h" {input} @@ -126,13 +135,13 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{ # ===================================== # String Template for pd_op_vjp.cc file code gen # ===================================== -VJP_CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/ir/dialect/op_generator/op_gen.py" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include 
"paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" +VJP_CC_FILE_TEMPLATE = """// This file is generated by "paddle/fluid/pir/dialect/op_generator/op_gen.py" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/primitive/rule/vjp/vjp.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/op_base.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/op_base.h" #include "paddle/phi/common/int_array.h" namespace paddle {{ @@ -166,14 +175,14 @@ class {op_name} : public ir::Op<{op_name}{interfaces}{traits}> {{ """ scalar_type_maps = { - 'int': 'ir::Int32Attribute', - 'int64_t': 'ir::Int64Attribute', - 'float': 'ir::FloatAttribute', - 'dobule': 'ir::DoubleAttribute', - 'bool': 'ir::BoolAttribute', + 'int': 'pir::Int32Attribute', + 'int64_t': 'pir::Int64Attribute', + 'float': 'pir::FloatAttribute', + 'dobule': 'pir::DoubleAttribute', + 'bool': 'pir::BoolAttribute', } -_NO_NEED_GEN_OPS = {'add_n', 'add_n_', 'add_n_with_kernel', 'split_grad'} +PD_MANUAL_OP_LIST = {'add_n', 'add_n_', 'add_n_with_kernel', 'split_grad'} def to_phi_and_fluid_op_name(op_item): @@ -255,33 +264,33 @@ def __init__(self, op_yaml_item, op_compat_item): self.attr_types_map = { 'IntArray': ['paddle::dialect::IntArrayAttribute', 'IntArray'], 'Scalar': ['paddle::dialect::ScalarAttribute', 'Scalar'], - 'Scalar(int)': ['ir::Int32Attribute', 'int'], - 'Scalar(int64_t)': ['ir::Int64Attribute', 'int64_t'], - 'Scalar(float)': ['ir::FloatAttribute', 'float'], - 'Scalar(dobule)': ['ir::DoubleAttribute', 'dobule'], + 'Scalar(int)': ['pir::Int32Attribute', 'int'], + 'Scalar(int64_t)': ['pir::Int64Attribute', 'int64_t'], + 'Scalar(float)': ['pir::FloatAttribute', 'float'], + 'Scalar(dobule)': ['pir::DoubleAttribute', 'dobule'], 'Scalar[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], - 'int': ['ir::Int32Attribute', 'int'], - 'int32_t': ['ir::Int32Attribute', 'int32_t'], - 'int64_t': ['ir::Int64Attribute', 'int64_t'], - 'long': ['ir::LongAttribute', 'long'], - 'size_t': ['ir::Size_tAttribute', 'size_t'], - 'float': ['ir::FloatAttribute', 'float'], + 'int': ['pir::Int32Attribute', 'int'], + 'int32_t': ['pir::Int32Attribute', 'int32_t'], + 'int64_t': ['pir::Int64Attribute', 'int64_t'], + 'long': ['pir::LongAttribute', 'long'], + 'size_t': ['pir::Size_tAttribute', 'size_t'], + 'float': ['pir::FloatAttribute', 'float'], 'float[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], - 'double': ['ir::DoubleAttribute', 'double'], - 'bool': ['ir::BoolAttribute', 'bool'], + 'double': ['pir::DoubleAttribute', 'double'], + 'bool': ['pir::BoolAttribute', 'bool'], 'bool[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], - 'str': ['ir::StrAttribute', 'const std::string&'], + 'str': ['pir::StrAttribute', 'const std::string&'], 'str[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], 'Place': ['paddle::dialect::PlaceAttribute', 'const Place&'], @@ -291,11 +300,11 @@ def __init__(self, op_yaml_item, op_compat_item): ], 'DataType': ['paddle::dialect::DataTypeAttribute', 'DataType'], 'int64_t[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], 'int[]': [ - 'ir::ArrayAttribute', + 'pir::ArrayAttribute', 'const std::vector&', ], } @@ -517,7 +526,7 @@ def parse_input_name_list(self): def parse_input_type_list(self): input_types_map = { 'Tensor': 
'paddle::dialect::DenseTensorType', - 'Tensor[]': 'ir::VectorType', + 'Tensor[]': 'pir::VectorType', } type_list = [] for input_info in self.op_yaml_item['inputs']: @@ -554,7 +563,7 @@ def parse_output_name_list(self): def parse_output_type_list(self): output_type_map = { 'Tensor': 'paddle::dialect::DenseTensorType', - 'Tensor[]': 'ir::VectorType', + 'Tensor[]': 'pir::VectorType', 'SelectedRows': 'paddle::dialect::SelectedRowsType', } type_list = [] @@ -818,6 +827,12 @@ def OpGenerator( ops_declare_list = [] # all op class declare store in this list ops_defined_list = [] # all op class defined store in this list ops_vjp_defined_list = [] # all op vjp static interface defination + + # (4) parse name of ops which have custom vjp rules + custom_vjp_op_name_list = [] + for custom_vjp in vjp_gen.CUSTOM_VJP: + custom_vjp_op_name_list.append(custom_vjp[:-5]) # cut _grad + for key, op_info in op_info_items.items(): # get op inputs info op_input_name_list = op_info.input_name_list @@ -873,6 +888,10 @@ def OpGenerator( op_interfaces += ["paddle::dialect::VjpInterface"] exclusive_interface_str = gen_exclusive_interface_str(op_info) + # if op has custom vjp rule, then append a CustomVjpTrait to it + if op_info.op_phi_name[0] in custom_vjp_op_name_list: + op_traits += ["paddle::dialect::CustomVjpTrait"] + # check op inputs and mutable_attributes grad semantics input_grad_semantics = get_input_grad_semantic(op_info, op_info_items) mutable_attribute_grad_semantics = get_mutable_attribute_grad_semantic( @@ -881,7 +900,7 @@ def OpGenerator( # If op has inplace info, we will generate inplace op and non-inplace op. for op_name in op_info.op_phi_name: - if op_name in _NO_NEED_GEN_OPS: + if op_name in PD_MANUAL_OP_LIST: continue op_class_name = to_pascal_case(op_name) + "Op" op_dialect_name = dialect_name + "." 
+ op_name @@ -927,6 +946,7 @@ def OpGenerator( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_attribute_name_list, op_attribute_type_list, op_attribute_build_arg_type_list, @@ -940,7 +960,9 @@ def OpGenerator( op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, muta_attr_is_input=False, ) if len(op_attribute_name_list) > 0: @@ -951,6 +973,7 @@ def OpGenerator( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_attribute_name_list, op_attribute_type_list, op_attribute_build_arg_type_list, @@ -964,7 +987,9 @@ def OpGenerator( op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, muta_attr_is_input=False, attr_args_is_map=True, ) @@ -982,6 +1007,7 @@ def OpGenerator( op_class_name, op_input_name_list, op_input_type_list, + op_input_optional_list, op_attribute_name_list, op_attribute_type_list, op_attribute_build_arg_type_list, @@ -995,7 +1021,9 @@ def OpGenerator( op_output_name_list, op_output_type_list, op_output_size_list, + op_output_optional_list, op_infer_meta_map, + op_inplace_map, muta_attr_is_input=True, ) @@ -1188,7 +1216,6 @@ def OpGenerator( if dialect_name == "cinn": logging.warning("cinn is currently not support Vjp function") else: - # TODO(chenzhiyang) add vjp gen code if ( op_info.backward_name and op_info.op_phi_name[0] diff --git a/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py similarity index 55% rename from paddle/fluid/ir/dialect/op_generator/op_interface_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_interface_gen.py index 2490335f6c3fb..db763146fb1d3 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_interface_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_interface_gen.py @@ -26,14 +26,20 @@ {input_type} {input_name}(std::make_shared(op_obj.{input_name}()));""" OP_VJP_FORWARD_MULTI_INPUT_TEMPLATE = """ - ir::CombineOp combine_op_obj = - op_obj.{input_name}().GetDefiningOp()->dyn_cast(); + pir::CombineOp combine_op_obj = + op_obj.{input_name}().GetDefiningOp()->dyn_cast(); std::vector {input_name}; for (size_t idx = 0; idx < combine_op_obj.inputs().size(); idx++) {{ {input_name}.emplace_back( std::make_shared(combine_op_obj.inputs()[idx])); }}""" +OP_VJP_FORWARD_OPTIONAL_INPUT_TEMPLATE = """ + paddle::optional {input_name}; + if (op_obj.{input_name}().type().storage()){{ + {input_name} = paddle::make_optional(Tensor(std::make_shared(op_obj.{input_name}()))); + }}""" + OP_VJP_FORWARD_OUTPUT_GRAD_TEMPLATE = """ Tensor {output_grad_name}(std::make_shared(out_grads[{idx1}][{idx2}]));""" @@ -50,6 +56,11 @@ OP_VJP_ATTRIBUTE_DEFAULT_TEMPLATE = """ {attr_type} {attr_name} = {default_value};""" +OP_VJP_ATTRIBUTE_ARRAY_TEMPLATE = """ + {attr_type} {attr_name}; + for (size_t i = 0; i < op->attribute("{attr_name}").dyn_cast().size(); i++) {{ + {attr_name}.push_back(op->attribute("{attr_name}").dyn_cast().at(i).dyn_cast<{inner_type}>().{func}()); + }}""" OP_VJP_CALL_VJP_TEMPLATE = """ std::vector> tensor_res = @@ -57,19 +68,19 @@ {inputs_list}stop_gradients);""" OP_VJP_STOPGRADIENT_TEMPLATE = """ - std::vector> res(tensor_res.size()); + std::vector> res(tensor_res.size()); for (size_t i = 0; i < tensor_res.size(); ++i) { res[i].resize(tensor_res[i].size()); for (size_t j = 0; j < tensor_res[i].size(); ++j) { if(tensor_res[i][j].defined()){ - res[i][j] = 
std::static_pointer_cast(tensor_res[i][j].impl())->getValue().dyn_cast(); + res[i][j] = std::static_pointer_cast(tensor_res[i][j].impl())->value().dyn_cast(); } } }""" OP_VJP_DEFINE_TEMPLATE = """ -std::vector> {op_class_name}::Vjp(ir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients){{ - {op_class_name} op_obj = op->dyn_cast<{op_class_name}>(); +std::vector> {op_class_name}::Vjp(pir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients){{ + {op_class_name} op_obj = op->dyn_cast<{op_class_name}>(); (void)op_obj; VLOG(6) << "Prepare inputs of {op_grad_name}"; {forward_input_output_code} @@ -89,11 +100,7 @@ input_types_map = { 'paddle::dialect::DenseTensorType': 'Tensor', - 'ir::VectorType': 'Tensor[]', -} - -attr_data_map = { - 'ir::StrAttribute': 'AsString', + 'pir::VectorType': 'Tensor[]', } @@ -111,45 +118,53 @@ def gen_op_vjp_str( grad_idx = -1 for idx in range(len(bw_input_list)): build_args_str += bw_input_list[idx] + ", " - if ( - bw_input_list[idx] in op_info.input_name_list - or bw_input_list[idx] in op_info.output_name_list - ): - input_type = input_types_map[op_grad_info.input_type_list[idx]] - if input_type == 'Tensor': - forward_input_output_code += ( - OP_VJP_FORWARD_INPUT_OR_OUTPUT_TEMPLATE.format( - input_type=input_type, - input_name=bw_input_list[idx], - ) - ) - else: - forward_input_output_code += ( - OP_VJP_FORWARD_MULTI_INPUT_TEMPLATE.format( - input_name=bw_input_list[idx], - ) + if op_grad_info.input_optional_list[idx] == 'true': + forward_input_output_code += ( + OP_VJP_FORWARD_OPTIONAL_INPUT_TEMPLATE.format( + input_name=bw_input_list[idx], ) + ) else: - grad_idx += 1 - input_type = input_types_map[op_grad_info.input_type_list[idx]] - if input_type == 'Tensor': - forward_output_grad_code += ( - OP_VJP_FORWARD_OUTPUT_GRAD_TEMPLATE.format( - output_grad_name=bw_input_list[idx], - idx1=grad_idx, - idx2=0, + if ( + bw_input_list[idx] in op_info.input_name_list + or bw_input_list[idx] in op_info.output_name_list + ): + input_type = input_types_map[op_grad_info.input_type_list[idx]] + if input_type == 'Tensor': + forward_input_output_code += ( + OP_VJP_FORWARD_INPUT_OR_OUTPUT_TEMPLATE.format( + input_type=input_type, + input_name=bw_input_list[idx], + ) + ) + else: + forward_input_output_code += ( + OP_VJP_FORWARD_MULTI_INPUT_TEMPLATE.format( + input_name=bw_input_list[idx], + ) ) - ) else: - forward_input_output_code += ( - OP_VJP_FORWARD_OUTPUT_GRAD_LIST_TEMPLATE.format( - output_grad_name=bw_input_list[idx], index=grad_idx + grad_idx += 1 + input_type = input_types_map[op_grad_info.input_type_list[idx]] + if input_type == 'Tensor': + forward_output_grad_code += ( + OP_VJP_FORWARD_OUTPUT_GRAD_TEMPLATE.format( + output_grad_name=bw_input_list[idx], + idx1=grad_idx, + idx2=0, + ) + ) + else: + forward_input_output_code += ( + OP_VJP_FORWARD_OUTPUT_GRAD_LIST_TEMPLATE.format( + output_grad_name=bw_input_list[idx], index=grad_idx + ) ) - ) op_attribute_list = op_grad_info.attribute_name_list attribute_code = '' + build_attr_str = '' + array_attr_str = "pir::ArrayAttribute" for idx in range(len(op_attribute_list)): - build_args_str += op_attribute_list[idx] + ", " if op_attribute_list[idx] in op_info.attribute_name_list: if op_attribute_list[idx] in op_info.mutable_attribute_name_list: attribute_code += ( @@ -158,19 +173,38 @@ def gen_op_vjp_str( input_name=op_attribute_list[idx], ) ) + build_args_str += op_attribute_list[idx] + ", " else: - func = 'data' - if ( - op_grad_info.attribute_type_list[idx] - in 
attr_data_map.keys() - ): - func = attr_data_map[op_grad_info.attribute_type_list[idx]] - attribute_code += OP_VJP_ATTRIBUTE_TEMPLATE.format( - attr_type=op_grad_info.attribute_gen_arg_type_list[idx], - attr_name=op_attribute_list[idx], - attr_parse_type=op_grad_info.attribute_type_list[idx], - func=func, - ) + func = "data" + attr_type = op_grad_info.attribute_gen_arg_type_list[idx] + attr_type = attr_type.replace("const ", "") + attr_type = attr_type.replace("&", "") + if array_attr_str in op_grad_info.attribute_type_list[idx]: + inner_type = op_grad_info.attribute_type_list[idx][ + len(array_attr_str) + 1 : -1 + ] + func = "data" + if inner_type == "pir::StrAttribute": + func = "AsString" + attribute_code += OP_VJP_ATTRIBUTE_ARRAY_TEMPLATE.format( + attr_type=attr_type, + attr_name=op_attribute_list[idx], + inner_type=inner_type, + func=func, + ) + else: + if ( + op_grad_info.attribute_type_list[idx] + == "pir::StrAttribute" + ): + func = "AsString" + attribute_code += OP_VJP_ATTRIBUTE_TEMPLATE.format( + attr_type=attr_type, + attr_name=op_attribute_list[idx], + attr_parse_type=op_grad_info.attribute_type_list[idx], + func=func, + ) + build_attr_str += op_attribute_list[idx] + ", " else: attribute_code += OP_VJP_ATTRIBUTE_DEFAULT_TEMPLATE.format( @@ -178,6 +212,8 @@ def gen_op_vjp_str( attr_name=op_attribute_list[idx], default_value=op_grad_info.attribute_default_value_list[idx], ) + build_attr_str += op_attribute_list[idx] + ", " + build_args_str += build_attr_str op_phi_name_format = op_phi_name if op_phi_name[-1] == '_': op_phi_name_format = op_phi_name[:-1] @@ -218,5 +254,5 @@ def gen_exclusive_interface_str(op_info): " static void InferMeta( phi::InferMetaContext *infer_meta );" ) if op_info.op_phi_name[0] in vjp_interface_declare_gen_op_list: - exclusive_interface_str += "\n static std::vector> Vjp(ir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients);" + exclusive_interface_str += "\n static std::vector> Vjp(pir::Operation* op, const std::vector>& out_grads, const std::vector>& stop_gradients);" return exclusive_interface_str diff --git a/paddle/fluid/ir/dialect/op_generator/op_member_func_gen.py b/paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py similarity index 88% rename from paddle/fluid/ir/dialect/op_generator/op_member_func_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py index 9bc2c75ccf8a9..1cf32a44c5f60 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_member_func_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_member_func_gen.py @@ -14,9 +14,9 @@ # generator op member function -OP_GET_INPUT_TEMPLATE = """ ir::Value {input_name}() {{ return operand_source({input_index}); }} +OP_GET_INPUT_TEMPLATE = """ pir::Value {input_name}() {{ return operand_source({input_index}); }} """ -OP_GET_OUTPUT_TEMPLATE = """ ir::OpResult {output_name}() {{ return result({output_index}); }} +OP_GET_OUTPUT_TEMPLATE = """ pir::OpResult {output_name}() {{ return result({output_index}); }} """ diff --git a/paddle/fluid/ir/dialect/op_generator/op_verify_gen.py b/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py similarity index 91% rename from paddle/fluid/ir/dialect/op_generator/op_verify_gen.py rename to paddle/fluid/pir/dialect/op_generator/op_verify_gen.py index 917728f2c8b17..4dffdb2c7b814 100644 --- a/paddle/fluid/ir/dialect/op_generator/op_verify_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/op_verify_gen.py @@ -43,7 +43,7 @@ PADDLE_ENFORCE((*this)->operand_source({index}).type().isa<{standard}>(), 
phi::errors::PreconditionNotMet("Type validation failed for the {index}th input."));""" INPUT_VECTORTYPE_CHECK_TEMPLATE = """ - if (auto vec_type = (*this)->operand_source({index}).type().dyn_cast()) {{ + if (auto vec_type = (*this)->operand_source({index}).type().dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); ++i) {{ PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); @@ -60,7 +60,7 @@ }}""" INPUT_OPTIONAL_VECTORTYPE_CHECK_TEMPLATE = """ if (auto val = (*this)->operand({index})) {{ - if (auto vec_type = val.type().dyn_cast()) {{ + if (auto vec_type = val.type().dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); i++) {{ PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), phi::errors::PreconditionNotMet("Type validation failed for the {index}th input.")); @@ -75,10 +75,10 @@ PADDLE_ENFORCE(attributes.count("{attribute_name}")>0 && attributes.at("{attribute_name}").isa<{standard}>(), phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not right."));""" ATTRIBUTE_VECTOR_CHECK_TEMPLATE = """ - PADDLE_ENFORCE(attributes.count("{attribute_name}")>0 && attributes.at("{attribute_name}").isa(), + PADDLE_ENFORCE(attributes.count("{attribute_name}")>0 && attributes.at("{attribute_name}").isa(), phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not right.")); - for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ - PADDLE_ENFORCE(attributes.at("{attribute_name}").dyn_cast().at(i).isa<{standard}>(), + for (size_t i = 0; i < attributes.at("{attribute_name}").dyn_cast().size(); i++) {{ + PADDLE_ENFORCE(attributes.at("{attribute_name}").dyn_cast().at(i).isa<{standard}>(), phi::errors::PreconditionNotMet("Type of attribute: {attribute_name} is not right.")); }}""" OUTPUT_TYPE_CHECK_TEMPLATE = """ @@ -86,7 +86,7 @@ phi::errors::PreconditionNotMet("Type validation failed for the {index}th output."));""" OUTPUT_VECTORTYPE_CHECK_TEMPLATE = """ auto output_{index}_type = (*this)->result({index}).type(); - if (auto vec_type = output_{index}_type.dyn_cast()) {{ + if (auto vec_type = output_{index}_type.dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); i++) {{ PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); @@ -103,7 +103,7 @@ }}""" OUTPUT_OPTIONAL_VECTORTYPE_CHECK_TEMPLATE = """ if (auto output_{index}_type = (*this)->result({index}).type()) {{ - if (auto vec_type = output_{index}_type.dyn_cast()) {{ + if (auto vec_type = output_{index}_type.dyn_cast()) {{ for (size_t i = 0; i < vec_type.size(); ++i) {{ PADDLE_ENFORCE(vec_type[i].isa<{standard}>(), phi::errors::PreconditionNotMet("Type validation failed for the {index}th output.")); @@ -128,13 +128,14 @@ def gen_inputs_type_check_str( // Inputs num is 0, not need to check inputs type.""" else: inputs_type_check_str = "" + vector_type_str = "pir::VectorType<" for idx in range(len(op_input_type_list)): input_type = op_input_type_list[idx] is_optional = op_input_optional_list[idx] is_vector = False - if input_type.startswith("ir::VectorType<"): + if input_type.startswith(vector_type_str): is_vector = True - input_type = input_type[15:-1] + input_type = input_type[len(vector_type_str) : -1] check_str = "" if is_optional == "true": if is_vector: @@ -182,11 +183,13 @@ def gen_attributes_type_check_str( else: attributes_check_str = """ auto& attributes = this->attributes();""" + array_attr_str = "pir::ArrayAttribute<" for idx 
in range(len(op_non_mutable_attribute_name_list)): attribute_name = op_non_mutable_attribute_name_list[idx] attribute_type = op_non_mutable_attribute_type_list[idx] - if attribute_type.startswith("ir::ArrayAttribute<"): - attribute_type = attribute_type[19:-1] + + if attribute_type.startswith(array_attr_str): + attribute_type = attribute_type[len(array_attr_str) : -1] attributes_check_str += ATTRIBUTE_VECTOR_CHECK_TEMPLATE.format( attribute_name=attribute_name, standard=attribute_type, @@ -205,13 +208,14 @@ def gen_outputs_type_check_str(op_output_type_list, op_output_optional_list): // Outputs num is 0, not need to check outputs type.""" else: outputs_type_check_str = "" + vector_type_str = "pir::VectorType<" for idx in range(len(op_output_type_list)): output_type = op_output_type_list[idx] is_optional = op_output_optional_list[idx] is_vector = False - if output_type.startswith("ir::VectorType<"): + if output_type.startswith(vector_type_str): is_vector = True - output_type = output_type[15:-1] + output_type = output_type[len(vector_type_str) : -1] check_str = "" if is_optional == "true": if is_vector: diff --git a/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py similarity index 83% rename from paddle/fluid/ir/dialect/op_generator/ops_api_gen.py rename to paddle/fluid/pir/dialect/op_generator/ops_api_gen.py index bde5c7c23a7bc..9f04a9b2fd4b2 100644 --- a/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py @@ -15,13 +15,14 @@ import argparse import os -from api_gen import NAMESPACE_TEMPLATE, PD_MANUAL_OP_LIST, CodeGen +from api_gen import NAMESPACE_TEMPLATE, CodeGen CPP_FILE_TEMPLATE = """ #include #include "paddle/fluid/pybind/static_op_function.h" #include "paddle/fluid/pybind/eager_op_function.h" +#include "paddle/fluid/pybind/manual_static_op_function.h" #include "paddle/phi/core/enforce.h" #include "paddle/fluid/eager/api/utils/global_utils.h" @@ -41,6 +42,9 @@ if (PyModule_AddFunctions(module->ptr(), OpsAPI) < 0) {{ PADDLE_THROW(phi::errors::Fatal("Add C++ api to core.ops failed!")); }} + if (PyModule_AddFunctions(module->ptr(), ManualOpsAPI) < 0) {{ + PADDLE_THROW(phi::errors::Fatal("Add C++ api to core.ops failed!")); + }} }} """ @@ -55,7 +59,7 @@ }} }}""" -NO_DY_FUNCTION_IMPL_TEMPLATE = """ +STATIC_ONLY_FUNCTION_IMPL_TEMPLATE = """ static PyObject *{name}(PyObject *self, PyObject *args, PyObject *kwargs) {{ VLOG(6) << "Call static_api_{name}"; return static_api_{name}(self, args, kwargs); @@ -64,8 +68,9 @@ OPS_API_TEMPLATE = """ {{"{name}", (PyCFunction)(void (*)(void)){name}, METH_VARARGS | METH_KEYWORDS, "C++ interface function for {name}."}},""" -SPECIAL_STATIC_ONLY_APIS = [ - 'fetch', +NEED_GEN_STATIC_ONLY_APIS = ['fetch'] + +NO_NEED_GEN_STATIC_ONLY_APIS = [ 'set_value_with_tensor', 'set_value_with_tensor_', 'fused_bn_add_activation_', @@ -86,6 +91,10 @@ 'c_allreduce_sum', 'c_embedding', 'c_identity', + 'c_reduce_sum', + 'c_allreduce_max', + 'c_allgather', + 'seed', ] @@ -93,14 +102,16 @@ class OpsAPIGen(CodeGen): def __init__(self) -> None: super().__init__() + def _need_skip(self, op_info, op_name): + return ( + super()._need_skip(op_info, op_name) + or op_name.endswith(('_grad', '_grad_', 'xpu')) + or op_name in NO_NEED_GEN_STATIC_ONLY_APIS + ) + def _gen_one_function_impl(self, name): - if ( - name.endswith('_grad') - or name.endswith('_grad_') - or name.endswith('xpu') - or name in SPECIAL_STATIC_ONLY_APIS - ): - return 
NO_DY_FUNCTION_IMPL_TEMPLATE.format(name=name) + if name in NEED_GEN_STATIC_ONLY_APIS: + return STATIC_ONLY_FUNCTION_IMPL_TEMPLATE.format(name=name) else: return FUNCTION_IMPL_TEMPLATE.format(name=name) @@ -117,10 +128,7 @@ def gen_cpp_file( ops_api_str = '' for op_info in op_info_items: for op_name in op_info.op_phi_name: - if ( - op_info.infer_meta_func is None - and op_name not in PD_MANUAL_OP_LIST - ): + if self._need_skip(op_info, op_name): continue function_impl_str += self._gen_one_function_impl(op_name) ops_api_str += self._gen_one_ops_api(op_name) diff --git a/paddle/fluid/ir/dialect/op_generator/python_c_gen.py b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py similarity index 67% rename from paddle/fluid/ir/dialect/op_generator/python_c_gen.py rename to paddle/fluid/pir/dialect/op_generator/python_c_gen.py index a890a8db5d249..62c98bcef9f80 100644 --- a/paddle/fluid/ir/dialect/op_generator/python_c_gen.py +++ b/paddle/fluid/pir/dialect/op_generator/python_c_gen.py @@ -15,13 +15,7 @@ import argparse import re -from api_gen import ( - NAMESPACE_TEMPLATE, - OP_RESULT, - PD_MANUAL_OP_LIST, - VECTOR_TYPE, - CodeGen, -) +from api_gen import NAMESPACE_TEMPLATE, OP_RESULT, VECTOR_TYPE, CodeGen H_FILE_TEMPLATE = """ @@ -46,7 +40,7 @@ CPP_FILE_TEMPLATE = """ #include "paddle/fluid/pybind/static_op_function.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/op_function_common.h" @@ -125,20 +119,14 @@ {attrs_py_obj} // Check for mutable attrs - bool has_mutable_attr = false; - {check_mutable_attrs} - - if (has_mutable_attr){{ - {cast_attrs_with_mutable} - // Call ir static api - auto static_api_out = paddle::dialect::{api_name}({args_with_mutable_attrs}); - return ToPyObject(static_api_out); - }} else {{ - {cast_attrs_without_mutable} - // Call ir static api - auto static_api_out = paddle::dialect::{api_name}({args_without_mutable_attrs}); - return ToPyObject(static_api_out); - }} + {init_attrs} + {cast_attrs} + + // Call ir static api + auto static_api_out = paddle::dialect::{api_name}({args_with_mutable_attrs}); + return ToPyObject(static_api_out); + + }} catch (...) 
{{ ThrowExceptionToPython(std::current_exception()); return nullptr; @@ -146,18 +134,40 @@ }} """ -CHECK_MUTABLE_ATTR_TEMPLATE = """ +INIT_ATTRS_TEMPLATE = """ + {type} {name}; +""" +MUTABLE_ATTR_TEMPLATE = """ if (PyObject_CheckIROpResult({name}_obj)){{ - has_mutable_attr = true; + {mutable_cast_attrs} + }}else{{ + {no_mutable_cast_attrs} + }}""" + +MUTABLE_ATTR_LIST_TEMPLATE = """ + if (PyObject_CheckIRVectorOfOpResult({name}_obj)){{ + {mutable_cast_attrs} + }}else{{ + {no_mutable_cast_attrs} }}""" MUTABLE_ATTR_OBJ_TEMPLATE = """ PyObject *{name}_obj = PyTuple_GET_ITEM(args, {index});""" MUTABLE_ATTR_CAST_TEMPLATE = """ - {type} {name} = {cast_func}({name}_obj, "{api_name}", {index});""" + {type} {name_} = {cast_func}({name}_obj, "{api_name}", {index});""" + +FULL_OP_TEMPLATE = """ + {name} = paddle::dialect::full(std::vector{{1}}, {name}_tmp, phi::DataType::{phi_datatype}, phi::CPUPlace()); +""" +FULL_INT_ARRAY_OP_TEMPLATE = """ + {name} = paddle::dialect::full_int_array({name}_tmp, phi::DataType::{phi_datatype}, phi::CPUPlace()); +""" +BUILTIN_COMBINE_OP_TEMPLATE = """ + {name} = paddle::dialect::builtin_combine({name}_tmp); +""" TYPE_TO_FUNC_MAP = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", @@ -181,6 +191,21 @@ "phi::DataType": "CastPyArg2DataTypeDirectly", } +TYPE_TO_PHI_DATATYPE_MAP = { + "bool": "BOOL", + "int": "INT32", + "long": "INT64", + "int64_t": "INT64", + "float": "FLOAT32", + "double": "FLOAT64", + "std::vector": "BOOL", + "std::vector": "INT32", + "std::vector": "INT64", + "std::vector": "INT64", + "std::vector": "FLOAT32", + "std::vector": "FLOAT64", +} + class PythonCCodeGen(CodeGen): def __init__(self) -> None: @@ -195,10 +220,7 @@ def _gen_h_file(self, op_info_items, namespaces, h_file_path): for op_name in op_info.op_phi_name: # NOTE:When infer_meta_func is None, the Build() function generated in pd_op # is wrong, so temporarily skip the automatic generation of these APIs - if ( - op_info.infer_meta_func is None - and op_name not in PD_MANUAL_OP_LIST - ): + if self._need_skip(op_info, op_name): continue declare_str += self._gen_one_declare(op_name) @@ -252,33 +274,101 @@ def _gen_attrs_py_obj_with_mutable(self, op_info): ) return ret - def _gen_check_mutable_attrs(self, op_info): - name_list = op_info.mutable_attribute_name_list + def _gen_init_mutable_attrs(self, op_info): + mutable_attr_name_list = op_info.mutable_attribute_name_list ret = '' - for name in name_list: - ret += CHECK_MUTABLE_ATTR_TEMPLATE.format(name=name) + for name in mutable_attr_name_list: + ret += INIT_ATTRS_TEMPLATE.format(type=OP_RESULT, name=name) + return ret - def _gen_cast_attrs(self, op_info, op_name, with_mutable): + def _gen_cast_attrs(self, op_info, op_name): input_size = len(op_info.input_name_list) attr_name_list = op_info.attribute_name_list attr_type_list = op_info.attribute_build_arg_type_list mutable_attr_name_list = op_info.mutable_attribute_name_list + mutable_attr_type_list = op_info.mutable_attribute_type_list assert len(attr_name_list) == len(attr_type_list) ret = '' for i, (name, type) in enumerate(zip(attr_name_list, attr_type_list)): type = type.replace('const ', '').replace('&', '') cast_func = TYPE_TO_FUNC_MAP[type] - if with_mutable and name in mutable_attr_name_list: - type = OP_RESULT - cast_func = 'CastPyArg2OpResult' - ret += MUTABLE_ATTR_CAST_TEMPLATE.format( - type=type, - name=name, - cast_func=cast_func, - api_name=op_name, - index=input_size + i, - ) + + if name in mutable_attr_name_list: + phi_dtype = TYPE_TO_PHI_DATATYPE_MAP[type] + if ( + 
mutable_attr_type_list[mutable_attr_name_list.index(name)][ + 0 + ] + == "paddle::dialect::IntArrayAttribute" + ): + mutable_cast_str = MUTABLE_ATTR_CAST_TEMPLATE.format( + type='std::vector', + name_=name + '_tmp', + name=name, + cast_func='CastPyArg2VectorOfOpResult', + api_name=op_name, + index=input_size + i, + ) + mutable_cast_str += BUILTIN_COMBINE_OP_TEMPLATE.format( + name=name + ) + + else: + mutable_cast_str = MUTABLE_ATTR_CAST_TEMPLATE.format( + type='', + name_=name, + name=name, + cast_func='CastPyArg2OpResult', + api_name=op_name, + index=input_size + i, + ) + + no_mutable_cast_str = MUTABLE_ATTR_CAST_TEMPLATE.format( + type=type, + name_=name + '_tmp', + name=name, + cast_func=cast_func, + api_name=op_name, + index=input_size + i, + ) + + if ( + mutable_attr_type_list[mutable_attr_name_list.index(name)][ + 0 + ] + == "paddle::dialect::IntArrayAttribute" + ): + no_mutable_cast_str += FULL_INT_ARRAY_OP_TEMPLATE.format( + name=name, + phi_datatype=phi_dtype, + ) + ret += MUTABLE_ATTR_LIST_TEMPLATE.format( + name=name, + mutable_cast_attrs=mutable_cast_str, + no_mutable_cast_attrs=no_mutable_cast_str, + ) + else: + no_mutable_cast_str += FULL_OP_TEMPLATE.format( + name=name, + phi_datatype=phi_dtype, + ) + ret += MUTABLE_ATTR_TEMPLATE.format( + name=name, + mutable_cast_attrs=mutable_cast_str, + no_mutable_cast_attrs=no_mutable_cast_str, + ) + else: + mutable_cast_str = MUTABLE_ATTR_CAST_TEMPLATE.format( + type=type, + name_=name, + name=name, + cast_func=cast_func, + api_name=op_name, + index=input_size + i, + ) + ret += mutable_cast_str + return ret def _gen_one_impl(self, op_info, op_name): @@ -300,21 +390,13 @@ def _gen_one_impl(self, op_info, op_name): api_name=op_name, inputs=self._gen_inputs(op_info, op_name), attrs_py_obj=self._gen_attrs_py_obj_with_mutable(op_info), - check_mutable_attrs=self._gen_check_mutable_attrs(op_info), - cast_attrs_with_mutable=self._gen_cast_attrs( - op_info, op_name, True - ), + init_attrs=self._gen_init_mutable_attrs(op_info), + cast_attrs=self._gen_cast_attrs(op_info, op_name), args_with_mutable_attrs=', '.join( input_name_list + mutable_attr_name_list + no_mutable_attr_name_list ), - cast_attrs_without_mutable=self._gen_cast_attrs( - op_info, op_name, False - ), - args_without_mutable_attrs=', '.join( - input_name_list + attr_name_list - ), ) else: ret = NO_MUTABLE_ATTR_API_IMPL_TEMPLATE.format( @@ -332,10 +414,7 @@ def _gen_cpp_file(self, op_info_items, namespaces, cpp_file_path): for op_name in op_info.op_phi_name: # NOTE:When infer_meta_func is None, the Build() function generated in pd_op # is wrong, so temporarily skip the automatic generation of these APIs - if ( - op_info.infer_meta_func is None - and op_name not in PD_MANUAL_OP_LIST - ): + if self._need_skip(op_info, op_name): continue impl_str += self._gen_one_impl(op_info, op_name) body = impl_str diff --git a/paddle/fluid/ir/dialect/op_generator/vjp_interface_gen_op_list.py b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py similarity index 74% rename from paddle/fluid/ir/dialect/op_generator/vjp_interface_gen_op_list.py rename to paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py index 9707d6fb5f9a2..2bbce72200d0c 100644 --- a/paddle/fluid/ir/dialect/op_generator/vjp_interface_gen_op_list.py +++ b/paddle/fluid/pir/dialect/op_generator/vjp_interface_gen_op_list.py @@ -22,6 +22,7 @@ # remove this file and support Vjp methods # code gen. 
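# How the two lists below are consumed (see op_gen.py and
# gen_exclusive_interface_str in op_interface_gen.py): an op named in
# vjp_interface_declare_gen_op_list gets a static Vjp(...) declaration on its
# generated op class, while an op that is also named in
# vjp_interface_implementation_gen_op_list additionally gets its Vjp body
# auto-generated into pd_op_vjp.cc. An op whose Vjp body is written by hand
# should therefore appear only in the first list. The relevant check is the
# one already in gen_exclusive_interface_str:
#
#   if op_info.op_phi_name[0] in vjp_interface_declare_gen_op_list:
#       ...  # emit the Vjp declaration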
+
 vjp_interface_declare_gen_op_list = [
     "tanh",
     "mean",
@@ -34,15 +35,29 @@
     "matmul",
     "erf",
     "multiply",
-    "subtract",
     "pow",
     "rsqrt",
+    "subtract",
+    "square",
     "dropout",
+    'exp',
+    'expand',
+    'layer_norm',
+    'reshape',
+    'cast',
+    'softmax',
+    'silu',
+    'elementwise_pow',
+    'fused_softmax_mask_upper_triangle',
+    'slice',
+    'transpose',
+    'slice_double',
 ]
 vjp_interface_implementation_gen_op_list = [
     "tanh",
     "mean",
     "divide",
+    "sum",
     "add",
     "concat",
     "split",
@@ -53,5 +68,18 @@
     "subtract",
     "pow",
     "rsqrt",
+    "square",
     "dropout",
+    'exp',
+    'expand',
+    'layer_norm',
+    'reshape',
+    'cast',
+    'softmax',
+    'silu',
+    'elementwise_pow',
+    'fused_softmax_mask_upper_triangle',
+    'slice',
+    'transpose',
+    'slice_double',
 ]
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/CMakeLists.txt
similarity index 100%
rename from paddle/fluid/ir/dialect/paddle_dialect/CMakeLists.txt
rename to paddle/fluid/pir/dialect/operator/CMakeLists.txt
diff --git a/paddle/fluid/pir/dialect/operator/interface/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/interface/CMakeLists.txt
new file mode 100644
index 0000000000000..a6496585e7790
--- /dev/null
+++ b/paddle/fluid/pir/dialect/operator/interface/CMakeLists.txt
@@ -0,0 +1,7 @@
+# All source files of pd_op_dialect, except for the op source files, which are generated in the compilation directory.
+file(GLOB PD_INTERFACE_SRCS "*.cc")
+
+cc_library(
+  pd_interface
+  SRCS ${PD_INTERFACE_SRCS}
+  DEPS pir_core phi_utils)
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h b/paddle/fluid/pir/dialect/operator/interface/infermeta.h
similarity index 77%
rename from paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h
rename to paddle/fluid/pir/dialect/operator/interface/infermeta.h
index ba3d54c59439b..958d2df369ed9 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h
+++ b/paddle/fluid/pir/dialect/operator/interface/infermeta.h
@@ -13,13 +13,14 @@
 // limitations under the License.

 #pragma once
-#include "paddle/ir/core/op_base.h"
 #include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/pir/core/op_base.h"

 namespace paddle {
 namespace dialect {
-class InferMetaInterface : public ir::OpInterfaceBase<InferMetaInterface> {
+class InferMetaInterface : public pir::OpInterfaceBase<InferMetaInterface> {
  public:
+  /// Defined these methods with the interface.
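The Concept/Model pair that follows is a hand-rolled type-erasure idiom: Concept stores a plain function pointer, and Model<ConcreteOp> binds it to the concrete op's static InferMeta at registration time, so querying the interface needs no virtual dispatch. A minimal self-contained analogue (names here are illustrative, not part of this patch):

  #include <iostream>

  // Concept erases the concrete op type down to a function pointer.
  struct Concept {
    explicit Concept(void (*run)(int)) : run_(run) {}
    void (*run_)(int);
  };

  // Model<ConcreteOp> binds the pointer to ConcreteOp's static method.
  template <typename ConcreteOp>
  struct Model : Concept {
    Model() : Concept(&ConcreteOp::Run) {}
  };

  struct MyOp {
    static void Run(int x) { std::cout << "MyOp::Run(" << x << ")\n"; }
  };

  int main() {
    Model<MyOp> model;
    Concept *interface = &model;
    interface->run_(42);  // dispatches to MyOp::Run without a vtable
    return 0;
  }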
  struct Concept {
    explicit Concept(void (*infer_meta)(phi::InferMetaContext *))
        : infer_meta_(infer_meta) {}
@@ -28,15 +29,16 @@ class InferMetaInterface : public ir::OpInterfaceBase<InferMetaInterface> {

  template <typename ConcreteOp>
  struct Model : public Concept {
-    static void InferMeta(phi::InferMetaContext *infer_meta) {
+    static inline void InferMeta(phi::InferMetaContext *infer_meta) {
      return ConcreteOp::InferMeta(infer_meta);
    }
    Model() : Concept(InferMeta) {}
  };

-  InferMetaInterface(ir::Operation *op, Concept *impl)
-      : ir::OpInterfaceBase<InferMetaInterface>(op), impl_(impl) {}
+  /// Constructor
+  InferMetaInterface(pir::Operation *op, Concept *impl)
+      : pir::OpInterfaceBase<InferMetaInterface>(op), impl_(impl) {}

  void InferMeta(phi::InferMetaContext *infer_meta) {
    impl_->infer_meta_(infer_meta);
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/interface/interface.cc b/paddle/fluid/pir/dialect/operator/interface/interface.cc
similarity index 79%
rename from paddle/fluid/ir/dialect/paddle_dialect/interface/interface.cc
rename to paddle/fluid/pir/dialect/operator/interface/interface.cc
index 12b14de308640..92b3bf0ba2168 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/interface/interface.cc
+++ b/paddle/fluid/pir/dialect/operator/interface/interface.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h"
+#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h"
+#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h"
+#include "paddle/fluid/pir/dialect/operator/interface/vjp.h"

 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::InferMetaInterface)
 IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OpYamlInfoInterface)
diff --git a/paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
similarity index 83%
rename from paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h
rename to paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
index 7663fb2029a43..33011f5613eb5 100644
--- a/paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h
+++ b/paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h
@@ -14,8 +14,8 @@

 #pragma once

-#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h"
-#include "paddle/ir/core/op_base.h"
+#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h"
+#include "paddle/pir/core/op_base.h"

 using OpInfoTuple = std::tuple<std::vector<paddle::dialect::OpInputInfo>,
                                std::vector<paddle::dialect::OpAttributeInfo>,
@@ -25,7 +25,7 @@ using OpInfoTuple = std::tuple<std::vector<paddle::dialect::OpInputInfo>,
 namespace paddle {
 namespace dialect {

-class OpYamlInfoInterface : public ir::OpInterfaceBase<OpYamlInfoInterface> {
+class OpYamlInfoInterface : public pir::OpInterfaceBase<OpYamlInfoInterface> {
  public:
  struct Concept {
    explicit Concept(OpInfoTuple (*get_op_info)())
@@ -40,8 +40,8 @@ class OpYamlInfoInterface : public ir::OpInterfaceBase<OpYamlInfoInterface> {
    Model() : Concept(GetOpInfo) {}
  };

-  OpYamlInfoInterface(ir::Operation *op, Concept *impl)
-      : ir::OpInterfaceBase<OpYamlInfoInterface>(op), impl_(impl) {}
+  OpYamlInfoInterface(pir::Operation *op, Concept *impl)
+      : pir::OpInterfaceBase<OpYamlInfoInterface>(op), impl_(impl) {}

  OpInfoTuple GetOpInfo() { return impl_->get_op_info_(); }

diff --git a/paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h b/paddle/fluid/pir/dialect/operator/interface/vjp.h
similarity index 62%
rename from paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h
rename to
paddle/fluid/pir/dialect/operator/interface/vjp.h index a373cd0bacca4..56c814db89088 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h +++ b/paddle/fluid/pir/dialect/operator/interface/vjp.h @@ -13,29 +13,29 @@ // limitations under the License. #pragma once -#include "paddle/ir/core/op_base.h" +#include "paddle/pir/core/op_base.h" namespace paddle { namespace dialect { -class VjpInterface : public ir::OpInterfaceBase { +class VjpInterface : public pir::OpInterfaceBase { public: struct Concept { - explicit Concept(std::vector> (*vjp)( - ir::Operation* op, - const std::vector>& out_grads, + explicit Concept(std::vector> (*vjp)( + pir::Operation* op, + const std::vector>& out_grads, const std::vector>& stop_gradients)) : vjp_(vjp) {} - std::vector> (*vjp_)( - ir::Operation* op, - const std::vector>& out_grads, + std::vector> (*vjp_)( + pir::Operation* op, + const std::vector>& out_grads, const std::vector>& stop_gradients); }; template struct Model : public Concept { - static std::vector> Vjp( - ir::Operation* op, - const std::vector>& out_grads, + static std::vector> Vjp( + pir::Operation* op, + const std::vector>& out_grads, const std::vector>& stop_gradients) { return ConcreteOp::Vjp(op, out_grads, stop_gradients); } @@ -43,12 +43,12 @@ class VjpInterface : public ir::OpInterfaceBase { Model() : Concept(Vjp) {} }; - VjpInterface(ir::Operation* op, Concept* impl) - : ir::OpInterfaceBase(op), impl_(impl) {} + VjpInterface(pir::Operation* op, Concept* impl) + : pir::OpInterfaceBase(op), impl_(impl) {} - std::vector> Vjp( - ir::Operation* op, - const std::vector>& out_grads, + std::vector> Vjp( + pir::Operation* op, + const std::vector>& out_grads, const std::vector>& stop_gradients) { return impl_->vjp_(op, out_grads, stop_gradients); } diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/.gitignore b/paddle/fluid/pir/dialect/operator/ir/.gitignore similarity index 100% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/.gitignore rename to paddle/fluid/pir/dialect/operator/ir/.gitignore diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt similarity index 83% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt rename to paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt index 64e3e982133be..71df1b6811bf7 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/ir/CMakeLists.txt @@ -1,12 +1,12 @@ set(PD_DIALECT_BINARY_DIR - "${PADDLE_BINARY_DIR}/paddle/fluid/ir/dialect/paddle_dialect/ir") + "${PADDLE_BINARY_DIR}/paddle/fluid/pir/dialect/operator/ir") -# Generate pd_dialect files defining op using op_gen_file +# Generate pd_op_dialect files defining op using op_gen_file set(op_gen_parsed_yaml_file ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parse_op.py) set(op_gen_file - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/op_gen.py) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/op_gen.py) set(op_compat_yaml_file ${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml) set(op_forward_yaml_file1 ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops/ops.parsed.yaml @@ -28,23 +28,22 @@ set(fused_op_backward_yaml_file ) set(pd_op_forward_yaml_file - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops.yaml) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/ops.yaml) set(pd_op_backward_yaml_file - 
${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops_backward.yaml -) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml) set(parsed_op_dir - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/paddle_dialect/ir/generated) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/operator/ir/generated) -set(op_yaml_file3 ${parsed_op_dir}/pd_ops.parsed.yaml) -set(op_yaml_file4 ${parsed_op_dir}/pd_ops_backward.parsed.yaml) +set(op_yaml_file3 ${parsed_op_dir}/ops.parsed.yaml) +set(op_yaml_file4 ${parsed_op_dir}/ops_backward.parsed.yaml) set(op_yaml_files ${op_forward_yaml_file1},${op_forward_yaml_file2},${op_backward_yaml_file1},${op_backward_yaml_file2},${fused_op_forward_yaml_file},${fused_op_backward_yaml_file},${op_yaml_file3},${op_yaml_file4} ) set(op_namespace paddle,dialect) -set(dialect_name pd) +set(dialect_name pd_op) set(op_header_file ${PD_DIALECT_BINARY_DIR}/pd_op.h) set(op_source_file ${PD_DIALECT_BINARY_DIR}/pd_op.cc) set(op_header_file_tmp ${op_header_file}.tmp) @@ -96,7 +95,7 @@ set(api_gen_yaml_files ${op_forward_yaml_file1},${op_forward_yaml_file2},${op_backward_yaml_file1},${op_backward_yaml_file2},${op_yaml_file3},${op_yaml_file4} ) set(api_gen_file - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/api_gen.py) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/api_gen.py) set(api_header_file ${PD_DIALECT_BINARY_DIR}/pd_api.h) set(api_source_file ${PD_DIALECT_BINARY_DIR}/pd_api.cc) set(api_header_file_tmp ${api_header_file}.tmp) @@ -125,7 +124,7 @@ add_custom_command( VERBATIM) set(python_c_gen_file - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/python_c_gen.py) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/python_c_gen.py) set(python_c_header_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/static_op_function.h) set(python_c_source_file @@ -160,7 +159,7 @@ add_custom_target(static_op_function_gen ALL DEPENDS ${python_c_header_file} ${python_c_source_file}) set(ops_api_gen_file - ${PADDLE_SOURCE_DIR}/paddle/fluid/ir/dialect/op_generator/ops_api_gen.py) + ${PADDLE_SOURCE_DIR}/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py) set(ops_api_source_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/ops_api.cc) set(ops_api_source_file_tmp ${ops_api_source_file}.tmp) @@ -186,26 +185,26 @@ add_custom_command( add_custom_target(ops_api_gen ALL DEPENDS ${ops_api_source_file}) cc_library( - pd_dialect_core - SRCS pd_attribute.cc pd_type.cc pd_meta_tensor.cc + pd_op_dialect_core + SRCS op_attribute.cc op_type.cc meta_tensor.cc DEPS phi pd_interface pd_trait type_info) cc_library( - pd_dialect_op - SRCS ${op_source_file} pd_manual_op.cc - DEPS pd_dialect_core) + pd_op_dialect_op + SRCS ${op_source_file} manual_op.cc + DEPS pd_op_dialect_core) cc_library( api_builder SRCS api_builder.cc - DEPS ir_core) + DEPS pir_core) cc_library( - pd_dialect_api - SRCS ${api_source_file} pd_manual_api.cc - DEPS api_builder pd_dialect_op) + pd_op_dialect_api + SRCS ${api_source_file} manual_api.cc + DEPS api_builder pd_op_dialect_op) -target_include_directories(pd_dialect_api PRIVATE ${PD_DIALECT_BINARY_DIR}) +target_include_directories(pd_op_dialect_api PRIVATE ${PD_DIALECT_BINARY_DIR}) cc_library( - pd_dialect - SRCS pd_dialect.cc pd_manual_op_vjp.cc ${op_vjp_source_file} - DEPS pd_dialect_api param_to_variable primitive_vjp_experimental - pd_dialect_utils op_yaml_info_parser) + pd_op_dialect + SRCS op_dialect.cc manual_op_vjp.cc ${op_vjp_source_file} + DEPS pd_op_dialect_api param_to_variable primitive_vjp_experimental + 
pd_op_dialect_utils op_yaml_info_parser) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.cc b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc similarity index 80% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.cc rename to paddle/fluid/pir/dialect/operator/ir/api_builder.cc index 0ded4ee1a5de8..893c664b78b08 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.cc +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.cc @@ -12,22 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h" -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/ir_context.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/ir_context.h" namespace paddle { namespace dialect { APIBuilder::APIBuilder() : builder_(nullptr) { - ctx_ = ir::IrContext::Instance(); + ctx_ = pir::IrContext::Instance(); } -void APIBuilder::SetProgram(ir::Program* program) { - builder_ = std::make_shared(ctx_, program->block()); +void APIBuilder::SetProgram(pir::Program* program) { + builder_ = std::make_shared(ctx_, program->block()); } -void APIBuilder::SetInsertionPoint(ir::Operation* op) { +void APIBuilder::SetInsertionPoint(pir::Operation* op) { IR_ENFORCE(builder_ != nullptr, "builder doesn't hold program, please call SetProgram for " "initialization."); diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h b/paddle/fluid/pir/dialect/operator/ir/api_builder.h similarity index 78% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h rename to paddle/fluid/pir/dialect/operator/ir/api_builder.h index 029c79c2110c0..a06f529d2c5be 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h +++ b/paddle/fluid/pir/dialect/operator/ir/api_builder.h @@ -15,9 +15,9 @@ #pragma once #include -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/macros.h" -#include "paddle/ir/core/program.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/macros.h" +#include "paddle/pir/core/program.h" namespace paddle { namespace dialect { @@ -30,25 +30,25 @@ class APIBuilder { static APIBuilder api_builder; return api_builder; } - void SetProgram(ir::Program* program); + void SetProgram(pir::Program* program); /// Set the insertion point to the specified operation, which will cause /// subsequent insertions to go right before it. 
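  /// For illustration (not part of this header): a typical call sequence,
  /// assuming an existing pir::Program `program` and an operation `some_op`:
  ///   APIBuilder::Instance().SetProgram(&program);
  ///   APIBuilder::Instance().SetInsertionPoint(some_op);
  ///   // subsequent dialect API calls now insert right before `some_op`;
  ///   // ResetInsertionPointToStart/End move the point to the block ends.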
- void SetInsertionPoint(ir::Operation* op); + void SetInsertionPoint(pir::Operation* op); void ResetInsertionPointToStart(); void ResetInsertionPointToEnd(); - std::shared_ptr GetBuilder() { return builder_; } + std::shared_ptr GetBuilder() { return builder_; } private: APIBuilder(); DISABLE_COPY_AND_ASSIGN(APIBuilder); - ir::IrContext* ctx_; - std::shared_ptr builder_; + pir::IrContext* ctx_; + std::shared_ptr builder_; }; } // namespace dialect diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute_storage.h b/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h similarity index 84% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute_storage.h rename to paddle/fluid/pir/dialect/operator/ir/attribute_storage.h index 1877e5043fc65..68f066b009329 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute_storage.h +++ b/paddle/fluid/pir/dialect/operator/ir/attribute_storage.h @@ -14,17 +14,17 @@ #pragma once -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/attribute_base.h" -#include "paddle/ir/core/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/attribute_base.h" +#include "paddle/pir/core/utils.h" namespace paddle { namespace dialect { -struct IntArrayAttributeStorage : public ir::AttributeStorage { +struct IntArrayAttributeStorage : public pir::AttributeStorage { using ParamKey = phi::IntArray; explicit IntArrayAttributeStorage(const ParamKey &key) { data_ = key; } @@ -36,9 +36,9 @@ struct IntArrayAttributeStorage : public ir::AttributeStorage { static std::size_t HashValue(const ParamKey &key) { size_t hash_value = 0; hash_value = - ir::hash_combine(hash_value, std::hash()(key.FromTensor())); + pir::hash_combine(hash_value, std::hash()(key.FromTensor())); for (auto value : key.GetData()) { - hash_value = ir::hash_combine(hash_value, std::hash()(value)); + hash_value = pir::hash_combine(hash_value, std::hash()(value)); } return hash_value; } @@ -54,7 +54,7 @@ struct IntArrayAttributeStorage : public ir::AttributeStorage { phi::IntArray data_; }; -struct DataTypeAttributeStorage : public ir::AttributeStorage { +struct DataTypeAttributeStorage : public pir::AttributeStorage { using ParamKey = phi::DataType; explicit DataTypeAttributeStorage(const ParamKey &key) { data_ = key; } @@ -75,7 +75,7 @@ struct DataTypeAttributeStorage : public ir::AttributeStorage { phi::DataType data_; }; -struct PlaceAttributeStorage : public ir::AttributeStorage { +struct PlaceAttributeStorage : public pir::AttributeStorage { using ParamKey = phi::Place; explicit PlaceAttributeStorage(const ParamKey &key) { data_ = key; } @@ -94,7 +94,7 @@ struct PlaceAttributeStorage : public ir::AttributeStorage { phi::Place data_; }; -struct DataLayoutAttributeStorage : public ir::AttributeStorage { +struct DataLayoutAttributeStorage : public pir::AttributeStorage { using ParamKey = phi::DataLayout; explicit DataLayoutAttributeStorage(const ParamKey &key) { data_ = key; } diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.cc b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc new file mode 100644 index 0000000000000..05bd226bacba4 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/pir/dialect/operator/ir/manual_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/pir/core/builtin_op.h" + +namespace paddle { +namespace dialect { + +pir::OpResult builtin_combine(std::vector x) { + auto combine_op = + APIBuilder::Instance().GetBuilder()->Build(x); + return combine_op.out(); +} + +pir::OpResult get_parameter(const std::string& name, + phi::DataType dtype, + const std::vector& shape) { + phi::LoD lod; + size_t offset{0}; + pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), + TransToIrDataType(dtype), + phi::DDim(shape.data(), shape.size()), + phi::DataLayout::UNDEFINED, + lod, + offset); + pir::GetParameterOp get_parameter_op = + APIBuilder::Instance().GetBuilder()->Build( + name, out_dense_tensor_type); + return get_parameter_op.result(0); +} + +void set_parameter(pir::OpResult parameter, const std::string& name) { + APIBuilder::Instance().GetBuilder()->Build(parameter, + name); +} + +pir::OpResult embedding_grad(pir::OpResult x, + pir::OpResult weight, + pir::OpResult out_grad, + int64_t padding_idx, + bool sparse) { + if (weight.type().isa()) { + if (sparse) { + return paddle::dialect::embedding_grad_sparse( + x, weight, out_grad, padding_idx, sparse); + } else { + return paddle::dialect::embedding_grad_dense( + x, weight, out_grad, padding_idx, sparse); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Now we do not support sparse weight embedding_grad.")); + } +} + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_api.h new file mode 100644 index 0000000000000..8c737a52b3aa7 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/manual_api.h @@ -0,0 +1,40 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
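As a usage sketch for the manual APIs defined in manual_api.cc above — hand-written, assuming the APIBuilder singleton from api_builder.h and an OperatorDialect header at the path used elsewhere in this patch; not itself part of the patch:

  #include "paddle/fluid/pir/dialect/operator/ir/api_builder.h"
  #include "paddle/fluid/pir/dialect/operator/ir/manual_api.h"
  #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h"
  #include "paddle/pir/core/program.h"

  void BuildParameterCopy() {
    pir::IrContext *ctx = pir::IrContext::Instance();
    ctx->GetOrRegisterDialect<paddle::dialect::OperatorDialect>();
    pir::Program program(ctx);
    paddle::dialect::APIBuilder::Instance().SetProgram(&program);
    // Read a parameter as a dense-tensor value, then store it under a new name.
    pir::OpResult w = paddle::dialect::get_parameter(
        "fc_w", phi::DataType::FLOAT32, {64, 64});
    paddle::dialect::set_parameter(w, "fc_w_copy");
  }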
+ +#pragma once + +#include + +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/pir/core/op_result.h" + +namespace paddle { +namespace dialect { +pir::OpResult builtin_combine(std::vector x); + +pir::OpResult get_parameter(const std::string& name, + phi::DataType dtype, + const std::vector& shape); + +void set_parameter(pir::OpResult parameter, const std::string& name); + +pir::OpResult embedding_grad(pir::OpResult x, + pir::OpResult weight, + pir::OpResult out_grad, + int64_t padding_idx = -1, + bool sparse = false); + +} // namespace dialect +} // namespace paddle diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc similarity index 82% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc rename to paddle/fluid/pir/dialect/operator/ir/manual_op.cc index 058a08a384d2d..3ee3bec97cd89 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.cc +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.cc @@ -12,20 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_op.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/ir_context.h" +#include "paddle/fluid/pir/dialect/operator/ir/manual_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/ir_context.h" namespace paddle { namespace dialect { @@ -33,7 +34,7 @@ namespace dialect { OpInfoTuple AddNOp::GetOpInfo() { std::vector inputs = { OpInputInfo("inputs", - "ir::VectorType", + "pir::VectorType", false, false, false, @@ -57,7 +58,8 @@ void AddNOp::Verify() { 1u, phi::errors::PreconditionNotMet( "The size %d of inputs must be equal to 1.", input_size)); - if (auto vec_type = (*this)->operand(0).type().dyn_cast()) { + if (auto vec_type = + (*this)->operand(0).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { PADDLE_ENFORCE(vec_type[i].isa() || vec_type[i].isa(), @@ -96,17 +98,17 @@ void AddNOp::Verify() { VLOG(4) << "End Verifying for: AddNOp."; } -void AddNOp::Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult inputs) { +void AddNOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult inputs) { VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {inputs}; + std::vector argument_inputs = {inputs}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; VLOG(4) << "Builder construction outputs"; - ir::VectorType x = 
inputs.type().dyn_cast(); + pir::VectorType x = inputs.type().dyn_cast(); (void)x; std::vector vec_dense_x; @@ -137,9 +139,9 @@ void AddNOp::Build(ir::Builder &builder, // NOLINT phi::AddNInferMeta(meta_x, &meta_out); - std::vector argument_outputs; - ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), TransToIrDataType(dense_out.dtype()), dense_out.dims(), dense_out.layout(), @@ -158,7 +160,7 @@ OpInfoTuple AddN_Op::GetOpInfo() { std::vector inputs = { paddle::dialect::OpInputInfo( "inputs", - "ir::VectorType", + "pir::VectorType", false, false, false, @@ -172,17 +174,17 @@ OpInfoTuple AddN_Op::GetOpInfo() { return std::make_tuple(inputs, attributes, outputs, run_time_info, "add_n_"); } -void AddN_Op::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult inputs_) { +void AddN_Op::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult inputs_) { VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {inputs_}; + std::vector argument_inputs = {inputs_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; VLOG(4) << "Builder construction outputs"; - ir::VectorType inputs = inputs_.type().dyn_cast(); + pir::VectorType inputs = inputs_.type().dyn_cast(); std::vector vec_dense_inputs; for (size_t i = 0; i < static_cast(inputs.size()); i++) { vec_dense_inputs.push_back(phi::DenseTensor( @@ -213,9 +215,9 @@ void AddN_Op::Build(ir::Builder &builder, phi::AddNInferMeta(meta_inputs, &meta_out); - std::vector argument_outputs; - ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_out.dtype()), dense_out.dims(), dense_out.layout(), @@ -236,7 +238,7 @@ void AddN_Op::Verify() { phi::errors::PreconditionNotMet( "The size %d of inputs must be equal to 1.", input_size)); if (auto vec_type = - (*this)->operand_source(0).type().dyn_cast()) { + (*this)->operand_source(0).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { PADDLE_ENFORCE(vec_type[i].isa() || vec_type[i].isa(), @@ -285,7 +287,7 @@ OpInfoTuple AddNWithKernelOp::GetOpInfo() { std::vector inputs = { paddle::dialect::OpInputInfo( "inputs", - "ir::VectorType", + "pir::VectorType", false, false, false, @@ -300,17 +302,17 @@ OpInfoTuple AddNWithKernelOp::GetOpInfo() { inputs, attributes, outputs, run_time_info, "add_n_with_kernel"); } -void AddNWithKernelOp::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult inputs_) { +void AddNWithKernelOp::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult inputs_) { VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {inputs_}; + std::vector argument_inputs = {inputs_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; VLOG(4) << "Builder construction outputs"; - ir::VectorType inputs = inputs_.type().dyn_cast(); + pir::VectorType inputs = inputs_.type().dyn_cast(); std::vector vec_dense_inputs; for (size_t i = 0; i < static_cast(inputs.size()); i++) { vec_dense_inputs.push_back(phi::DenseTensor( @@ -341,9 +343,9 @@ void 
AddNWithKernelOp::Build(ir::Builder &builder, phi::AddNInferMeta(meta_inputs, &meta_out); - std::vector argument_outputs; - ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_out.dtype()), dense_out.dims(), dense_out.layout(), @@ -365,7 +367,7 @@ void AddNWithKernelOp::Verify() { phi::errors::PreconditionNotMet( "The size %d of inputs must be equal to 1.", input_size)); if (auto vec_type = - (*this)->operand_source(0).type().dyn_cast()) { + (*this)->operand_source(0).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { PADDLE_ENFORCE(vec_type[i].isa() || vec_type[i].isa(), @@ -426,9 +428,9 @@ OpInfoTuple FusedGemmEpilogueOp::GetOpInfo() { false, false)}; std::vector attributes = { - paddle::dialect::OpAttributeInfo("trans_x", "ir::BoolAttribute", ""), - paddle::dialect::OpAttributeInfo("trans_y", "ir::BoolAttribute", ""), - paddle::dialect::OpAttributeInfo("activation", "ir::StrAttribute", "")}; + paddle::dialect::OpAttributeInfo("trans_x", "pir::BoolAttribute", ""), + paddle::dialect::OpAttributeInfo("trans_y", "pir::BoolAttribute", ""), + paddle::dialect::OpAttributeInfo("activation", "pir::StrAttribute", "")}; std::vector outputs = { paddle::dialect::OpOutputInfo( "out", "paddle::dialect::DenseTensorType", false, false), @@ -448,32 +450,44 @@ OpInfoTuple FusedGemmEpilogueOp::GetOpInfo() { inputs, attributes, outputs, run_time_info, "fused_gemm_epilogue"); } -void FusedGemmEpilogueOp::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult x_, - ir::OpResult y_, - ir::OpResult bias_, - ir::AttributeMap attributes) { - bool trans_x = attributes.at("trans_x").dyn_cast().data(); - - bool trans_y = attributes.at("trans_y").dyn_cast().data(); - +void FusedGemmEpilogueOp::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult x_, + pir::OpResult y_, + pir::OpResult bias_, + pir::AttributeMap attributes) { + PADDLE_ENFORCE( + attributes.find("trans_x") != attributes.end(), + phi::errors::NotFound( + "'trans_x' Attribute is expected for FusedGemmEpilogueOp")); + bool trans_x = attributes.at("trans_x").dyn_cast().data(); + + PADDLE_ENFORCE( + attributes.find("trans_y") != attributes.end(), + phi::errors::NotFound( + "'trans_y' Attribute is expected for FusedGemmEpilogueOp")); + bool trans_y = attributes.at("trans_y").dyn_cast().data(); + + PADDLE_ENFORCE( + attributes.find("activation") != attributes.end(), + phi::errors::NotFound( + "'activation' Attribute is expected for FusedGemmEpilogueOp")); std::string activation = - attributes.at("activation").dyn_cast().AsString(); + attributes.at("activation").dyn_cast().AsString(); VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {x_, y_, bias_}; + std::vector argument_inputs = {x_, y_, bias_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; - ir::Attribute attr_trans_x = - ir::BoolAttribute::get(ir::IrContext::Instance(), trans_x); + pir::Attribute attr_trans_x = + pir::BoolAttribute::get(pir::IrContext::Instance(), trans_x); argument.AddAttribute("trans_x", attr_trans_x); - ir::Attribute attr_trans_y = - ir::BoolAttribute::get(ir::IrContext::Instance(), trans_y); + pir::Attribute attr_trans_y = + pir::BoolAttribute::get(pir::IrContext::Instance(), trans_y); 
argument.AddAttribute("trans_y", attr_trans_y); - ir::Attribute attr_activation = - ir::StrAttribute::get(ir::IrContext::Instance(), activation); + pir::Attribute attr_activation = + pir::StrAttribute::get(pir::IrContext::Instance(), activation); argument.AddAttribute("activation", attr_activation); VLOG(4) << "Builder construction outputs"; @@ -540,9 +554,9 @@ void FusedGemmEpilogueOp::Build(ir::Builder &builder, &meta_out, activation == "none" ? nullptr : &meta_reserve_space); - std::vector argument_outputs; - ir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type out_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_out.dtype()), dense_out.dims(), dense_out.layout(), @@ -550,11 +564,11 @@ void FusedGemmEpilogueOp::Build(ir::Builder &builder, dense_out.offset()); argument_outputs.push_back(out_dense_tensor_type); - ir::Type reserve_space_dense_tensor_type = + pir::Type reserve_space_dense_tensor_type = activation == "none" - ? ir::Type() + ? pir::Type() : paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_reserve_space.dtype()), dense_reserve_space.dims(), dense_reserve_space.layout(), @@ -599,15 +613,15 @@ void FusedGemmEpilogueOp::Verify() { { auto &attributes = this->attributes(); PADDLE_ENFORCE(attributes.count("trans_x") > 0 && - attributes.at("trans_x").isa(), + attributes.at("trans_x").isa(), phi::errors::PreconditionNotMet( "Type of attribute: trans_x is not right.")); PADDLE_ENFORCE(attributes.count("trans_y") > 0 && - attributes.at("trans_y").isa(), + attributes.at("trans_y").isa(), phi::errors::PreconditionNotMet( "Type of attribute: trans_y is not right.")); PADDLE_ENFORCE(attributes.count("activation") > 0 && - attributes.at("activation").isa(), + attributes.at("activation").isa(), phi::errors::PreconditionNotMet( "Type of attribute: activation is not right.")); } @@ -659,10 +673,10 @@ OpInfoTuple FusedGemmEpilogueGradOp::GetOpInfo() { false, false)}; std::vector attributes = { - paddle::dialect::OpAttributeInfo("trans_x", "ir::BoolAttribute", ""), - paddle::dialect::OpAttributeInfo("trans_y", "ir::BoolAttribute", ""), + paddle::dialect::OpAttributeInfo("trans_x", "pir::BoolAttribute", ""), + paddle::dialect::OpAttributeInfo("trans_y", "pir::BoolAttribute", ""), paddle::dialect::OpAttributeInfo( - "activation_grad", "ir::StrAttribute", "")}; + "activation_grad", "pir::StrAttribute", "")}; std::vector outputs = { paddle::dialect::OpOutputInfo( "x_grad", "paddle::dialect::DenseTensorType", false, false), @@ -689,34 +703,46 @@ OpInfoTuple FusedGemmEpilogueGradOp::GetOpInfo() { inputs, attributes, outputs, run_time_info, "fused_gemm_epilogue_grad"); } -void FusedGemmEpilogueGradOp::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult x_, - ir::OpResult y_, - ir::OpResult reserve_space_, - ir::OpResult out_grad_, - ir::AttributeMap attributes) { - bool trans_x = attributes.at("trans_x").dyn_cast().data(); - - bool trans_y = attributes.at("trans_y").dyn_cast().data(); - +void FusedGemmEpilogueGradOp::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult x_, + pir::OpResult y_, + pir::OpResult reserve_space_, + pir::OpResult out_grad_, + pir::AttributeMap attributes) { + PADDLE_ENFORCE( + attributes.find("trans_x") != attributes.end(), + phi::errors::NotFound( + "'trans_x' 
Attribute is expected for FusedGemmEpilogueGradOp")); + bool trans_x = attributes.at("trans_x").dyn_cast().data(); + + PADDLE_ENFORCE( + attributes.find("trans_y") != attributes.end(), + phi::errors::NotFound( + "'trans_y' Attribute is expected for FusedGemmEpilogueGradOp")); + bool trans_y = attributes.at("trans_y").dyn_cast().data(); + + PADDLE_ENFORCE( + attributes.find("activation_grad") != attributes.end(), + phi::errors::NotFound("'activation_grad' Attribute is expected for" + "FusedGemmEpilogueGradOp")); std::string activation_grad = - attributes.at("activation_grad").dyn_cast().AsString(); + attributes.at("activation_grad").dyn_cast().AsString(); VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = { + std::vector argument_inputs = { x_, y_, reserve_space_, out_grad_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; - ir::Attribute attr_trans_x = - ir::BoolAttribute::get(ir::IrContext::Instance(), trans_x); + pir::Attribute attr_trans_x = + pir::BoolAttribute::get(pir::IrContext::Instance(), trans_x); argument.AddAttribute("trans_x", attr_trans_x); - ir::Attribute attr_trans_y = - ir::BoolAttribute::get(ir::IrContext::Instance(), trans_y); + pir::Attribute attr_trans_y = + pir::BoolAttribute::get(pir::IrContext::Instance(), trans_y); argument.AddAttribute("trans_y", attr_trans_y); - ir::Attribute attr_activation_grad = - ir::StrAttribute::get(ir::IrContext::Instance(), activation_grad); + pir::Attribute attr_activation_grad = + pir::StrAttribute::get(pir::IrContext::Instance(), activation_grad); argument.AddAttribute("activation_grad", attr_activation_grad); VLOG(4) << "Builder construction outputs"; @@ -809,9 +835,9 @@ void FusedGemmEpilogueGradOp::Build(ir::Builder &builder, &meta_y_grad, &meta_bias_grad); - std::vector argument_outputs; - ir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_x_grad.dtype()), dense_x_grad.dims(), dense_x_grad.layout(), @@ -819,8 +845,8 @@ void FusedGemmEpilogueGradOp::Build(ir::Builder &builder, dense_x_grad.offset()); argument_outputs.push_back(x_grad_dense_tensor_type); - ir::Type y_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + pir::Type y_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_y_grad.dtype()), dense_y_grad.dims(), dense_y_grad.layout(), @@ -828,8 +854,8 @@ void FusedGemmEpilogueGradOp::Build(ir::Builder &builder, dense_y_grad.offset()); argument_outputs.push_back(y_grad_dense_tensor_type); - ir::Type bias_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + pir::Type bias_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_bias_grad.dtype()), dense_bias_grad.dims(), dense_bias_grad.layout(), @@ -851,7 +877,7 @@ const char *SplitGradOp::attributes_name[1] = {"axis"}; OpInfoTuple SplitGradOp::GetOpInfo() { std::vector inputs = { OpInputInfo("out_grad", - "ir::VectorType", + "pir::VectorType", false, false, false, @@ -879,23 +905,23 @@ OpInfoTuple SplitGradOp::GetOpInfo() { inputs, attributes, outputs, run_time_info, "split_grad"); } -void 
SplitGradOp::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult out_grad_, +void SplitGradOp::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult out_grad_, float axis) { // Generate scalar mutable attribute: axis paddle::dialect::FullOp full_axis_op = builder.Build( std::vector{1}, axis, phi::DataType::FLOAT32, phi::CPUPlace()); - ir::OpResult axis_ = full_axis_op->result(0); + pir::OpResult axis_ = full_axis_op->result(0); VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {out_grad_, axis_}; + std::vector argument_inputs = {out_grad_, axis_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; VLOG(4) << "Builder construction outputs"; - ir::VectorType out_grad = out_grad_.type().dyn_cast(); + pir::VectorType out_grad = out_grad_.type().dyn_cast(); std::vector vec_dense_out_grad; for (size_t i = 0; i < static_cast(out_grad.size()); i++) { vec_dense_out_grad.push_back(phi::DenseTensor( @@ -930,9 +956,9 @@ void SplitGradOp::Build(ir::Builder &builder, phi::ConcatInferMeta(meta_out_grad, axis, &meta_x_grad); - std::vector argument_outputs; - ir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), paddle::dialect::TransToIrDataType(dense_x_grad.dtype()), dense_x_grad.dims(), dense_x_grad.layout(), @@ -942,18 +968,18 @@ void SplitGradOp::Build(ir::Builder &builder, argument.AddOutputs(argument_outputs.begin(), argument_outputs.end()); } -void SplitGradOp::Build(ir::Builder &builder, - ir::OperationArgument &argument, - ir::OpResult out_grad_, - ir::OpResult axis_) { +void SplitGradOp::Build(pir::Builder &builder, + pir::OperationArgument &argument, + pir::OpResult out_grad_, + pir::OpResult axis_) { VLOG(4) << "Builder construction inputs"; - std::vector argument_inputs = {out_grad_, axis_}; + std::vector argument_inputs = {out_grad_, axis_}; argument.AddOperands(argument_inputs.begin(), argument_inputs.end()); VLOG(4) << "Builder construction attributes"; VLOG(4) << "Builder construction outputs"; - ir::VectorType out_grad = out_grad_.type().dyn_cast(); + pir::VectorType out_grad = out_grad_.type().dyn_cast(); int axis = axis_.owner() ->dyn_cast() .attributes() @@ -995,9 +1021,9 @@ void SplitGradOp::Build(ir::Builder &builder, phi::ConcatInferMeta(meta_out_grad, axis, &meta_x_grad); - std::vector argument_outputs; - ir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( - ir::IrContext::Instance(), + std::vector argument_outputs; + pir::Type x_grad_dense_tensor_type = paddle::dialect::DenseTensorType::get( + pir::IrContext::Instance(), TransToIrDataType(dense_x_grad.dtype()), dense_x_grad.dims(), dense_x_grad.layout(), @@ -1018,7 +1044,7 @@ void SplitGradOp::Verify() { phi::errors::PreconditionNotMet( "The size %d of inputs must be equal to 2.", input_size)); if (auto vec_type = - (*this)->operand_source(0).type().dyn_cast()) { + (*this)->operand_source(0).type().dyn_cast()) { for (size_t i = 0; i < vec_type.size(); ++i) { PADDLE_ENFORCE(vec_type[i].isa(), phi::errors::PreconditionNotMet( @@ -1064,29 +1090,29 @@ void SplitGradOp::InferMeta(phi::InferMetaContext *infer_meta) { fn(infer_meta); } -void IfOp::Build(ir::Builder &builder, // NOLINT - ir::OperationArgument &argument, // NOLINT - ir::OpResult cond, - std::vector &&output_types) { +void 
IfOp::Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult cond, + std::vector &&output_types) { argument.num_regions = 2; argument.AddOperand(cond); argument.output_types.swap(output_types); } -ir::Block *IfOp::true_block() { - ir::Region &true_region = (*this)->region(0); +pir::Block *IfOp::true_block() { + pir::Region &true_region = (*this)->region(0); if (true_region.empty()) true_region.emplace_back(); return true_region.front(); } -ir::Block *IfOp::false_block() { - ir::Region &false_region = (*this)->region(1); +pir::Block *IfOp::false_block() { + pir::Region &false_region = (*this)->region(1); if (false_region.empty()) false_region.emplace_back(); return false_region.front(); } -void IfOp::Print(ir::IrPrinter &printer) { +void IfOp::Print(pir::IrPrinter &printer) { auto &os = printer.os; auto op = operation(); printer.PrintOpResult(op); - os << " = pd.if"; + os << " = pd_op.if"; printer.PrintOpOperands(op); os << " -> "; printer.PrintOpReturnType(op); diff --git a/paddle/fluid/pir/dialect/operator/ir/manual_op.h b/paddle/fluid/pir/dialect/operator/ir/manual_op.h new file mode 100644 index 0000000000000..8cd8b9021858f --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op.h @@ -0,0 +1,204 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
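The header body below opens with an #ifdef GET_MANUAL_OP_LIST guard. This is the include-twice idiom: included normally, the file declares the op classes; included with the macro defined, it expands to nothing but the comma-separated op list, ready to be spliced into a RegisterOps<...>() call. A minimal self-contained analogue (all names made up for the example):

  // my_ops.h -- included twice with different meanings:
  #ifdef GET_MY_OP_LIST
  #undef GET_MY_OP_LIST
  FooOp, BarOp  // bare comma-separated list, no declarations
  #else
  #pragma once
  struct FooOp {};
  struct BarOp {};
  #endif

  // consumer.cc -- splices the list into a variadic template call:
  //   RegisterOps<
  //   #define GET_MY_OP_LIST
  //   #include "my_ops.h"
  //       >();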
+ +#ifdef GET_MANUAL_OP_LIST +#undef GET_MANUAL_OP_LIST +paddle::dialect::AddNOp, paddle::dialect::SplitGradOp, paddle::dialect::IfOp + +#else + +#pragma once +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/ir_printer.h" +#include "paddle/pir/core/op_base.h" +#include "paddle/pir/core/operation_utils.h" + +namespace paddle { +namespace dialect { + +class AddNOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.add_n"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static OpInfoTuple GetOpInfo(); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult inputs); + + void Verify(); + pir::Value inputs() { return operand_source(0); } + pir::OpResult out() { return result(0); } + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class AddN_Op : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.add_n_"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static OpInfoTuple GetOpInfo(); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult inputs_); + + void Verify(); + pir::Value inputs() { return operand_source(0); } + pir::OpResult out() { return result(0); } + + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class AddNWithKernelOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.add_n_with_kernel"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static OpInfoTuple GetOpInfo(); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult inputs_); + + void Verify(); + pir::Value inputs() { return operand_source(0); } + pir::OpResult out() { return result(0); } + + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class FusedGemmEpilogueOp + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.fused_gemm_epilogue"; } + static const char *attributes_name[3]; + static constexpr uint32_t attributes_num = 3; + static OpInfoTuple GetOpInfo(); + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult x_, + pir::OpResult y_, + pir::OpResult bias_, + pir::AttributeMap attributes); + void Verify(); + pir::Value x() { return operand_source(0); } + pir::Value y() { return operand_source(1); } + pir::Value bias() { return operand_source(2); } + pir::OpResult out() { return result(0); } + pir::OpResult reserve_space() { return result(1); } + + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class FusedGemmEpilogueGradOp + : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.fused_gemm_epilogue_grad"; } + static const char *attributes_name[3]; + static constexpr uint32_t attributes_num = 3; + 
static OpInfoTuple GetOpInfo(); + + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult x_, + pir::OpResult y_, + pir::OpResult reserve_space_, + pir::OpResult out_grad_, + pir::AttributeMap attributes); + void Verify(); + pir::Value x() { return operand_source(0); } + pir::Value y() { return operand_source(1); } + pir::Value reserve_space() { return operand_source(2); } + pir::Value out_grad() { return operand_source(3); } + pir::OpResult x_grad() { return result(0); } + pir::OpResult y_grad() { return result(1); } + pir::OpResult bias_grad() { return result(2); } + + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class SplitGradOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.split_grad"; } + static const char *attributes_name[1]; + static constexpr uint32_t attributes_num = 1; + static OpInfoTuple GetOpInfo(); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult x_, + float axis = 0); + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult out_grad_, + pir::OpResult axis_); + + void Verify(); + pir::Value out_grad() { return operand_source(0); } + pir::Value axis() { return operand_source(1); } + pir::OpResult x_grad() { return result(0); } + static void InferMeta(phi::InferMetaContext *infer_meta); +}; + +class IfOp : public pir::Op { + public: + using Op::Op; + static const char *name() { return "pd_op.if"; } + static constexpr const char **attributes_name = nullptr; + static constexpr uint32_t attributes_num = 0; + static void Build(pir::Builder &builder, // NOLINT + pir::OperationArgument &argument, // NOLINT + pir::OpResult cond, + std::vector &&output_types); + pir::Value cond() { return operand_source(0); } + pir::Block *true_block(); + pir::Block *false_block(); + void Print(pir::IrPrinter &printer); // NOLINT + void Verify(); +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::SplitGradOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddN_Op) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::AddNWithKernelOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::FusedGemmEpilogueGradOp) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::IfOp) +#endif diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc similarity index 61% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h rename to paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc index 5eba73e5182bd..7f58a434f554a 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_manual_api.h +++ b/paddle/fluid/pir/dialect/operator/ir/manual_op_vjp.cc @@ -12,19 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
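One note on the IfOp declared in manual_op.h above: true_block() and false_block() lazily emplace the first block of their region, so both branches can be populated immediately after Build(). A hand-written sketch, assuming a pir::Builder `builder`, a boolean-typed value `cond`, an output type `out_type`, and a SetInsertionPointToStart helper on the builder:

  auto if_op = builder.Build<paddle::dialect::IfOp>(
      cond, std::vector<pir::Type>{out_type});
  builder.SetInsertionPointToStart(if_op.true_block());
  // ... build the then-branch ops here ...
  builder.SetInsertionPointToStart(if_op.false_block());
  // ... build the else-branch ops here ...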
-#pragma once +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/primitive/rule/vjp/vjp.h" +#include "paddle/fluid/primitive/type/lazy_tensor.h" +#include "paddle/phi/common/int_array.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/op_base.h" -#include - -#include "paddle/ir/core/value.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/place.h" +// TODO(wanghao107) +// this file will be generated in pd_op.cc namespace paddle { namespace dialect { +using IntArray = paddle::experimental::IntArray; -ir::OpResult split_grad(std::vector out_grads, ir::OpResult axis); - -ir::OpResult split_grad(std::vector out_grads, int axis); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc b/paddle/fluid/pir/dialect/operator/ir/meta_tensor.cc similarity index 95% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc rename to paddle/fluid/pir/dialect/operator/ir/meta_tensor.cc index 2da7b098a6556..1985413ecb95d 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.cc +++ b/paddle/fluid/pir/dialect/operator/ir/meta_tensor.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h" +#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" -#include "paddle/ir/core/enforce.h" +#include "paddle/pir/core/enforce.h" namespace paddle { namespace dialect { diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h b/paddle/fluid/pir/dialect/operator/ir/meta_tensor.h similarity index 100% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h rename to paddle/fluid/pir/dialect/operator/ir/meta_tensor.h diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.cc b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc similarity index 85% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.cc rename to paddle/fluid/pir/dialect/operator/ir/op_attribute.cc index 72cc98447e10e..3b69d68eb65f3 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" namespace paddle { namespace dialect { @@ -29,18 +29,18 @@ phi::DataLayout DataLayoutAttribute::data() const { } phi::Scalar ScalarAttribute::data() { - if (isa()) { - return phi::Scalar(dyn_cast().data()); - } else if (isa()) { - return phi::Scalar(dyn_cast().data()); - } else if (isa()) { - return phi::Scalar(dyn_cast().data()); - } else if (isa()) { - return phi::Scalar(dyn_cast().data()); - } else if (isa()) { - return phi::Scalar(dyn_cast().data()); - } else if (isa()) { - return phi::Scalar(dyn_cast().AsString()); + if (isa()) { + return phi::Scalar(dyn_cast().data()); + } else if (isa()) { + return phi::Scalar(dyn_cast().data()); + } else if (isa()) { + return phi::Scalar(dyn_cast().data()); + } else if (isa()) { + return phi::Scalar(dyn_cast().data()); + } else if (isa()) { + return phi::Scalar(dyn_cast().data()); + } else if (isa()) { + return phi::Scalar(dyn_cast().AsString()); } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported ir attribute when casting it into " @@ -48,7 +48,7 @@ phi::Scalar ScalarAttribute::data() { } } -IntArrayAttribute IntArrayAttribute::Parse(ir::IrParser &parser) { // NOLINT +IntArrayAttribute IntArrayAttribute::Parse(pir::IrParser &parser) { // NOLINT Token buket_token = parser.ConsumeToken(); std::vector vec{}; while (parser.PeekToken().val_ != "]") { @@ -66,7 +66,7 @@ IntArrayAttribute IntArrayAttribute::Parse(ir::IrParser &parser) { // NOLINT // |int32|uint64|int64|float32|complex64 // |complex128|Undefined|psting|flaot16 // |bfloat16|num_data_types|all_dtype -DataTypeAttribute DataTypeAttribute::Parse(ir::IrParser &parser) { // NOLINT +DataTypeAttribute DataTypeAttribute::Parse(pir::IrParser &parser) { // NOLINT std::unordered_map StringToDataType{ {"bool", phi::DataType::BOOL}, {"uint8", phi::DataType::UINT8}, @@ -96,7 +96,7 @@ DataTypeAttribute DataTypeAttribute::Parse(ir::IrParser &parser) { // NOLINT // Parse a PlaceAttribute // PlaceAttribute := Place(cpu)|Place(gpu:0)|Place(gpu_pinned) // |Place(xpu:0)|Place(ipu:0)|Place(:0)|undefined -PlaceAttribute PlaceAttribute::Parse(ir::IrParser &parser) { // NOLINT +PlaceAttribute PlaceAttribute::Parse(pir::IrParser &parser) { // NOLINT std::unordered_map StringToPlace{ {"cpu", phi::CPUPlace{}}, {"gpu", phi::GPUPlace{}}, @@ -126,7 +126,7 @@ PlaceAttribute PlaceAttribute::Parse(ir::IrParser &parser) { // NOLINT // |SPARSE_COO|SPARSE_CSR|NDHWC // |NCDHW|PSTRING_UNION|STRIDED DataLayoutAttribute DataLayoutAttribute::Parse( - ir::IrParser &parser) { // NOLINT + pir::IrParser &parser) { // NOLINT std::unordered_map StringToDataLayout{ {"NHWC", phi::DataLayout::kNHWC}, {"NCHW", phi::DataLayout::kNCHW}, diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h b/paddle/fluid/pir/dialect/operator/ir/op_attribute.h similarity index 65% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h rename to paddle/fluid/pir/dialect/operator/ir/op_attribute.h index e1d3daab7191d..a47187774eeb6 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_attribute.h @@ -14,17 +14,17 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute_storage.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/ir_parser.h" +#include 
"paddle/fluid/pir/dialect/operator/ir/attribute_storage.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/parser/ir_parser.h" namespace paddle { namespace dialect { -class IntArrayAttribute : public ir::Attribute { +class IntArrayAttribute : public pir::Attribute { public: using Attribute::Attribute; @@ -35,32 +35,32 @@ class IntArrayAttribute : public ir::Attribute { return storage() < right.storage(); } - static IntArrayAttribute Parse(ir::IrParser &parser); // NOLINT + static IntArrayAttribute Parse(pir::IrParser &parser); // NOLINT const phi::IntArray &data() const; }; -class ScalarAttribute : public ir::Attribute { +class ScalarAttribute : public pir::Attribute { public: using Attribute::Attribute; - static bool classof(ir::Attribute val) { - return (val.type_id() == ir::BoolAttribute::type_id()) || - (val.type_id() == ir::FloatAttribute::type_id()) || - (val.type_id() == ir::DoubleAttribute::type_id()) || - (val.type_id() == ir::Int32Attribute::type_id()) || - (val.type_id() == ir::Int64Attribute::type_id()) || - (val.type_id() == ir::StrAttribute::type_id()); + static bool classof(pir::Attribute val) { + return (val.type_id() == pir::BoolAttribute::type_id()) || + (val.type_id() == pir::FloatAttribute::type_id()) || + (val.type_id() == pir::DoubleAttribute::type_id()) || + (val.type_id() == pir::Int32Attribute::type_id()) || + (val.type_id() == pir::Int64Attribute::type_id()) || + (val.type_id() == pir::StrAttribute::type_id()); } - static ir::Attribute get(ir::IrContext *ctx, phi::Scalar scalar) { + static pir::Attribute get(pir::IrContext *ctx, phi::Scalar scalar) { return TransToIrAttribute(scalar, ctx); } phi::Scalar data(); }; -class DataTypeAttribute : public ir::Attribute { +class DataTypeAttribute : public pir::Attribute { public: using Attribute::Attribute; @@ -71,12 +71,12 @@ class DataTypeAttribute : public ir::Attribute { return storage() < right.storage(); } - static DataTypeAttribute Parse(ir::IrParser &parser); // NOLINT + static DataTypeAttribute Parse(pir::IrParser &parser); // NOLINT phi::DataType data() const; }; -class PlaceAttribute : public ir::Attribute { +class PlaceAttribute : public pir::Attribute { public: using Attribute::Attribute; @@ -86,12 +86,12 @@ class PlaceAttribute : public ir::Attribute { return storage() < right.storage(); } - static PlaceAttribute Parse(ir::IrParser &parser); // NOLINT + static PlaceAttribute Parse(pir::IrParser &parser); // NOLINT phi::Place data() const; }; -class DataLayoutAttribute : public ir::Attribute { +class DataLayoutAttribute : public pir::Attribute { public: using Attribute::Attribute; @@ -102,7 +102,7 @@ class DataLayoutAttribute : public ir::Attribute { return storage() < right.storage(); } - static DataLayoutAttribute Parse(ir::IrParser &parser); // NOLINT + static DataLayoutAttribute Parse(pir::IrParser &parser); // NOLINT phi::DataLayout data() const; }; diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc similarity index 75% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc rename to paddle/fluid/pir/dialect/operator/ir/op_dialect.cc index 82169dafc5969..2c85ea18d3da3 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.cc @@ -12,26 +12,26 @@ // See the 
License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in -// paddle/fluid/ir/dialect/CMakeLists.txt. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.h" -#include "paddle/ir/core/ir_printer.h" -#include "paddle/ir/core/utils.h" +// paddle/fluid/pir/dialect/CMakeLists.txt. +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" +#include "paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h" +#include "paddle/pir/core/ir_printer.h" +#include "paddle/pir/core/utils.h" namespace paddle { namespace dialect { -PaddleDialect::PaddleDialect(ir::IrContext *context) - : ir::Dialect(name(), context, ir::TypeId::get<PaddleDialect>()) { +OperatorDialect::OperatorDialect(pir::IrContext *context) + : pir::Dialect(name(), context, pir::TypeId::get<OperatorDialect>()) { initialize(); } -void PaddleDialect::initialize() { +void OperatorDialect::initialize() { RegisterTypes<paddle::dialect::DenseTensorType>(); RegisterTypes<paddle::dialect::SelectedRowsType>(); @@ -42,12 +42,12 @@ void PaddleDialect::initialize() { // NOTE(zhangbo9674): GET_OP_LIST is defined in pd_op.h which is // generated by op_gen.py, see details in - // paddle/fluid/ir/dialect/CMakeLists.txt. - // NOTE(Ruting)GET_MANUAL_OP_LIST is define in pd_manual_op.h" + // paddle/fluid/pir/dialect/CMakeLists.txt. + // NOTE(Ruting): GET_MANUAL_OP_LIST is defined in manual_op.h. // use RegisterOps when list has more than two ops.
RegisterOps< #define GET_OP_LIST -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" // NOLINT +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" // NOLINT >(); RegisterOps(); } -void PaddleDialect::PrintType(ir::Type type, std::ostream &os) const { +void OperatorDialect::PrintType(pir::Type type, std::ostream &os) const { os << type.dialect().name(); os << '.'; if (auto tensor_type = type.dyn_cast()) { @@ -82,7 +82,8 @@ void PaddleDialect::PrintType(ir::Type type, std::ostream &os) const { } } -void PaddleDialect::PrintAttribute(ir::Attribute attr, std::ostream &os) const { +void OperatorDialect::PrintAttribute(pir::Attribute attr, + std::ostream &os) const { os << "(" << attr.dialect().name(); os << '.'; if (auto int_array_attr = attr.dyn_cast()) { @@ -90,7 +91,7 @@ void PaddleDialect::PrintAttribute(ir::Attribute attr, std::ostream &os) const { os << "IntArray)" << "["; const auto &inner_data = data.GetData(); - ir::PrintInterleave( + pir::PrintInterleave( inner_data.begin(), inner_data.end(), [&os](int64_t i) { os << i; }, @@ -107,8 +108,8 @@ void PaddleDialect::PrintAttribute(ir::Attribute attr, std::ostream &os) const { } } -ir::Type PaddleDialect::ParseType(ir::IrParser &parser) { // NOLINT - parser.ConsumeAToken("pd.tensor"); +pir::Type OperatorDialect::ParseType(pir::IrParser &parser) { // NOLINT + parser.ConsumeAToken("pd_op.tensor"); parser.ConsumeAToken("<"); std::vector dim{}; Token dim_token = parser.PeekToken(); @@ -126,7 +127,7 @@ ir::Type PaddleDialect::ParseType(ir::IrParser &parser) { // NOLINT } } phi::DDim ddim = phi::make_ddim(dim); - ir::Type dtype = parser.ParseType(); + pir::Type dtype = parser.ParseType(); std::vector> lod; std::vector lodv; lodv.push_back(0); @@ -136,7 +137,8 @@ ir::Type PaddleDialect::ParseType(ir::IrParser &parser) { // NOLINT parser.ctx, dtype, ddim, phi::DataLayout::UNDEFINED, lod, 0); } -ir::Attribute PaddleDialect::ParseAttribute(ir::IrParser &parser) { // NOLINT +pir::Attribute OperatorDialect::ParseAttribute( + pir::IrParser &parser) { // NOLINT std::string type_name = parser.ConsumeToken().val_; std::string attribute_name = type_name.substr(type_name.find('.') + 1, std::string::npos); @@ -155,8 +157,8 @@ ir::Attribute PaddleDialect::ParseAttribute(ir::IrParser &parser) { // NOLINT } } -void PaddleDialect::PrintOperation(ir::Operation *op, - ir::IrPrinter &printer) const { +void OperatorDialect::PrintOperation(pir::Operation *op, + pir::IrPrinter &printer) const { if (auto if_op = op->dyn_cast()) { if_op.Print(printer); } else { @@ -167,4 +169,4 @@ void PaddleDialect::PrintOperation(ir::Operation *op, } // namespace dialect } // namespace paddle -IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::PaddleDialect) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::OperatorDialect) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h similarity index 54% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h rename to paddle/fluid/pir/dialect/operator/ir/op_dialect.h index 285a796982f85..bc85b789a058b 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_dialect.h @@ -14,25 +14,25 @@ #pragma once -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/dialect.h" namespace paddle { namespace dialect { -class PaddleDialect : public ir::Dialect { +class OperatorDialect : public pir::Dialect { public: - explicit PaddleDialect(ir::IrContext* context); + explicit OperatorDialect(pir::IrContext* 
context); - static const char* name() { return "pd"; } + static const char* name() { return "pd_op"; } - ir::Type ParseType(ir::IrParser& parser) override; // NOLINT - ir::Attribute ParseAttribute(ir::IrParser& parser) override; // NOLINT + pir::Type ParseType(pir::IrParser& parser) override; // NOLINT + pir::Attribute ParseAttribute(pir::IrParser& parser) override; // NOLINT - void PrintType(ir::Type type, std::ostream& os) const override; - void PrintAttribute(ir::Attribute type, std::ostream& os) const override; + void PrintType(pir::Type type, std::ostream& os) const override; + void PrintAttribute(pir::Attribute type, std::ostream& os) const override; - void PrintOperation(ir::Operation* op, - ir::IrPrinter& printer) const override; // NOLINT + void PrintOperation(pir::Operation* op, + pir::IrPrinter& printer) const override; // NOLINT private: void initialize(); @@ -41,4 +41,4 @@ class PaddleDialect : public ir::Dialect { } // namespace dialect } // namespace paddle -IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::PaddleDialect) +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::OperatorDialect) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.cc b/paddle/fluid/pir/dialect/operator/ir/op_type.cc similarity index 88% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.cc rename to paddle/fluid/pir/dialect/operator/ir/op_type.cc index 31ba23b0e1bbc..c9fc8bcd65b10 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.cc +++ b/paddle/fluid/pir/dialect/operator/ir/op_type.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" namespace paddle { namespace dialect { -const ir::Type& SelectedRowsType::dtype() const { return storage()->dtype_; } +const pir::Type& SelectedRowsType::dtype() const { return storage()->dtype_; } const phi::DDim& SelectedRowsType::dims() const { return storage()->dims_; } diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h b/paddle/fluid/pir/dialect/operator/ir/op_type.h similarity index 62% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h rename to paddle/fluid/pir/dialect/operator/ir/op_type.h index 9525e1a88b346..3ee0d642e2e47 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h +++ b/paddle/fluid/pir/dialect/operator/ir/op_type.h @@ -14,20 +14,23 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h" -#include "paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/type.h" +#include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/type.h" namespace paddle { namespace dialect { -using DenseTensorType = ir::DenseTensorType; -class SelectedRowsType : public ir::Type { - public: - using Type::Type; - DECLARE_TYPE_UTILITY_FUNCTOR(SelectedRowsType, SelectedRowsTypeStorage); +using DenseTensorType = pir::DenseTensorType; +class SelectedRowsType : public pir::Type::TypeBase<SelectedRowsType, pir::Type, SelectedRowsTypeStorage> { + public: + using Base::Base; - const ir::Type &dtype() const; + const pir::Type &dtype() const; const phi::DDim &dims() const; diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops.yaml b/paddle/fluid/pir/dialect/operator/ir/ops.yaml similarity index 96% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops.yaml rename to
paddle/fluid/pir/dialect/operator/ir/ops.yaml index da4c252af7217..bf80652d03134 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops.yaml +++ b/paddle/fluid/pir/dialect/operator/ir/ops.yaml @@ -106,6 +106,15 @@ param: [x, file_path, overwrite, save_as_fp16, save_to_memory] optional : out +- op : seed + args : (int seed, bool deterministic, str rng_name, bool force_cpu) + output : Tensor(out) + infer_meta: + func: SeedInferMeta + param: [seed] + kernel: + func: seed + - op : send_v2 args : (Tensor x, int ring_id = 0, int peer = 0, bool use_calc_stream = false, bool dynamic_shape = false) output : diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops_backward.yaml b/paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml similarity index 100% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_ops_backward.yaml rename to paddle/fluid/pir/dialect/operator/ir/ops_backward.yaml diff --git a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h b/paddle/fluid/pir/dialect/operator/ir/type_storage.h similarity index 78% rename from paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h rename to paddle/fluid/pir/dialect/operator/ir/type_storage.h index 1a74b6d6c1059..e001f7b78716b 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h +++ b/paddle/fluid/pir/dialect/operator/ir/type_storage.h @@ -16,17 +16,17 @@ #include <type_traits> -#include "paddle/ir/core/builtin_type_storage.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/type_base.h" -#include "paddle/ir/core/utils.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/pir/core/builtin_type_storage.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/type_base.h" +#include "paddle/pir/core/utils.h" namespace paddle { namespace dialect { -using DenseTensorTypeStorage = ir::DenseTensorTypeStorage; +using DenseTensorTypeStorage = pir::DenseTensorTypeStorage; -struct SelectedRowsTypeStorage : public ir::TypeStorage { +struct SelectedRowsTypeStorage : public pir::TypeStorage { using DataLayout = phi::DataLayout; using Dim = phi::DDim; using LoD = std::vector<std::vector<size_t>>; @@ -34,9 +34,9 @@ struct SelectedRowsTypeStorage : public ir::TypeStorage { /// \brief Declare ParamKey according to parameter type. /// using ParamKey = - std::tuple<ir::Type, phi::DDim, phi::DataLayout, phi::LoD, size_t>; + std::tuple<pir::Type, phi::DDim, phi::DataLayout, phi::LoD, size_t>; - SelectedRowsTypeStorage(const ir::Type& dtype, + SelectedRowsTypeStorage(const pir::Type& dtype, const phi::DDim& dims, const phi::DataLayout& layout, const phi::LoD& lod, @@ -66,22 +66,22 @@ struct SelectedRowsTypeStorage : public ir::TypeStorage { std::size_t hash_value = 317; // hash dtype hash_value = - ir::hash_combine(hash_value, std::hash<ir::Type>()(std::get<0>(key))); + pir::hash_combine(hash_value, std::hash<pir::Type>()(std::get<0>(key))); // hash dims hash_value = - ir::hash_combine(hash_value, std::hash<phi::DDim>()(std::get<1>(key))); + pir::hash_combine(hash_value, std::hash<phi::DDim>()(std::get<1>(key))); // hash layout - hash_value = ir::hash_combine( + hash_value = pir::hash_combine( hash_value, std::hash<std::underlying_type<DataLayout>::type>()( static_cast<std::underlying_type<DataLayout>::type>( std::get<2>(key)))); // hash lod hash_value = - ir::hash_combine(hash_value, std::hash<phi::LoD>()(std::get<3>(key))); + pir::hash_combine(hash_value, std::hash<phi::LoD>()(std::get<3>(key))); // hash offset hash_value = - ir::hash_combine(hash_value, std::hash<size_t>()(std::get<4>(key))); + pir::hash_combine(hash_value, std::hash<size_t>()(std::get<4>(key))); return hash_value; } @@ -100,7 +100,7 @@ struct SelectedRowsTypeStorage : public ir::TypeStorage { /// \brief SelectedRowsTypeStorage includes five parameters: dims, dtype, /// layout, lod, offset.
/// - ir::Type dtype_; + pir::Type dtype_; phi::DDim dims_; phi::DataLayout layout_; phi::LoD lod_; diff --git a/paddle/fluid/ir/dialect/paddle_dialect/trait/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/trait/CMakeLists.txt similarity index 83% rename from paddle/fluid/ir/dialect/paddle_dialect/trait/CMakeLists.txt rename to paddle/fluid/pir/dialect/operator/trait/CMakeLists.txt index 53c3060d6f182..0689edb35655e 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/trait/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/trait/CMakeLists.txt @@ -3,4 +3,4 @@ file(GLOB PD_INTERFACE_SRCS "*.cc") cc_library( pd_trait SRCS ${PD_INTERFACE_SRCS} - DEPS ir_core) + DEPS pir_core) diff --git a/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h b/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h new file mode 100644 index 0000000000000..1b1c7c08efca1 --- /dev/null +++ b/paddle/fluid/pir/dialect/operator/trait/custom_vjp.h @@ -0,0 +1,38 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* +Custom VJP stands for manually implemented backward rules for composite +operators. CustomVjpTrait is added to those composite operators that +define custom vjp rules. By calling has_custom_vjp(op), users can check +whether an operator carries CustomVjpTrait, and therefore whether a custom +vjp rule is defined for that operator.
+*/ + +#pragma once + +#include "paddle/pir/core/op_base.h" + +namespace paddle { +namespace dialect { +class CustomVjpTrait : public pir::OpTraitBase<CustomVjpTrait> { + public: + explicit CustomVjpTrait(pir::Operation *op) + : pir::OpTraitBase<CustomVjpTrait>(op) {} +}; + +} // namespace dialect +} // namespace paddle + +IR_DECLARE_EXPLICIT_TYPE_ID(paddle::dialect::CustomVjpTrait) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h b/paddle/fluid/pir/dialect/operator/trait/inplace.h similarity index 80% rename from paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h rename to paddle/fluid/pir/dialect/operator/trait/inplace.h index 38dfaaeac000e..e50f1e3a8349d 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h +++ b/paddle/fluid/pir/dialect/operator/trait/inplace.h @@ -14,14 +14,14 @@ #pragma once -#include "paddle/ir/core/op_base.h" +#include "paddle/pir/core/op_base.h" namespace paddle { namespace dialect { -class InplaceTrait : public ir::OpTraitBase<InplaceTrait> { +class InplaceTrait : public pir::OpTraitBase<InplaceTrait> { public: - explicit InplaceTrait(ir::Operation *op) - : ir::OpTraitBase<InplaceTrait>(op) {} + explicit InplaceTrait(pir::Operation *op) + : pir::OpTraitBase<InplaceTrait>(op) {} }; } // namespace dialect diff --git a/paddle/fluid/ir/dialect/paddle_dialect/trait/trait.cc b/paddle/fluid/pir/dialect/operator/trait/trait.cc similarity index 78% rename from paddle/fluid/ir/dialect/paddle_dialect/trait/trait.cc rename to paddle/fluid/pir/dialect/operator/trait/trait.cc index c086b98e34bc7..2a5b7575959b9 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/trait/trait.cc +++ b/paddle/fluid/pir/dialect/operator/trait/trait.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" +#include "paddle/fluid/pir/dialect/operator/trait/inplace.h" IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::InplaceTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(paddle::dialect::CustomVjpTrait) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/transforms/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/transforms/CMakeLists.txt similarity index 68% rename from paddle/fluid/ir/dialect/paddle_dialect/transforms/CMakeLists.txt rename to paddle/fluid/pir/dialect/operator/transforms/CMakeLists.txt index 8d90edd3feb74..7116a12be50ef 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/transforms/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/transforms/CMakeLists.txt @@ -1,4 +1,4 @@ cc_library( param_to_variable SRCS param_to_variable.cc - DEPS pd_dialect_core) + DEPS pd_op_dialect_core) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.cc b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc similarity index 73% rename from paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.cc rename to paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc index 0113e38b8fd5e..1d93e27c59b0b 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.cc +++ b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.cc @@ -12,19 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License.
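CustomVjpTrait and InplaceTrait above are plain marker traits. A minimal sketch of the has_custom_vjp(op) helper that the custom_vjp.h comment refers to, assuming pir::Operation exposes the usual HasTrait<T>() query (that query is an assumption; only the trait classes themselves appear in this patch):

#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h"

// Returns true when a hand-written VJP rule is registered for this op.
static bool has_custom_vjp(pir::Operation *op) {
  return op->HasTrait<paddle::dialect::CustomVjpTrait>();
}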
-#include "paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.h" +#include "paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { namespace dialect { std::shared_ptr -ParameterConvertInterface::ParameterToVariable(ir::Parameter *parameter) { +ParameterConvertInterface::ParameterToVariable(pir::Parameter *parameter) { if (parameter->type().isa()) { VLOG(4) << "Convert a DenseTensor Parameter to a variable."; std::shared_ptr var = @@ -56,21 +56,21 @@ ParameterConvertInterface::ParameterToVariable(ir::Parameter *parameter) { } } -std::unique_ptr ParameterConvertInterface::VariableToParameter( +std::unique_ptr ParameterConvertInterface::VariableToParameter( paddle::framework::Variable *var) { if (var->IsType()) { phi::DenseTensor *tensor = var->GetMutable(); // Get Meta - ir::IrContext *ctx = ir::IrContext::Instance(); - ir::Type data_type = TransToIrDataType(tensor->dtype(), ctx); + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::Type data_type = TransToIrDataType(tensor->dtype(), ctx); void *data = tensor->data(); - ir::Type dense_tensor_type = DenseTensorType::get(ctx, - data_type, - tensor->dims(), - tensor->layout(), - tensor->lod(), - tensor->meta().offset); - return std::make_unique( + pir::Type dense_tensor_type = DenseTensorType::get(ctx, + data_type, + tensor->dims(), + tensor->layout(), + tensor->lod(), + tensor->meta().offset); + return std::make_unique( data, tensor->numel() * phi::SizeOf(tensor->dtype()), dense_tensor_type); diff --git a/paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.h b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h similarity index 76% rename from paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.h rename to paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h index 4194cbae53ddf..bdb7bed12c970 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/transforms/param_to_variable.h +++ b/paddle/fluid/pir/dialect/operator/transforms/param_to_variable.h @@ -14,21 +14,21 @@ #pragma once #include "paddle/fluid/framework/variable.h" -#include "paddle/ir/core/dialect_interface.h" -#include "paddle/ir/core/parameter.h" +#include "paddle/pir/core/dialect_interface.h" +#include "paddle/pir/core/parameter.h" namespace paddle { namespace dialect { class ParameterConvertInterface - : public ir::DialectInterface::Base { + : public pir::DialectInterface::Base { public: - explicit ParameterConvertInterface(ir::Dialect* dialect) : Base(dialect) {} + explicit ParameterConvertInterface(pir::Dialect* dialect) : Base(dialect) {} // NOTE(zhangbo): Only support new a CPU Variable. 
std::shared_ptr ParameterToVariable( - ir::Parameter* parameter); + pir::Parameter* parameter); - std::unique_ptr VariableToParameter( + std::unique_ptr VariableToParameter( paddle::framework::Variable* var); }; diff --git a/paddle/fluid/ir/dialect/paddle_dialect/utils/CMakeLists.txt b/paddle/fluid/pir/dialect/operator/utils/CMakeLists.txt similarity index 64% rename from paddle/fluid/ir/dialect/paddle_dialect/utils/CMakeLists.txt rename to paddle/fluid/pir/dialect/operator/utils/CMakeLists.txt index 325f13f619b51..58eafb2cc3921 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/utils/CMakeLists.txt +++ b/paddle/fluid/pir/dialect/operator/utils/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(op_yaml_info_parser SRCS op_yaml_info_parser.cc) cc_library( - pd_dialect_utils + pd_op_dialect_utils SRCS utils.cc - DEPS pd_dialect_core) + DEPS pd_op_dialect_core) diff --git a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.cc b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc similarity index 98% rename from paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.cc rename to paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc index 8b5be8ff00cfd..eeb41ed3620ac 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.cc +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" namespace paddle { namespace dialect { diff --git a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h similarity index 97% rename from paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h rename to paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h index acbc1b8e19649..9557a3d5b7763 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" namespace paddle { namespace dialect { diff --git a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h similarity index 96% rename from paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h rename to paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h index 3df6ce5e22c15..462e88f4da327 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h +++ b/paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type_storage.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/type_storage.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_type.h" namespace paddle { namespace dialect { diff --git a/paddle/fluid/ir/dialect/paddle_dialect/utils/utils.cc b/paddle/fluid/pir/dialect/operator/utils/utils.cc similarity index 60% rename from paddle/fluid/ir/dialect/paddle_dialect/utils/utils.cc rename to paddle/fluid/pir/dialect/operator/utils/utils.cc index 
e0ec875ca00d6..4681b9b100122 100644 --- a/paddle/fluid/ir/dialect/paddle_dialect/utils/utils.cc +++ b/paddle/fluid/pir/dialect/operator/utils/utils.cc @@ -12,24 +12,29 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" namespace paddle { namespace dialect { const std::unordered_set LegacyOpList = { - "pd.load_combine", - "pd.c_concat", - "pd.c_broadcast_", - "pd.fused_bn_add_activation_", - "pd.fused_bn_add_activation_grad", - "pd.c_sync_calc_stream_", - "pd.c_sync_comm_stream_", - "pd.send_v2", - "pd.recv_v2", - "pd.c_allreduce_sum", - "pd.c_allreduce_sum_"}; + "pd_op.load_combine", + "pd_op.c_concat", + "pd_op.c_broadcast_", + "pd_op.fused_bn_add_activation_", + "pd_op.fused_bn_add_activation_grad", + "pd_op.c_sync_calc_stream_", + "pd_op.c_sync_comm_stream_", + "pd_op.send_v2", + "pd_op.recv_v2", + "pd_op.c_allreduce_sum", + "pd_op.c_allreduce_sum_", + "pd_op.c_reduce_sum", + "pd_op.c_reduce_sum_", + "pd_op.c_allreduce_max_", + "pd_op.c_allgather", + "pd_op.seed"}; enum class AttrType { UNDEFINED = 0, @@ -53,20 +58,20 @@ enum class AttrType { NUM_ATTR_TYPES, }; -static inline AttrType GetAttributeType(const ir::Attribute& attr) { - if (attr.isa()) { +static inline AttrType GetAttributeType(const pir::Attribute& attr) { + if (attr.isa()) { return AttrType::BOOL; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::FLOAT; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::DOUBLE; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::INT32; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::INT64; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::ARRAY; - } else if (attr.isa()) { + } else if (attr.isa()) { return AttrType::STRING; } else if (attr.isa()) { return AttrType::INT_ARRAY; @@ -81,53 +86,54 @@ static inline AttrType GetAttributeType(const ir::Attribute& attr) { } } -static std::unordered_map> +static std::unordered_map< + AttrType, + std::function> kAttrCastMap = { {AttrType::BOOL, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; + [](const pir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; }}, {AttrType::FLOAT, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; + [](const pir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; }}, {AttrType::DOUBLE, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; + [](const pir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; }}, {AttrType::INT32, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; + [](const pir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; }}, {AttrType::INT64, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().data()}; + [](const pir::Attribute& attr) { + return VariantType{attr.dyn_cast().data()}; }}, {AttrType::INT_ARRAY, - [](const ir::Attribute& attr) { + [](const pir::Attribute& attr) { return VariantType{ attr.dyn_cast() .data() .GetData()}; }}, {AttrType::STRING, - [](const ir::Attribute& attr) { - return VariantType{attr.dyn_cast().AsString()}; + [](const pir::Attribute& attr) { + return 
VariantType{attr.dyn_cast().AsString()}; }}, {AttrType::DATA_TYPE, - [](const ir::Attribute& attr) { + [](const pir::Attribute& attr) { return VariantType{ attr.dyn_cast().data()}; }}, {AttrType::PLACE, - [](const ir::Attribute& attr) { + [](const pir::Attribute& attr) { return VariantType{ attr.dyn_cast().data()}; }}, {AttrType::ARRAY, - [](const ir::Attribute& attr) { - auto attr_vec = attr.dyn_cast().AsVector(); + [](const pir::Attribute& attr) { + auto attr_vec = attr.dyn_cast().AsVector(); if (attr_vec.size() == 0) { return VariantType{std::vector()}; } @@ -137,37 +143,44 @@ static std::unordered_map vec_bools; for (auto vec_element : attr_vec) { vec_bools.push_back( - vec_element.dyn_cast().data()); + vec_element.dyn_cast().data()); } return VariantType{vec_bools}; } else if (element_type == AttrType::INT32) { std::vector vec_int32; for (auto vec_element : attr_vec) { vec_int32.push_back( - vec_element.dyn_cast().data()); + vec_element.dyn_cast().data()); } return VariantType{vec_int32}; } else if (element_type == AttrType::INT64) { std::vector vec_int64; for (auto vec_element : attr_vec) { vec_int64.push_back( - vec_element.dyn_cast().data()); + vec_element.dyn_cast().data()); } return VariantType{vec_int64}; } else if (element_type == AttrType::FLOAT) { std::vector vec_float; for (auto vec_element : attr_vec) { vec_float.push_back( - vec_element.dyn_cast().data()); + vec_element.dyn_cast().data()); } return VariantType{vec_float}; } else if (element_type == AttrType::DOUBLE) { std::vector vec_double; for (auto vec_element : attr_vec) { vec_double.push_back( - vec_element.dyn_cast().data()); + vec_element.dyn_cast().data()); } return VariantType{vec_double}; + } else if (element_type == AttrType::STRING) { + std::vector vec_string; + for (auto vec_element : attr_vec) { + vec_string.push_back( + vec_element.dyn_cast().AsString()); + } + return VariantType{vec_string}; } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported ir Attribute type when casting it into " @@ -176,7 +189,7 @@ static std::unordered_map()) { +static inline phi::DataType TransToPhiDataType(pir::Type dtype) { + if (dtype.isa()) { return phi::DataType::BFLOAT16; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::FLOAT16; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::FLOAT32; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::FLOAT64; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::UINT8; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::INT8; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::INT16; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::INT32; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::INT64; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::INT32; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::BOOL; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::COMPLEX64; - } else if (dtype.isa()) { + } else if (dtype.isa()) { return phi::DataType::COMPLEX128; } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -66,36 +66,36 @@ static inline phi::DataType TransToPhiDataType(ir::Type dtype) { // use phi::DataType::INT32 for IndexType from builtin type to phi::DataType, // but only use INT32 not IndexType from phi::DataType type to builtin type. 
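Taken together, TransToPhiDataType (above) and TransToIrDataType (below) map between the pir builtin types and phi::DataType in both directions; the IndexType comment above marks the one asymmetry. A small sketch (not part of the diff, relying only on the two helper signatures shown here):

#include "paddle/fluid/pir/dialect/operator/utils/utils.h"

void DtypeRoundTrip() {
  pir::IrContext *ctx = pir::IrContext::Instance();
  // FLOAT32 -> pir::Float32Type -> FLOAT32.
  pir::Type t = paddle::dialect::TransToIrDataType(phi::DataType::FLOAT32, ctx);
  phi::DataType dt = paddle::dialect::TransToPhiDataType(t);
  // Caveat: pir::IndexType also maps to INT32, but the reverse direction
  // always yields pir::Int32Type, never IndexType.
  (void)dt;
}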
-static inline ir::Type TransToIrDataType(phi::DataType dtype, - ir::IrContext* ctx = nullptr) { +static inline pir::Type TransToIrDataType(phi::DataType dtype, + pir::IrContext* ctx = nullptr) { if (ctx == nullptr) { - ctx = ir::IrContext::Instance(); + ctx = pir::IrContext::Instance(); } switch (dtype) { case phi::DataType::BFLOAT16: - return ir::BFloat16Type::get(ctx); + return pir::BFloat16Type::get(ctx); case phi::DataType::FLOAT16: - return ir::Float16Type::get(ctx); + return pir::Float16Type::get(ctx); case phi::DataType::FLOAT32: - return ir::Float32Type::get(ctx); + return pir::Float32Type::get(ctx); case phi::DataType::FLOAT64: - return ir::Float64Type::get(ctx); + return pir::Float64Type::get(ctx); case phi::DataType::UINT8: - return ir::UInt8Type::get(ctx); + return pir::UInt8Type::get(ctx); case phi::DataType::INT8: - return ir::Int8Type::get(ctx); + return pir::Int8Type::get(ctx); case phi::DataType::INT16: - return ir::Int16Type::get(ctx); + return pir::Int16Type::get(ctx); case phi::DataType::INT32: - return ir::Int32Type::get(ctx); + return pir::Int32Type::get(ctx); case phi::DataType::INT64: - return ir::Int64Type::get(ctx); + return pir::Int64Type::get(ctx); case phi::DataType::BOOL: - return ir::BoolType::get(ctx); + return pir::BoolType::get(ctx); case phi::DataType::COMPLEX64: - return ir::Complex64Type::get(ctx); + return pir::Complex64Type::get(ctx); case phi::DataType::COMPLEX128: - return ir::Complex128Type::get(ctx); + return pir::Complex128Type::get(ctx); default: PADDLE_THROW(phi::errors::Unimplemented( "Unsupported phi data type `%s` when casting it into " @@ -104,22 +104,22 @@ static inline ir::Type TransToIrDataType(phi::DataType dtype, } } -static inline ir::Attribute TransToIrAttribute(phi::Scalar scalar, - ir::IrContext* ctx = nullptr) { +static inline pir::Attribute TransToIrAttribute(phi::Scalar scalar, + pir::IrContext* ctx = nullptr) { if (ctx == nullptr) { - ctx = ir::IrContext::Instance(); + ctx = pir::IrContext::Instance(); } switch (scalar.dtype()) { case phi::DataType::FLOAT32: - return ir::FloatAttribute::get(ctx, scalar.to()); + return pir::FloatAttribute::get(ctx, scalar.to()); case phi::DataType::FLOAT64: - return ir::DoubleAttribute::get(ctx, scalar.to()); + return pir::DoubleAttribute::get(ctx, scalar.to()); case phi::DataType::INT32: - return ir::Int32Attribute::get(ctx, scalar.to()); + return pir::Int32Attribute::get(ctx, scalar.to()); case phi::DataType::INT64: - return ir::Int64Attribute::get(ctx, scalar.to()); + return pir::Int64Attribute::get(ctx, scalar.to()); case phi::DataType::BOOL: - return ir::BoolAttribute::get(ctx, scalar.to()); + return pir::BoolAttribute::get(ctx, scalar.to()); default: PADDLE_THROW(phi::errors::Unimplemented( "Unsupported phi data type `%s` when casting it into " @@ -166,7 +166,7 @@ inline DataType VarTypeToDataType( } } -VariantType GetAttributeData(const ir::Attribute& attr); +VariantType GetAttributeData(const pir::Attribute& attr); bool IsLegacyOp(const std::string& name); diff --git a/paddle/fluid/ir/phi_kernel_adaptor/CMakeLists.txt b/paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt similarity index 56% rename from paddle/fluid/ir/phi_kernel_adaptor/CMakeLists.txt rename to paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt index 1df1cc06db594..e1f8db179be6b 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/CMakeLists.txt +++ b/paddle/fluid/pir/phi_kernel_adaptor/CMakeLists.txt @@ -1,4 +1,4 @@ -# All source files of pd_dialect, except for the source file of op, which is generated in the compilation 
directory. +# All source files of pd_op_dialect, except for the source file of op, which is generated in the compilation directory. file(GLOB PHI_KERNEL_ADAPTOR_SRCS "*.cc") cc_library( diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h similarity index 62% rename from paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h rename to paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h index bb1b284ea1b6c..47c0d39856d2f 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h +++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h @@ -14,23 +14,23 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/infermeta.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_dialect.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/utils.h" +#include "paddle/fluid/pir/dialect/operator/interface/infermeta.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" @@ -43,19 +43,19 @@ #include "paddle/fluid/platform/init.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "glog/logging.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" class PhiKernelAdaptor { public: explicit PhiKernelAdaptor(paddle::framework::Scope* scope) : scope_(scope) {} - void run_kernel_prog(ir::Program* program) { + void run_kernel_prog(pir::Program* program) { auto block = program->block(); - std::unordered_map value_2_var_name; + std::unordered_map value_2_var_name; std::unordered_map variable_2_var_name; std::map var_name_2_id; @@ -70,9 +70,9 @@ class PhiKernelAdaptor { &variable_2_var_name, &var_name_2_id, &variable_list); - ir::IrContext* ctx = ir::IrContext::Instance(); + pir::IrContext* ctx = pir::IrContext::Instance(); - 
ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); auto* dev_ctx = phi::DeviceContextPool::Instance().Get(phi::CPUPlace()); phi::Place cpu_place(phi::AllocationType::CPU); @@ -80,9 +80,9 @@ class PhiKernelAdaptor { auto attr_map = (*it)->attributes(); auto op_name = - attr_map.at("op_name").dyn_cast().AsString(); + attr_map.at("op_name").dyn_cast().AsString(); - ir::OpInfo op1_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo op1_info = ctx->GetRegisteredOpInfo(op_name); auto impl = op1_info.GetInterfaceImpl(); @@ -96,7 +96,7 @@ class PhiKernelAdaptor { phi::InferMetaContext ctx; paddle::dialect::OpYamlInfoParser op_yaml_info_parser(yaml_info); - ir::BuildPhiContext< + pir::BuildPhiContext< phi::InferMetaContext, phi::MetaTensor, phi::MetaTensor, @@ -108,7 +108,7 @@ class PhiKernelAdaptor { infer_meta_impl->infer_meta_(&ctx); auto kernel_name = - attr_map.at("kernel_name").dyn_cast().AsString(); + attr_map.at("kernel_name").dyn_cast().AsString(); auto kernel_key = attr_map.at("kernel_key") .dyn_cast() .data(); @@ -118,17 +118,17 @@ class PhiKernelAdaptor { phi::KernelContext kernel_ctx(dev_ctx); - ir::BuildPhiContext, - paddle::small_vector, - true>((*it), - value_2_var_name, - scope_, - nullptr, - op_yaml_info_parser, - &kernel_ctx); + pir::BuildPhiContext, + paddle::small_vector, + true>((*it), + value_2_var_name, + scope_, + nullptr, + op_yaml_info_parser, + &kernel_ctx); kernel_fn(&kernel_ctx); auto out_value = (*it)->result(0); diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc similarity index 82% rename from paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc rename to paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc index c72641046f520..475e06f936f19 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.cc @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
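run_kernel_prog above is the whole interpreter loop: build the scope tables, then for each op read the op_name/kernel_name/kernel_key attributes, run InferMeta through BuildPhiContext, and invoke the selected phi kernel. A minimal driver sketch (not part of the diff; it assumes kernel_program has already been lowered to the kernel dialect, since the adaptor reads kernel attributes from each op):

#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_adaptor.h"

void RunLoweredProgram(pir::Program *kernel_program) {
  paddle::framework::Scope scope;
  PhiKernelAdaptor adaptor(&scope);
  // Executes every op in the program's block on CPU, materializing
  // inputs and outputs as variables inside `scope`.
  adaptor.run_kernel_prog(kernel_program);
}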
-#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" - -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/utils.h" +#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h" + +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" @@ -33,23 +33,23 @@ #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor_ref_array.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" #include "paddle/fluid/ir_adaptor/translator/op_compat_info.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" #include "paddle/phi/core/enforce.h" #include "glog/logging.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" -namespace ir { +namespace pir { -void AddNewData(ir::Value value, +void AddNewData(pir::Value value, std::string name, paddle::framework::Variable* var, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, @@ -71,10 +71,10 @@ void AddNewData(ir::Value value, "The size of variable_list and var_name_2_id map should be equal")); } -void RenameData(ir::Value value, +void RenameData(pir::Value value, std::string new_name, std::string orig_name, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id) { @@ -104,11 +104,11 @@ using VariableNameMap = std::unordered_map; paddle::framework::Variable* CreateVar( - ir::Value value, + pir::Value value, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, bool force_persisable, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, @@ -142,9 +142,9 @@ paddle::framework::Variable* CreateVar( } void CheckInputVars( - ir::Operation* op, + pir::Operation* op, const std::string& op_name, - const std::unordered_map& value_2_var_name) { + const std::unordered_map& value_2_var_name) { size_t input_num = op->num_operands(); if (input_num > 0) { for 
(size_t i = 0; i < input_num; ++i) { @@ -162,10 +162,10 @@ void CheckInputVars( } } -void BuildValue(ir::Value value, +void BuildValue(pir::Value value, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, @@ -190,12 +190,12 @@ void BuildValue(ir::Value value, var->GetMutable(); } else if (value.type().isa()) { var->GetMutable(); - } else if (value.type().isa()) { + } else if (value.type().isa()) { auto tensor_array = var->GetMutable(); - for (size_t i = 0; i < value.type().dyn_cast().size(); + for (size_t i = 0; i < value.type().dyn_cast().size(); i++) { PADDLE_ENFORCE(value.type() - .dyn_cast()[i] + .dyn_cast()[i] .isa(), paddle::platform::errors::Fatal( "Element of VectorType output only support " @@ -219,10 +219,10 @@ void BuildValue(ir::Value value, } void HandleForSpecialOp( - ir::Operation* op, + pir::Operation* op, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, @@ -230,13 +230,13 @@ void HandleForSpecialOp( std::string op_name = op->name(); if (op->attributes().count("op_name")) { op_name = - op->attributes().at("op_name").dyn_cast().AsString(); + op->attributes().at("op_name").dyn_cast().AsString(); } - if (op_name == "pd.fetch") { + if (op_name == "pd_op.fetch") { // fetch is a very special op, with no output auto fetch_src_name = - op->attributes().at("name").dyn_cast().AsString(); + op->attributes().at("name").dyn_cast().AsString(); auto fetch_var_name = fetch_src_name + "@fetch"; auto* var = const_cast(inner_scope->root()) @@ -253,13 +253,13 @@ void HandleForSpecialOp( variable_list); } - if (op_name == "pd.feed" || op_name == "pd.data") { + if (op_name == "pd_op.feed" || op_name == "pd_op.data") { VLOG(6) << "Handle for" << op_name; auto value = op->result(0); VLOG(6) << "link feed output to feed in variable" << inner_scope; std::string name = - op->attributes().at("name").dyn_cast().AsString(); + op->attributes().at("name").dyn_cast().AsString(); paddle::framework::Variable* var = inner_scope->FindVar(name); PADDLE_ENFORCE(var, paddle::platform::errors::InvalidArgument( @@ -310,7 +310,7 @@ void HandleForSpecialOp( VLOG(6) << "Handle for builtin.set_parameter:"; auto param_name = op->attributes() .at("parameter_name") - .dyn_cast() + .dyn_cast() .AsString(); auto value = op->operand_source(0); @@ -338,10 +338,10 @@ void HandleForSpecialOp( var_name_2_id); } - if (op_name == "pd.shadow_output") { - VLOG(6) << "Handle for pd.shadow_ouptut"; + if (op_name == "pd_op.shadow_output") { + VLOG(6) << "Handle for pd_op.shadow_ouptut"; auto var_name = - op->attributes().at("name").dyn_cast().AsString(); + op->attributes().at("name").dyn_cast().AsString(); auto value = op->operand_source(0); // change opreand name to param_name @@ -363,7 +363,7 @@ void HandleForSpecialOp( VLOG(6) << "Handle for builtin.get_parameter:"; auto param_name = op->attributes() .at("parameter_name") - .dyn_cast() + .dyn_cast() .AsString(); auto value = op->result(0); @@ -387,7 +387,7 @@ void HandleForSpecialOp( "input of buildin slice not in name map")); int index = - op->attributes().at("index").dyn_cast().data(); + op->attributes().at("index").dyn_cast().data(); auto in_var = inner_scope->FindVar(value_2_var_name->at(in_value)); auto variable_array = in_var->Get(); @@ 
-428,36 +428,36 @@ void HandleForSpecialOp( } void HandleForInplaceOp( - ir::Operation* op, + pir::Operation* op, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, std::vector* variable_list) { if (op->num_results() < 1) return; - ir::IrContext* ctx = ir::IrContext::Instance(); + pir::IrContext* ctx = pir::IrContext::Instance(); std::string op_name = op->name(); if (op->attributes().count("op_name")) { op_name = - op->attributes().at("op_name").dyn_cast().AsString(); + op->attributes().at("op_name").dyn_cast().AsString(); } - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); paddle::dialect::OpYamlInfoParser yaml_parser( op_info.GetInterfaceImpl() ->get_op_info_()); for (size_t i = 0; i < op->num_results(); ++i) { - ir::Value value = op->result(i); + pir::Value value = op->result(i); if (value.type().storage() == nullptr) { continue; } std::string value_name = yaml_parser.OutputNames()[i]; if (yaml_parser.HasInplace(value_name)) { const std::string& inplace_name = yaml_parser.InplaceName(value_name); - ir::Value inplace_value = + pir::Value inplace_value = op->operand_source(yaml_parser.InputName2Id().at(inplace_name)); std::string var_name = value_2_var_name->at(inplace_value); VLOG(4) << "inplace: " << value_name << " -> " << inplace_name @@ -465,7 +465,7 @@ void HandleForInplaceOp( value_2_var_name->emplace(value, var_name); } else if (yaml_parser.HasView(value_name)) { const std::string& view_name = yaml_parser.ViewName(value_name); - ir::Value view_value = + pir::Value view_value = op->operand_source(yaml_parser.InputName2Id().at(view_name)); const std::string& var_name = value_2_var_name->at(view_value); VLOG(4) << "view: " << value_name << " -> " << view_name @@ -485,10 +485,10 @@ void HandleForInplaceOp( // NOTE(zhiqiu): the persistable is created in inner_scope's root, and other is // created in inner_scope. 
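BuildScope, defined next, walks a block once and fills four side tables that the later BuildRuntimeContext and BuildPhiContext calls consume. A wiring sketch (not part of the diff; the variable-name prefix is arbitrary, and the container types follow the BuildScope declaration in phi_kernel_util.h further below):

#include "paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h"

void PrepareScope(const pir::Block &block, paddle::framework::Scope *scope) {
  std::unordered_map<pir::Value, std::string> value_2_var_name;
  std::unordered_map<const paddle::framework::Variable *, std::string>
      variable_2_var_name;
  std::map<std::string, int> var_name_2_id;
  std::vector<paddle::framework::Variable *> variable_list;
  // One pass over the block: creates (or renames) a variable per SSA value,
  // with special handling for feed/fetch, builtin.* and inplace ops.
  pir::BuildScope(block, scope, "inner_var_", &value_2_var_name,
                  &variable_2_var_name, &var_name_2_id, &variable_list);
}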
-void BuildScope(const ir::Block& block, +void BuildScope(const pir::Block& block, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, @@ -503,16 +503,16 @@ void BuildScope(const ir::Block& block, if (op->attributes().count("op_name")) { op_name = op->attributes() .at("op_name") - .dyn_cast() + .dyn_cast() .AsString(); } VLOG(4) << "build op:" << op_name; - if (op_name == "pd.feed" || op_name == "pd.fetch" || + if (op_name == "pd_op.feed" || op_name == "pd_op.fetch" || op_name == "builtin.combine" || op_name == "builtin.set_parameter" || op_name == "builtin.get_parameter" || op_name == "builtin.slice" || - op_name == "builtin.split" || op_name == "pd.data" || - op_name == "pd.shadow_output") { + op_name == "builtin.split" || op_name == "pd_op.data" || + op_name == "pd_op.shadow_output") { HandleForSpecialOp(op, inner_scope, var_name_prefix, @@ -529,7 +529,7 @@ void BuildScope(const ir::Block& block, if (op->attributes().count("is_inplace") != 0 && op->attributes() .at("is_inplace") - .dyn_cast() + .dyn_cast() .data()) { HandleForInplaceOp(op, inner_scope, @@ -559,8 +559,8 @@ void BuildScope(const ir::Block& block, } void BuildRuntimeContext( - ir::Operation* op, - const std::unordered_map& name_map, + pir::Operation* op, + const std::unordered_map& name_map, paddle::framework::Scope* scope, paddle::framework::Scope* local_scope, const paddle::dialect::OpYamlInfoParser& op_yaml_info, @@ -584,7 +584,7 @@ void BuildRuntimeContext( true, phi::errors::NotFound("param [%s] MUST in name2id map", name)); auto index = op_yaml_info.InputName2Id().at(name); - ir::Value ptr = op->operand_source(index); + pir::Value ptr = op->operand_source(index); auto in_var_name = name_map.at(ptr); VLOG(6) << "ctx->EmplaceBackInput: " << name << "\t" << in_var_name; @@ -602,7 +602,7 @@ void BuildRuntimeContext( auto& output_name_list = op_yaml_info.OutputNames(); for (size_t i = 0; i < output_name_list.size(); ++i) { auto name = output_name_list[i]; - ir::Value ptr = op->result(i); + pir::Value ptr = op->result(i); auto in_var_name = name_map.at(ptr); VLOG(6) << "ctx->EmplaceBackOutput: " << name << "\t" << in_var_name; @@ -618,7 +618,7 @@ void BuildRuntimeContext( if (type.isa() || type.isa()) { runtime_ctx->outputs[legacy_arg_name] = {var}; - } else if (type.isa()) { + } else if (type.isa()) { auto var_ref = var->Get(); std::vector vec_tmp; vec_tmp.reserve(var_ref.size()); @@ -629,14 +629,14 @@ void BuildRuntimeContext( } else { PADDLE_THROW(phi::errors::Unimplemented( "only support AllocatedDenseTensor, AllocatedSelectedRowsType and " - "ir::vector type")); + "pir::vector type")); } } } std::shared_ptr BuildOperatorBase( - ir::Operation* op, - const std::unordered_map& name_map, + pir::Operation* op, + const std::unordered_map& name_map, const paddle::dialect::OpYamlInfoParser& op_yaml_info, const std::unordered_map& variable_2_var_name, @@ -658,7 +658,7 @@ std::shared_ptr BuildOperatorBase( true, phi::errors::NotFound("param [%s] MUST in name2id map", name)); auto index = op_yaml_info.InputName2Id().at(name); - ir::Value ptr = op->operand_source(index); + pir::Value ptr = op->operand_source(index); auto in_var_name = name_map.at(ptr); @@ -672,52 +672,52 @@ std::shared_ptr BuildOperatorBase( for (auto& name : attr_name_list) { auto& val = op_attr_map.at(name); - if (val.isa()) { - attr_map[name] = val.dyn_cast().AsString(); - } else if (val.isa()) 
{ - attr_map[name] = val.dyn_cast().data(); - } else if (val.isa()) { - attr_map[name] = val.dyn_cast().data(); - } else if (val.isa()) { - attr_map[name] = val.dyn_cast().data(); - } else if (val.isa()) { - attr_map[name] = val.dyn_cast().data(); - } else if (val.isa()) { - attr_map[name] = val.dyn_cast().data(); - } else if (val.isa()) { - auto array_list = val.dyn_cast().AsVector(); + if (val.isa()) { + attr_map[name] = val.dyn_cast().AsString(); + } else if (val.isa()) { + attr_map[name] = val.dyn_cast().data(); + } else if (val.isa()) { + attr_map[name] = val.dyn_cast().data(); + } else if (val.isa()) { + attr_map[name] = val.dyn_cast().data(); + } else if (val.isa()) { + attr_map[name] = val.dyn_cast().data(); + } else if (val.isa()) { + attr_map[name] = val.dyn_cast().data(); + } else if (val.isa()) { + auto array_list = val.dyn_cast().AsVector(); PADDLE_ENFORCE( array_list.size() > 0, paddle::platform::errors::Fatal("Attribute %s is empty", name)); - if (array_list[0].isa()) { + if (array_list[0].isa()) { std::vector vec_int; for (auto attribute : array_list) { - vec_int.push_back(attribute.dyn_cast().data()); + vec_int.push_back(attribute.dyn_cast().data()); } attr_map[name] = vec_int; - } else if (array_list[0].isa()) { + } else if (array_list[0].isa()) { std::vector vec_int64; for (auto attribute : array_list) { - vec_int64.push_back(attribute.dyn_cast().data()); + vec_int64.push_back(attribute.dyn_cast().data()); } attr_map[name] = vec_int64; - } else if (array_list[0].isa()) { + } else if (array_list[0].isa()) { std::vector vec_bool; for (auto attribute : array_list) { - vec_bool.push_back(attribute.dyn_cast().data()); + vec_bool.push_back(attribute.dyn_cast().data()); } attr_map[name] = vec_bool; - } else if (array_list[0].isa()) { + } else if (array_list[0].isa()) { std::vector vec_float; for (auto attribute : array_list) { - vec_float.push_back(attribute.dyn_cast().data()); + vec_float.push_back(attribute.dyn_cast().data()); } attr_map[name] = vec_float; - } else if (array_list[0].isa()) { + } else if (array_list[0].isa()) { std::vector vec_double; for (auto attribute : array_list) { vec_double.push_back( - attribute.dyn_cast().data()); + attribute.dyn_cast().data()); } attr_map[name] = vec_double; } else { @@ -740,7 +740,7 @@ std::shared_ptr BuildOperatorBase( auto& output_name_list = op_yaml_info.OutputNames(); for (size_t i = 0; i < output_name_list.size(); ++i) { auto name = output_name_list[i]; - ir::Value ptr = op->result(i); + pir::Value ptr = op->result(i); auto out_var_name = name_map.at(ptr); @@ -749,7 +749,7 @@ std::shared_ptr BuildOperatorBase( if (type.isa() || type.isa()) { out_name_map[legacy_arg_name].push_back(out_var_name); - } else if (type.isa()) { + } else if (type.isa()) { auto var = scope->FindVar(out_var_name); auto var_ref = var->Get(); for (size_t k = 0; k < var_ref.size(); ++k) { @@ -761,7 +761,7 @@ std::shared_ptr BuildOperatorBase( } else { PADDLE_THROW(phi::errors::Unimplemented( "only support AllocatedDenseTensor, AllocatedSelectedRowsType and " - "ir::vector type")); + "pir::vector type")); } } @@ -773,4 +773,4 @@ std::shared_ptr BuildOperatorBase( return res; } -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h similarity index 74% rename from paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h rename to paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h index b1916d5418f77..037674467bc67 100644 --- 
a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h +++ b/paddle/fluid/pir/phi_kernel_adaptor/phi_kernel_util.h @@ -14,16 +14,16 @@ #pragma once -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/utils.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" #include "paddle/fluid/framework/scope.h" @@ -33,36 +33,36 @@ #include "paddle/phi/core/kernel_context.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" -#include "paddle/ir/core/type_name.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/pir/core/type_name.h" #include "glog/logging.h" -namespace ir { -void BuildScope(const ir::Block& block, +namespace pir { +void BuildScope(const pir::Block& block, paddle::framework::Scope* inner_scope, const std::string& var_name_prefix, - std::unordered_map* value_2_var_name, + std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, std::map* var_name_2_id, std::vector* variable_list); void BuildRuntimeContext( - ir::Operation* op, - const std::unordered_map& name_map, + pir::Operation* op, + const std::unordered_map& name_map, paddle::framework::Scope* scope, paddle::framework::Scope* local_scope, const paddle::dialect::OpYamlInfoParser& op_yaml_info, paddle::framework::RuntimeContext* runtime_ctx); std::shared_ptr BuildOperatorBase( - ir::Operation* op, - const std::unordered_map& name_map, + pir::Operation* op, + const std::unordered_map& name_map, const paddle::dialect::OpYamlInfoParser& op_yaml_info, const std::unordered_map& variable_2_var_name, @@ -74,12 +74,13 @@ template -void BuildPhiContext(ir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - Context* ctx) { +void BuildPhiContext( + pir::Operation* op, + const std::unordered_map& name_map, + paddle::framework::Scope* scope, + paddle::framework::Scope* local_scope, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + 
Context* ctx) { paddle::framework::Scope* inner_scope = local_scope != nullptr ? local_scope : scope; VLOG(6) << "Build " << get_type_name() << " in scope[" << scope @@ -96,7 +97,7 @@ void BuildPhiContext(ir::Operation* op, true, phi::errors::NotFound("param [%s] MUST in name2id map", t)); auto index = op_yaml_info.InputName2Id().at(t); - ir::Value ptr = op->operand_source(index); + pir::Value ptr = op->operand_source(index); if (!ptr) { phi::DenseTensor* ptr = nullptr; OutType in_ptr(ptr); @@ -142,7 +143,7 @@ void BuildPhiContext(ir::Operation* op, for (auto& t : vec_kernel_fn_attr_params) { if (name2id.count(t)) { // tensor attribute, get information from input - ir::Value ptr = op->operand_source(name2id.at(t)); + pir::Value ptr = op->operand_source(name2id.at(t)); auto in_var_name = name_map.at(ptr); @@ -153,7 +154,7 @@ void BuildPhiContext(ir::Operation* op, phi::Attribute attr = phi::TensorRef( &(inner_scope->FindVar(in_var_name)->Get())); ctx->EmplaceBackAttr(attr); - } else if (ptr.type().isa()) { + } else if (ptr.type().isa()) { auto& tensor_array = inner_scope->FindVar(in_var_name) ->Get(); if (tensor_array.size() == 1) { @@ -193,19 +194,20 @@ void BuildPhiContext(ir::Operation* op, } else if (attr_type_name == "paddle::dialect::DataTypeAttribute") { ctx->EmplaceBackAttr( attr_map[t].dyn_cast().data()); - } else if (attr_type_name == "ir::Int32Attribute") { - ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); - } else if (attr_type_name == "ir::Int64Attribute") { - ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); - } else if (attr_type_name == "ir::FloatAttribute") { - ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); - } else if (attr_type_name == "ir::BoolAttribute") { - ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); - } else if (attr_type_name == "ir::StrAttribute") { - ctx->EmplaceBackAttr(attr_map[t].dyn_cast().AsString()); + } else if (attr_type_name == "pir::Int32Attribute") { + ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); + } else if (attr_type_name == "pir::Int64Attribute") { + ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); + } else if (attr_type_name == "pir::FloatAttribute") { + ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); + } else if (attr_type_name == "pir::BoolAttribute") { + ctx->EmplaceBackAttr(attr_map[t].dyn_cast().data()); + } else if (attr_type_name == "pir::StrAttribute") { + ctx->EmplaceBackAttr( + attr_map[t].dyn_cast().AsString()); } else if (attr_type_name == - "ir::ArrayAttribute") { - auto array_list = attr_map[t].dyn_cast().AsVector(); + "pir::ArrayAttribute") { + auto array_list = attr_map[t].dyn_cast().AsVector(); std::vector vec_res; if (array_list.size() > 0) { PADDLE_ENFORCE_EQ( @@ -220,29 +222,29 @@ void BuildPhiContext(ir::Operation* op, } } ctx->EmplaceBackAttr(vec_res); - } else if (attr_type_name == "ir::ArrayAttribute") { - auto array_list = attr_map[t].dyn_cast().AsVector(); + } else if (attr_type_name == "pir::ArrayAttribute") { + auto array_list = attr_map[t].dyn_cast().AsVector(); std::vector vec_res; if (array_list.size() > 0) { PADDLE_ENFORCE_EQ( - array_list[0].isa(), + array_list[0].isa(), true, phi::errors::Unimplemented( - "the 0th elementwise MUST be ir::Int32Attribute")); + "the 0th elementwise MUST be pir::Int32Attribute")); for (size_t i = 0; i < array_list.size(); ++i) { vec_res.push_back( - array_list[i].dyn_cast().data()); + array_list[i].dyn_cast().data()); } } ctx->EmplaceBackAttr(vec_res); - } else if (attr_type_name == "ir::ArrayAttribute") { - auto array_list = 
attr_map[t].dyn_cast().AsVector(); + } else if (attr_type_name == "pir::ArrayAttribute") { + auto array_list = attr_map[t].dyn_cast().AsVector(); std::vector vec_res; if (array_list.size() > 0) { - if (array_list[0].isa()) { + if (array_list[0].isa()) { for (size_t i = 0; i < array_list.size(); ++i) { vec_res.push_back( - array_list[i].dyn_cast().data()); + array_list[i].dyn_cast().data()); } } else { @@ -251,37 +253,37 @@ void BuildPhiContext(ir::Operation* op, } } ctx->EmplaceBackAttr(vec_res); - } else if (attr_type_name == "ir::ArrayAttribute") { - auto array_list = attr_map[t].dyn_cast().AsVector(); + } else if (attr_type_name == "pir::ArrayAttribute") { + auto array_list = attr_map[t].dyn_cast().AsVector(); std::vector vec_res; if (array_list.size() > 0) { PADDLE_ENFORCE_EQ( - array_list[0].isa(), + array_list[0].isa(), true, phi::errors::PreconditionNotMet( - "Element in array list MUST be ir::Int64Attribute ")); + "Element in array list MUST be pir::Int64Attribute ")); for (size_t i = 0; i < array_list.size(); ++i) { vec_res.push_back( - array_list[i].dyn_cast().data()); + array_list[i].dyn_cast().data()); } } ctx->EmplaceBackAttr(vec_res); - } else if (attr_type_name == "ir::ArrayAttribute") { - auto array_list = attr_map[t].dyn_cast().AsVector(); + } else if (attr_type_name == "pir::ArrayAttribute") { + auto array_list = attr_map[t].dyn_cast().AsVector(); std::vector vec_res; if (array_list.size() > 0) { PADDLE_ENFORCE_EQ( - array_list[0].isa(), + array_list[0].isa(), true, phi::errors::PreconditionNotMet( - "Element in array list MUST be ir::Int64Attribute ")); + "Element in array list MUST be pir::Int64Attribute ")); for (size_t i = 0; i < array_list.size(); ++i) { vec_res.push_back( - array_list[i].dyn_cast().data()); + array_list[i].dyn_cast().data()); } } ctx->EmplaceBackAttr(vec_res); @@ -300,7 +302,7 @@ void BuildPhiContext(ir::Operation* op, // TODO(phlrain): use var type instead of op name for (size_t i = 0; i < op->num_results(); ++i) { - ir::Value out_ptr = op->result(i); + pir::Value out_ptr = op->result(i); auto out_type = out_ptr.type(); if (out_type) { auto& name = name_map.at(out_ptr); @@ -320,7 +322,7 @@ void BuildPhiContext(ir::Operation* op, ctx->EmplaceBackOutput(OutType(const_cast( &(inner_scope->FindVar(name_map.at(out_ptr)) ->Get())))); - } else if (out_type.isa()) { + } else if (out_type.isa()) { OutListType outputs; auto& variable_array = inner_scope->FindVar(name_map.at(out_ptr)) ->Get(); @@ -348,4 +350,4 @@ void BuildPhiContext(ir::Operation* op, VLOG(6) << "Done build phi context"; } -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/transforms/CMakeLists.txt b/paddle/fluid/pir/transforms/CMakeLists.txt similarity index 71% rename from paddle/fluid/ir/transforms/CMakeLists.txt rename to paddle/fluid/pir/transforms/CMakeLists.txt index 36e06410d338a..ce2cb40f0eba4 100644 --- a/paddle/fluid/ir/transforms/CMakeLists.txt +++ b/paddle/fluid/pir/transforms/CMakeLists.txt @@ -1,12 +1,12 @@ cc_library( transform_general_functions SRCS transform_general_functions.cc - DEPS pd_dialect_core) + DEPS pd_op_dialect_core) cc_library( pd_op_to_kernel_pass SRCS pd_op_to_kernel_pass.cc - DEPS pd_kernel_dialect pd_dialect_core pd_dialect_utils) + DEPS pd_kernel_dialect pd_op_dialect_core pd_op_dialect_utils) cc_library( _constant_folding_pass @@ -16,4 +16,4 @@ cc_library( cc_library( pd_inplace_pass SRCS inplace_pass.cc - DEPS pd_dialect_core op_yaml_info_parser) + DEPS pd_op_dialect_core op_yaml_info_parser) diff --git 
a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/pir/transforms/constant_folding_pass.cc similarity index 61% rename from paddle/fluid/ir/transforms/constant_folding_pass.cc rename to paddle/fluid/pir/transforms/constant_folding_pass.cc index 93699e3eae165..d3f78787841f0 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/pir/transforms/constant_folding_pass.cc @@ -12,71 +12,74 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/transforms/constant_folding_pass.h" +#include "paddle/fluid/pir/transforms/constant_folding_pass.h" #include #include #include // NOTE(zhangbo9674): File pd_op.h is generated by op_gen.py, see details in -// paddle/fluid/ir/dialect/CMakeLists.txt. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_op.h" +// paddle/fluid/pir/dialect/CMakeLists.txt. +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/framework/new_executor/interpretercore.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" -#include "paddle/fluid/ir/transforms/transform_general_functions.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/parameter.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/pass/pass.h" -#include "paddle/ir/pattern_rewrite/frozen_rewrite_pattern_set.h" -#include "paddle/ir/pattern_rewrite/pattern_match.h" -#include "paddle/ir/pattern_rewrite/pattern_rewrite_driver.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" +#include "paddle/fluid/pir/transforms/transform_general_functions.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/parameter.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pattern_rewrite/frozen_rewrite_pattern_set.h" +#include "paddle/pir/pattern_rewrite/pattern_match.h" +#include "paddle/pir/pattern_rewrite/pattern_rewrite_driver.h" namespace { -class ConstantFoldingPattern : public ir::RewritePattern { +class ConstantFoldingPattern : public pir::RewritePattern { public: - ConstantFoldingPattern(ir::IrContext* context, - ir::PatternBenefit benefit = 1, + ConstantFoldingPattern(pir::IrContext* context, + pir::PatternBenefit benefit = 1, const std::vector& generated_names = {}) : RewritePattern(MatchAnyOpTypeTag(), benefit, context, generated_names) { } - bool Match(ir::Operation* op) const override { + bool Match(pir::Operation* op) const override { // TODO(liuyuanle): Use trait to improve robustness. - if (op->dyn_cast() || - op->dyn_cast() || + if (op->dyn_cast() || + op->dyn_cast() || op->dyn_cast()) return false; // Inputs must come from get parameter op. 
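The Match/Rewrite pair above follows the usual two-phase rewrite-pattern design: Match only inspects the op, and the loop that follows rejects any op whose operands are not produced by a get_parameter op, while Rewrite performs the actual mutation. The dyn_cast targets in the guard lost their template arguments in extraction; they appear to be the parameter-handling ops and the fetch op, but that is an assumption. A minimal, framework-free sketch of the same match predicate, using hypothetical ToyOp/ToyValue stand-ins rather than the real pir API:

#include <string>
#include <vector>

// Hypothetical stand-ins for pir::Operation / pir::Value; only the fields
// the match predicate needs are modeled here.
struct ToyOp;
struct ToyValue {
  ToyOp* defining_op = nullptr;  // op that produced this value
};
struct ToyOp {
  std::string name;  // e.g. "builtin.get_parameter", "pd_op.add"
  std::vector<ToyValue*> operands;
};

// An op is a constant-folding candidate only if it is not itself a
// parameter/fetch op (the excluded names below are assumptions; the original
// template arguments were lost) and every operand is defined by
// builtin.get_parameter, i.e. all of its inputs are compile-time constants.
bool MatchForConstantFolding(const ToyOp& op) {
  if (op.name == "builtin.get_parameter" ||
      op.name == "builtin.set_parameter" || op.name == "pd_op.fetch") {
    return false;
  }
  for (const ToyValue* in : op.operands) {
    if (in == nullptr || in->defining_op == nullptr ||
        in->defining_op->name != "builtin.get_parameter") {
      return false;
    }
  }
  return true;
}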
for (uint32_t i = 0; i < op->num_operands(); ++i) - if (ir::GetDefiningOpForInput(op, i)->dyn_cast() == + if (pir::GetDefiningOpForInput(op, i)->dyn_cast() == nullptr) return false; return true; } - void Rewrite(ir::Operation* op, - ir::PatternRewriter& rewriter) const override { // NOLINT - ir::Program* program = op->GetParentProgram(); + void Rewrite(pir::Operation* op, + pir::PatternRewriter& rewriter) const override { // NOLINT + pir::Program* program = op->GetParentProgram(); auto temp_program = BuildProgramFromOperation(op); std::vector fetch_var_names; auto block = temp_program->block(); for (auto it = block->begin(); it != block->end(); ++it) { - if ((*it)->name() == "pd.fetch") { - size_t index = - (*it)->attributes().at("col").dyn_cast().data(); + if ((*it)->name() == "pd_op.fetch") { + size_t index = (*it) + ->attributes() + .at("col") + .dyn_cast() + .data(); if (fetch_var_names.size() < index + 1) { fetch_var_names.resize(index + 1); @@ -85,7 +88,7 @@ class ConstantFoldingPattern : public ir::RewritePattern { fetch_var_names[index] = (*it) ->attributes() .at("name") - .dyn_cast() + .dyn_cast() .AsString() + "@fetch"; } @@ -104,10 +107,11 @@ class ConstantFoldingPattern : public ir::RewritePattern { // TODO(liuyuanle): Support multiple output. auto out_tensor = PADDLE_GET_CONST(phi::DenseTensor, fetch_list[0]); - std::unique_ptr parameter = std::make_unique( - reinterpret_cast(out_tensor.data()), - out_tensor.numel() * phi::SizeOf(out_tensor.dtype()), - op->result(0).type()); + std::unique_ptr parameter = + std::make_unique( + reinterpret_cast(out_tensor.data()), + out_tensor.numel() * phi::SizeOf(out_tensor.dtype()), + op->result(0).type()); std::string param_name = "@constant_folding_pass@_" + std::to_string(suffix_++); @@ -119,20 +123,20 @@ class ConstantFoldingPattern : public ir::RewritePattern { program->SetParameter(param_name, std::move(parameter)); // rewriter.SetInsertionPoint(op); auto get_parameter_op = - rewriter.Build(param_name, op->result(0).type()); + rewriter.Build(param_name, op->result(0).type()); rewriter.ReplaceAllUsesWith(op->result(0), get_parameter_op->result(0)); rewriter.EraseOp(op); } private: - std::unique_ptr BuildProgramFromOperation( - ir::Operation* op) const { - auto program = std::make_unique(ir_context()); - ir::Builder builder = ir::Builder(ir_context(), program->block()); + std::unique_ptr BuildProgramFromOperation( + pir::Operation* op) const { + auto program = std::make_unique(ir_context()); + pir::Builder builder = pir::Builder(ir_context(), program->block()); // prepare op inputs - std::vector op_inputs; + std::vector op_inputs; for (uint32_t i = 0; i < op->num_operands(); i++) { PADDLE_ENFORCE_EQ( op->operand_source(i).type().isa(), @@ -141,22 +145,22 @@ class ConstantFoldingPattern : public ir::RewritePattern { "Op's input must be a dense tensor type.")); auto [param_name, param] = - ir::GetParameterFromValue(op->operand_source(i)); + pir::GetParameterFromValue(op->operand_source(i)); program->SetParameter(param_name, - std::make_unique(*param)); + std::make_unique(*param)); auto* param_var = scope_.FindVar(param_name); PADDLE_ENFORCE_NOT_NULL( param_var, phi::errors::InvalidArgument("Parameter var not in scope.")); - auto get_parameter_op = builder.Build( + auto get_parameter_op = builder.Build( param_name, op->operand_source(i).type()); op_inputs.push_back(get_parameter_op->result(0)); } // prepare op outputs - std::vector output_types; + std::vector output_types; for (uint32_t i = 0; i < op->num_results(); i++) { 
output_types.push_back(op->result(i).type()); } @@ -185,39 +189,39 @@ class ConstantFoldingPattern : public ir::RewritePattern { inline static paddle::framework::interpreter::ExecutionConfig exe_config_{}; }; -class ConstantFoldingPass : public ir::Pass { +class ConstantFoldingPass : public pir::Pass { public: // TODO(liuyuanle): Naming convention for pass. - ConstantFoldingPass() : ir::Pass("ConstantFoldingPass", 1) {} + ConstantFoldingPass() : pir::Pass("ConstantFoldingPass", 1) {} - bool Initialize(ir::IrContext* context) override { - ir::RewritePatternSet ps(context); + bool Initialize(pir::IrContext* context) override { + pir::RewritePatternSet ps(context); ps.Add(context); - patterns_ = ir::FrozenRewritePatternSet(std::move(ps)); + patterns_ = pir::FrozenRewritePatternSet(std::move(ps)); return true; } - void Run(ir::Operation* op) override { - ir::GreedyRewriteConfig cfg; + void Run(pir::Operation* op) override { + pir::GreedyRewriteConfig cfg; cfg.use_top_down_traversal = true; cfg.max_iterations = 10; - ir::ApplyPatternsGreedily(op->region(0), patterns_, cfg); + pir::ApplyPatternsGreedily(op->region(0), patterns_, cfg); } - bool CanApplyOn(ir::Operation* op) const override { + bool CanApplyOn(pir::Operation* op) const override { return op->name() == "builtin.module" && op->num_regions() > 0; } private: - ir::FrozenRewritePatternSet patterns_; + pir::FrozenRewritePatternSet patterns_; }; } // namespace -namespace ir { +namespace pir { std::unique_ptr CreateConstantFoldingPass() { return std::make_unique(); } -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.h b/paddle/fluid/pir/transforms/constant_folding_pass.h similarity index 90% rename from paddle/fluid/ir/transforms/constant_folding_pass.h rename to paddle/fluid/pir/transforms/constant_folding_pass.h index 0c5ca794ad5bc..b49c9d90493b1 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.h +++ b/paddle/fluid/pir/transforms/constant_folding_pass.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/ir/core/dll_decl.h" +#include "paddle/pir/core/dll_decl.h" -namespace ir { +namespace pir { class Pass; IR_API std::unique_ptr CreateConstantFoldingPass(); -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/transforms/inplace_pass.cc b/paddle/fluid/pir/transforms/inplace_pass.cc similarity index 70% rename from paddle/fluid/ir/transforms/inplace_pass.cc rename to paddle/fluid/pir/transforms/inplace_pass.cc index 222abc8344895..adfa5866799b9 100644 --- a/paddle/fluid/ir/transforms/inplace_pass.cc +++ b/paddle/fluid/pir/transforms/inplace_pass.cc @@ -12,25 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
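The inplace pass whose implementation follows walks a block and, for each kernel-dialect op, asks whether an output may reuse an input's buffer; its CanBeDeleted/CanDoInplace helpers encode three conditions: matching types, the input dying at this op (eagerly deletable), and the input not being a persistable parameter. A minimal sketch that folds those checks into one predicate, with a hypothetical ToyValue standing in for pir::Value:

#include <string>
#include <unordered_set>

// Hypothetical stand-in for pir::Value, modeling only what the decision uses.
struct ToyValue {
  std::string type;          // stand-in for pir::Type
  bool persistable = false;  // parameters must never be overwritten
};

// Output may reuse input's buffer only when the types match, the input is
// not persistable, and the input buffer is dead after this op (i.e. it is
// in the eager-deletion set).
bool CanDoInplaceSketch(const std::unordered_set<const ToyValue*>& eager_dels,
                        const ToyValue* input, const ToyValue* output) {
  if (input->type != output->type) return false;
  if (input->persistable) return false;
  return eager_dels.count(input) != 0;
}

Keeping the decision a pure predicate over precomputed liveness information, as the real pass does, is what lets the pass stay a single linear scan over the block.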
-#include "paddle/fluid/ir/transforms/inplace_pass.h" - -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/pass/pass.h" -#include "paddle/ir/pass/pass_registry.h" +#include "paddle/fluid/pir/transforms/inplace_pass.h" + +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_registry.h" namespace details { // NOTE(zhangbo): Which kind of value can be deleted? // (1) Value's type needs to be AllocatedDenseTensorType or // AllocatedSelectedRowsType; (2) Value's is not persisable. -static bool CanBeDeleted(ir::Value value) { +static bool CanBeDeleted(pir::Value value) { if (!value.type()) { return false; } @@ -41,17 +41,17 @@ static bool CanBeDeleted(ir::Value value) { if (value.GetDefiningOp()->HasAttribute(kAttrIsPersisable)) { return !(value.GetDefiningOp() ->attribute(kAttrIsPersisable) - .dyn_cast<::ir::ArrayAttribute>() - .AsVector()[value.dyn_cast<::ir::OpResult>().GetResultIndex()] - .dyn_cast<::ir::BoolAttribute>() + .dyn_cast() + .AsVector()[value.dyn_cast().GetResultIndex()] + .dyn_cast() .data()); } return true; } -static bool CanDoInplace(const std::unordered_set& eager_dels, - ir::Value input, - ir::Value output) { +static bool CanDoInplace(const std::unordered_set& eager_dels, + pir::Value input, + pir::Value output) { if (input.type() != output.type()) { VLOG(9) << " -- input's type != output's type, can't do inplace"; return false; @@ -63,16 +63,17 @@ static bool CanDoInplace(const std::unordered_set& eager_dels, return true; } -static bool IsNoNeedBuffer(ir::Operation* op, ir::Value value) { - if (op->dialect()->name().compare( - paddle::dialect::PaddleKernelDialect::name()) != 0) { +static bool IsNoNeedBuffer(pir::Operation* op, pir::Value value) { + if (op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) != + 0) { VLOG(8) << op->name() << "is not a kernel_dialect op, no need buffer is false"; return false; } auto op_name = - op->attributes().at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name); + op->attributes().at("op_name").dyn_cast().AsString(); + pir::OpInfo op_info = + pir::IrContext::Instance()->GetRegisteredOpInfo(op_name); if (op_info) { auto info_interface = op_info.GetInterfaceImpl(); @@ -90,27 +91,26 @@ static bool IsNoNeedBuffer(ir::Operation* op, ir::Value value) { return false; } -// NOTE(zhangbo): pd.feed's output and pd.fetch's input can not be eager +// 
NOTE(zhangbo): pd_op.feed's output and pd_op.fetch's input can not be eager // deleted. -static std::unordered_set GetSkipDeletionValues(ir::Block* block) { - std::unordered_set skip_dels; +static std::unordered_set GetSkipDeletionValues(pir::Block* block) { + std::unordered_set skip_dels; for (auto& op : *block) { - if (op->dialect()->name().compare( - paddle::dialect::PaddleKernelDialect::name()) != 0) { + if (op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) != + 0) { continue; } IR_ENFORCE(op->attributes().count("op_name") > 0, "kernel_dialect op should own an 'op_name' attribute."); - auto upper_op_name = op->attributes() - .at("op_name") - .dyn_cast<::ir::StrAttribute>() - .AsString(); + auto upper_op_name = + op->attributes().at("op_name").dyn_cast().AsString(); - if (upper_op_name == "pd.feed" || upper_op_name == "pd.data") { + if (upper_op_name == "pd_op.feed" || upper_op_name == "pd_op.data") { skip_dels.insert(op->result(0)); continue; } - if (upper_op_name == "pd.fetch" || upper_op_name == "pd.shadow_output") { + if (upper_op_name == "pd_op.fetch" || + upper_op_name == "pd_op.shadow_output") { skip_dels.insert(op->operand_source(0)); continue; } @@ -121,20 +121,20 @@ static std::unordered_set GetSkipDeletionValues(ir::Block* block) { // NOTE(zhangbo): For inplace Pass, currently only the kernel_dialect operator // is supported. Therefore, this function only returns the values in the // kernel_dialect operator that can be eager deleted. -static std::unordered_map> -GetEagerDeletionValues(ir::Block* block) { - std::unordered_set skip_dels = GetSkipDeletionValues(block); +static std::unordered_map> +GetEagerDeletionValues(pir::Block* block) { + std::unordered_set skip_dels = GetSkipDeletionValues(block); - std::unordered_map del_value_2_op; + std::unordered_map del_value_2_op; for (auto& op : *block) { std::string upper_op_name = op->name(); - if (op->dialect()->name().compare( - paddle::dialect::PaddleKernelDialect::name()) == 0) { + if (op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) == + 0) { IR_ENFORCE(op->attributes().count("op_name") > 0, "kernel_dialect op should own an 'op_name' attribute."); upper_op_name = op->attributes() .at("op_name") - .dyn_cast<::ir::StrAttribute>() + .dyn_cast() .AsString(); } @@ -154,14 +154,15 @@ GetEagerDeletionValues(ir::Block* block) { } for (size_t i = 0; i < op->num_results(); ++i) { - ir::Value output = op->result(i); + pir::Value output = op->result(i); if (output && CanBeDeleted(output)) { del_value_2_op[output] = op; } } } - std::unordered_map> eager_dels; + std::unordered_map> + eager_dels; for (auto& kv : del_value_2_op) { eager_dels[kv.second].insert(kv.first); } @@ -169,23 +170,23 @@ GetEagerDeletionValues(ir::Block* block) { return eager_dels; } -static std::unordered_map GetInplaceOps( - ir::Block* block) { +static std::unordered_map GetInplaceOps( + pir::Block* block) { const auto eager_dels = GetEagerDeletionValues(block); - std::unordered_map inplace_ops; + std::unordered_map inplace_ops; - std::unordered_set visited_values; - std::unordered_set reused_input_values; - std::unordered_set reused_output_values; + std::unordered_set visited_values; + std::unordered_set reused_input_values; + std::unordered_set reused_output_values; for (auto& op : *block) { for (size_t i = 0; i < op->num_operands(); ++i) { visited_values.insert(op->operand_source(i)); } - if (op->dialect()->name().compare( - paddle::dialect::PaddleKernelDialect::name()) != 0) { + if 
(op->dialect()->name().compare(paddle::dialect::KernelDialect::name()) != + 0) { VLOG(6) << op->name() << "is not a kernel_dialect op, inplace only support " "kernel_dialect operators"; @@ -197,13 +198,13 @@ static std::unordered_map GetInplaceOps( auto upper_op_attrs = op->attributes(); auto upper_op_name = - upper_op_attrs.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + upper_op_attrs.at("op_name").dyn_cast().AsString(); VLOG(6) << "analyse op: " << upper_op_name; // NOTE(zhangbo): add_grad cpu kernel can't do inplace, for the reason shown // in the function: CommonElementwiseBroadcastBackward // (paddle/phi/kernels/funcs/elementwise_grad_base.h) - if ((upper_op_name == "pd.add_grad") && + if ((upper_op_name == "pd_op.add_grad") && (upper_op_attrs.at("kernel_key") .dyn_cast() .data() @@ -215,7 +216,7 @@ static std::unordered_map GetInplaceOps( } if (upper_op_attrs.count("is_inplace") != 0 && - upper_op_attrs.at("is_inplace").dyn_cast().data()) { + upper_op_attrs.at("is_inplace").dyn_cast().data()) { VLOG(6) << upper_op_name << " is already an inplace op."; for (size_t i = 0; i < op->num_operands(); ++i) { reused_input_values.insert(op->operand_source(i)); @@ -227,8 +228,8 @@ static std::unordered_map GetInplaceOps( continue; } - ir::OpInfo upper_inplace_op_info = - ir::IrContext::Instance()->GetRegisteredOpInfo(upper_op_name + "_"); + pir::OpInfo upper_inplace_op_info = + pir::IrContext::Instance()->GetRegisteredOpInfo(upper_op_name + "_"); if (eager_dels.count(op) == 0 || (!upper_inplace_op_info)) { VLOG(6) << upper_op_name @@ -300,12 +301,12 @@ static std::unordered_map GetInplaceOps( } } // namespace details -class InplacePass : public ir::Pass { +class InplacePass : public pir::Pass { public: - InplacePass() : ir::Pass("InplacePass", 3) {} + InplacePass() : pir::Pass("InplacePass", 3) {} - void Run(ir::Operation* op) override { - auto module_op = op->dyn_cast(); + void Run(pir::Operation* op) override { + auto module_op = op->dyn_cast(); IR_ENFORCE(module_op, "InplacePass should run on module op."); auto* block = module_op.block(); @@ -315,9 +316,9 @@ class InplacePass : public ir::Pass { VLOG(6) << "Do inplace for: " << kv.first->attributes() .at("op_name") - .dyn_cast<::ir::StrAttribute>() + .dyn_cast() .AsString(); - ir::Block::iterator insert_pos = + pir::Block::iterator insert_pos = std::find(block->begin(), block->end(), kv.first); IR_ENFORCE(insert_pos != block->end(), "Operator %s not found in block.", @@ -325,26 +326,26 @@ class InplacePass : public ir::Pass { kv.first->set_attribute( "op_name", - ir::StrAttribute::get(ir::IrContext::Instance(), kv.second)); + pir::StrAttribute::get(pir::IrContext::Instance(), kv.second)); kv.first->set_attribute( "is_inplace", - ir::BoolAttribute::get(ir::IrContext::Instance(), true)); + pir::BoolAttribute::get(pir::IrContext::Instance(), true)); } LOG_FIRST_N(INFO, 1) << "Apply inplace pass on lowering ::ir::Program to Kernel Dialect."; } - bool CanApplyOn(ir::Operation* op) const override { + bool CanApplyOn(pir::Operation* op) const override { return op->name() == "builtin.module" && op->num_regions() > 0; } }; -namespace ir { +namespace pir { -std::unique_ptr CreateInplacePass() { +std::unique_ptr CreateInplacePass() { return std::make_unique(); } -} // namespace ir +} // namespace pir REGISTER_IR_PASS(inplace, InplacePass); diff --git a/paddle/fluid/ir/transforms/inplace_pass.h b/paddle/fluid/pir/transforms/inplace_pass.h similarity index 90% rename from paddle/fluid/ir/transforms/inplace_pass.h rename to 
paddle/fluid/pir/transforms/inplace_pass.h index 028d6a9eb94e8..c6d540243edc9 100644 --- a/paddle/fluid/ir/transforms/inplace_pass.h +++ b/paddle/fluid/pir/transforms/inplace_pass.h @@ -15,12 +15,12 @@ #pragma once #include -#include "paddle/ir/core/dll_decl.h" +#include "paddle/pir/core/dll_decl.h" -namespace ir { +namespace pir { class Pass; std::unique_ptr CreateInplacePass(); -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc similarity index 51% rename from paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc rename to paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc index 3555ebe354ab7..29b1df63a8562 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.cc @@ -14,19 +14,19 @@ #include -#include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" - -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/trait/inplace.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_parser.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/op_yaml_info_util.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_attribute.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_op.h" -#include "paddle/fluid/ir/dialect/paddle_kernel_dialect/ir/kernel_type.h" +#include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" + +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_attribute.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_dialect.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_op.h" +#include "paddle/fluid/pir/dialect/kernel/ir/kernel_type.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/trait/inplace.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_parser.h" +#include "paddle/fluid/pir/dialect/operator/utils/op_yaml_info_util.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" @@ -51,177 +51,23 @@ std::unordered_map Str2PhiDataType = { }; const std::unordered_set UnchangeOutputOps = { - "pd.data", + "pd_op.data", "builtin.combine", "builtin.slice", "builtin.split", - "pd.feed", - "pd.fetch", + "pd_op.feed", + "pd_op.fetch", "builtin.set_parameter", "builtin.get_parameter", - "pd.shadow_output"}; + "pd_op.shadow_output"}; -const std::unordered_set SpecialOpList = { - "builtin.combine", "builtin.slice", "builtin.split"}; - -ir::OpResult GetNewInput( - const ir::Value cur_in, - const std::unordered_map& map_value_pair, - const int index, - const std::string op_name) { - PADDLE_ENFORCE_EQ( - map_value_pair.count(cur_in), - true, - phi::errors::PreconditionNotMet( - "[%d]'s input of [%s] op MUST be in map pair", index, op_name)); - auto new_in = map_value_pair.at(cur_in); - return new_in; -} - -void DealWithSpecialBuiltinOps( - ir::Operation* op_item, - 
ir::Program* program, - std::unordered_map* map_op_pair, - std::unordered_map* map_value_pair, - ir::IrContext* ctx) { - if (op_item->name() == "builtin.combine") { - std::vector out_places; - // Copy op inputs - std::vector vec_inputs; - std::vector vec_inner_types; - if (op_item->num_operands() > 0) { - for (size_t i = 0; i < op_item->num_operands(); ++i) { - auto cur_in = op_item->operand_source(i); - if (!cur_in) { - vec_inputs.emplace_back(); - continue; - } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); - vec_inputs.push_back(new_in); - vec_inner_types.push_back(new_in.type()); - if (new_in.type().isa()) { - out_places.push_back( - new_in.type() - .dyn_cast() - .place()); - } else if (new_in.type() - .isa()) { - out_places.push_back( - new_in.type() - .dyn_cast() - .place()); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "only support dense tensor type for now")); - } - } - } - // Copy op output type - std::vector op_output_types; - ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); - op_output_types.push_back(t1); - - // Get op info - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); - // Generate new op - ir::Operation* op = ir::Operation::Create( - vec_inputs, op_item->attributes(), op_output_types, op_info); - program->block()->push_back(op); - (*map_op_pair)[op_item] = op; - // only deal with single output - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - (*map_value_pair)[op_item->result(i)] = op->result(i); - } - } - } - - if (op_item->name() == "builtin.slice") { - std::vector vec_inputs; - std::vector op_output_types; - if (op_item->num_operands() > 0) { - for (size_t i = 0; i < op_item->num_operands(); ++i) { - auto cur_in = op_item->operand_source(i); - if (!cur_in) { - vec_inputs.emplace_back(); - continue; - } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); - vec_inputs.push_back(new_in); - if (new_in.type().isa()) { - auto vec_types = new_in.type().dyn_cast().data(); - auto index = op_item->attributes() - .at("index") - .dyn_cast() - .data(); - op_output_types.push_back(vec_types[index]); - } else { - PADDLE_THROW( - phi::errors::Unimplemented("only support vector type for now")); - } - } - } - - // Get op info - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); - // Generate new op - ir::Operation* op = ir::Operation::Create( - vec_inputs, op_item->attributes(), op_output_types, op_info); - program->block()->push_back(op); - (*map_op_pair)[op_item] = op; - // only deal with single output - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - (*map_value_pair)[op_item->result(i)] = op->result(i); - } - } - } - - if (op_item->name() == "builtin.split") { - std::vector out_places(op_item->num_results()); - // Copy op inputs - std::vector vec_inputs; - std::vector op_output_types; - if (op_item->num_operands() > 0) { - for (size_t i = 0; i < op_item->num_operands(); ++i) { - auto cur_in = op_item->operand_source(i); - if (!cur_in) { - vec_inputs.emplace_back(); - continue; - } - auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); - vec_inputs.push_back(new_in); - - if (new_in.type().isa()) { - auto vec_types = new_in.type().dyn_cast().data(); - for (uint64_t idx = 0; idx < vec_types.size(); idx++) { - op_output_types.push_back(vec_types[idx]); - } - } else { - PADDLE_THROW( - phi::errors::Unimplemented("only support vector type for now")); - } - } - } - - // Get op info - 
ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); - // Generate new op - ir::Operation* op = ir::Operation::Create( - vec_inputs, op_item->attributes(), op_output_types, op_info); - program->block()->push_back(op); - (*map_op_pair)[op_item] = op; - // only deal with single output - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - (*map_value_pair)[op_item->result(i)] = op->result(i); - } - } - } - VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); -} +const std::unordered_set SpecialLowerOps = { + "builtin.combine", + "builtin.slice", + "builtin.split", +}; -bool NeedFallBackCpu(const ir::Operation* op, +bool NeedFallBackCpu(const pir::Operation* op, const std::string& kernel_fn_name, const phi::KernelKey& kernel_key) { if (UnchangeOutputOps.count(op->name())) { @@ -255,7 +101,7 @@ bool NeedFallBackCpu(const ir::Operation* op, phi::Backend GetDstBackend(const std::string& op_name, phi::Place place, - OpYamlInfoParser* op_yaml_info_parser, + const OpYamlInfoParser* op_yaml_info_parser, phi::Backend kernel_def_backend, size_t input_index) { if (op_name == "builtin.set_parameter" && @@ -275,14 +121,16 @@ phi::Backend GetDstBackend(const std::string& op_name, return dst_backend; } -bool NeedFallBackFromGPUDNN2GPU(ir::Operation* op, +bool NeedFallBackFromGPUDNN2GPU(pir::Operation* op, const phi::KernelKey kernel_key) { // NOTE(phlrain): keep the same kernel select strategy with // GetExepectKernelKey - if (op->name() == "pd.pool2d" || op->name() == "pd.pool2d_grad") { + if (op->name() == "pd_op.pool2d" || op->name() == "pd_op.pool2d_grad") { if (kernel_key.backend() == phi::Backend::GPUDNN && - (op->attributes().at("adaptive").dyn_cast().data() == - true)) { + (op->attributes() + .at("adaptive") + .dyn_cast() + .data() == true)) { return true; } } @@ -290,26 +138,26 @@ bool NeedFallBackFromGPUDNN2GPU(ir::Operation* op, return false; } -std::set GetSkipFeedNames(ir::Block* block) { +std::set GetSkipFeedNames(pir::Block* block) { std::set data_op_names; for (auto op_item : *block) { - if (op_item->name() == "pd.data") { + if (op_item->name() == "pd_op.data") { data_op_names.insert(op_item->attributes() .at("name") - .dyn_cast() + .dyn_cast() .AsString()); } } return data_op_names; } -bool SkipFeedOp(ir::Operation* op, const std::set& feed_names) { +bool SkipFeedOp(pir::Operation* op, const std::set& feed_names) { return feed_names.count( - op->attributes().at("name").dyn_cast().AsString()); + op->attributes().at("name").dyn_cast().AsString()); } std::vector> GetFakeTensorList( - ir::Value new_input_tmp) { + pir::Value new_input_tmp) { std::vector> vec_res; auto input_type = new_input_tmp.type(); @@ -356,8 +204,8 @@ std::vector> GetFakeTensorList( } else if (input_type.isa()) { vec_res.push_back(build_fake_selected_rows( input_type.dyn_cast())); - } else if (input_type.isa()) { - auto vec_inner_types = input_type.dyn_cast().data(); + } else if (input_type.isa()) { + auto vec_inner_types = input_type.dyn_cast().data(); for (size_t i = 0; i < vec_inner_types.size(); ++i) { if (vec_inner_types[i].isa()) { vec_res.push_back(build_fake_dense_tensor( @@ -372,29 +220,29 @@ std::vector> GetFakeTensorList( return vec_res; } -ir::OpResult AddPlaceTransferOp(ir::OpResult in, - ir::Type out_type, - const phi::Place& src_place, - const phi::Place& dst_place, - const phi::KernelKey& kernel_key, - ir::Program* program) { - ir::IrContext* ctx = ir::IrContext::Instance(); +pir::OpResult AddPlaceTransferOp(pir::OpResult in, + pir::Type out_type, + 
const phi::Place& src_place, + const phi::Place& dst_place, + const phi::KernelKey& kernel_key, + pir::Program* program) { + pir::IrContext* ctx = pir::IrContext::Instance(); std::string op_name = paddle::dialect::PhiKernelOp::name(); - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_name); if ((src_place.GetType() == phi::AllocationType::CPU) && (dst_place.GetType() == phi::AllocationType::GPU)) { auto copy_kernel_key = kernel_key; copy_kernel_key.set_backend(phi::Backend::GPU); - std::unordered_map op_attribute{ - {"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_h2d")}, - {"kernel_name", ir::StrAttribute::get(ctx, "memcpy_h2d")}, + std::unordered_map op_attribute{ + {"op_name", pir::StrAttribute::get(ctx, "pd_op.memcpy_h2d")}, + {"kernel_name", pir::StrAttribute::get(ctx, "memcpy_h2d")}, {"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)}, - {"dst_place_type", ir::Int32Attribute::get(ctx, 1)}}; + {"dst_place_type", pir::Int32Attribute::get(ctx, 1)}}; - ir::Operation* op = - ir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation* op = + pir::Operation::Create({in}, op_attribute, {out_type}, op_info); if (in.GetDefiningOp()->HasAttribute(kAttrIsPersisable)) { op->set_attribute(kAttrIsPersisable, @@ -409,14 +257,14 @@ ir::OpResult AddPlaceTransferOp(ir::OpResult in, (dst_place.GetType() == phi::AllocationType::CPU)) { auto copy_kernel_key = kernel_key; copy_kernel_key.set_backend(phi::Backend::GPU); - std::unordered_map op_attribute{ - {"op_name", ir::StrAttribute::get(ctx, "pd.memcpy_d2h")}, - {"kernel_name", ir::StrAttribute::get(ctx, "memcpy_d2h")}, + std::unordered_map op_attribute{ + {"op_name", pir::StrAttribute::get(ctx, "pd_op.memcpy_d2h")}, + {"kernel_name", pir::StrAttribute::get(ctx, "memcpy_d2h")}, {"kernel_key", dialect::KernelAttribute::get(ctx, copy_kernel_key)}, - {"dst_place_type", ir::Int32Attribute::get(ctx, 0)}}; + {"dst_place_type", pir::Int32Attribute::get(ctx, 0)}}; - ir::Operation* op = - ir::Operation::Create({in}, op_attribute, {out_type}, op_info); + pir::Operation* op = + pir::Operation::Create({in}, op_attribute, {out_type}, op_info); program->block()->push_back(op); @@ -428,10 +276,10 @@ ir::OpResult AddPlaceTransferOp(ir::OpResult in, } } -ir::Type BuildOutputType(ir::Type type, - const phi::Place& place, - phi::DataType data_type, - ir::IrContext* ctx) { +pir::Type BuildOutputType(pir::Type type, + const phi::Place& place, + phi::DataType data_type, + pir::IrContext* ctx) { if (type.isa()) { auto dense_tensor_type = type.dyn_cast(); auto out_dtype = dense_tensor_type.dtype(); @@ -473,8 +321,8 @@ ir::Type BuildOutputType(ir::Type type, } phi::DataType GetKernelDataTypeByYamlInfo( - const ir::Operation* op, - const std::unordered_map& map_value_pair, + const pir::Operation* op, + const std::unordered_map& map_value_pair, const dialect::OpYamlInfoParser* op_info_parser) { auto& attr_map = op->attributes(); auto& data_type_info = op_info_parser->OpRuntimeInfo().kernel_key_dtype; @@ -495,8 +343,8 @@ phi::DataType GetKernelDataTypeByYamlInfo( if (type.isa()) { kernel_data_type = TransToPhiDataType( type.dyn_cast().dtype()); - } else if (type.isa()) { - auto vec_data = type.dyn_cast().data(); + } else if (type.isa()) { + auto vec_data = type.dyn_cast().data(); if (vec_data.empty()) { kernel_data_type = phi::DataType::UNDEFINED; } else { @@ -547,8 +395,8 @@ phi::DataType GetKernelDataTypeByYamlInfo( } phi::Backend GetKernelBackendByYamlInfo( - const ir::Operation* 
op, - const std::unordered_map<ir::Value, ir::OpResult>& map_value_pair, + const pir::Operation* op, + const std::unordered_map<pir::Value, pir::OpResult>& map_value_pair, const dialect::OpYamlInfoParser* op_info_parser) { auto& attr_map = op->attributes(); auto& backend_info = op_info_parser->OpRuntimeInfo().kernel_key_backend; @@ -565,8 +413,8 @@ phi::Backend GetKernelBackendByYamlInfo( if (type.isa<paddle::dialect::AllocatedDenseTensorType>()) { kernel_backend = paddle::experimental::ParseBackend( type.dyn_cast<paddle::dialect::AllocatedDenseTensorType>().place()); - } else if (type.isa<ir::VectorType>()) { - auto vec_data = type.dyn_cast<ir::VectorType>().data(); + } else if (type.isa<pir::VectorType>()) { + auto vec_data = type.dyn_cast<pir::VectorType>().data(); if (vec_data.empty()) { kernel_backend = phi::Backend::UNDEFINED; } else { @@ -617,11 +465,12 @@ phi::Backend GetKernelBackendByYamlInfo( } phi::KernelKey GetKernelKey( - ir::Operation* op, + pir::Operation* op, const phi::Place& place, - const std::unordered_map<ir::Value, ir::OpResult>& map_value_pair, + const std::string& kernel_fn_str, + const std::unordered_map<pir::Value, pir::OpResult>& map_value_pair, dialect::OpYamlInfoParser* op_info_parser = nullptr) { - if (op->name() == "pd.feed") { + if (op->name() == "pd_op.feed") { // NOTE, for now the feed op doesn't need a kernel, so the data type comes // from the Op Result; the next op uses the base program's data type return {phi::Backend::CPU, phi::DataLayout::ANY, TransToPhiDataType( op->result(0).type().dyn_cast<paddle::dialect::DenseTensorType>().dtype())}; } - if (op->name() == "pd.data") { + if (op->name() == "pd_op.data") { // NOTE, for now the feed op doesn't need a kernel, so the data type comes // from the Op Result; the next op uses the base program's data type auto data_place = @@ -644,6 +493,14 @@ phi::KernelKey GetKernelKey( op->result(0).type().dyn_cast<paddle::dialect::DenseTensorType>().dtype())}; } + if (op->name() == "pd_op.seed") { + auto backend = paddle::experimental::ParseBackend(place); + return {backend, + phi::DataLayout::ANY, + TransToPhiDataType( + op->result(0).type().dyn_cast<paddle::dialect::DenseTensorType>().dtype())}; + } + phi::Backend kernel_backend = phi::Backend::UNDEFINED; phi::DataLayout kernel_layout = phi::DataLayout::UNDEFINED; phi::DataType kernel_data_type = phi::DataType::UNDEFINED; @@ -659,14 +516,14 @@ phi::KernelKey GetKernelKey( GetKernelBackendByYamlInfo(op, map_value_pair, op_info_parser); // parse all the input tensors - if (tensor_input_number == 0 || op->name() == "pd.full_") { + if (tensor_input_number == 0 || op->name() == "pd_op.full_") { // all the information has to come from attributes and context - if (op->name() == "pd.uniform") { + if (op->name() == "pd_op.uniform") { // try to process uniform, use shape to determine backend // TODO(phlrain): should support other initialize ops auto define_op = op->operand_source(0).GetDefiningOp(); - if (define_op->name() == "pd.full_int_array") { + if (define_op->name() == "pd_op.full_int_array") { auto shape = define_op->attributes() .at("value") .dyn_cast<paddle::dialect::IntArrayAttribute>() @@ -714,7 +571,7 @@ phi::KernelKey GetKernelKey( // don't know how to select the kernel for the next op that // uses the data op's output as input. So, we need to set the kernel backend // manually.
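GetKernelKey resolves a (backend, layout, dtype) triple with a clear precedence: hard-coded per-op rules first (the feed/data/seed cases above), then backend and dtype declared in the op's YAML info, then values derived from the input tensors, and finally the fall-backs applied at the end of the function (load_combine forcing FLOAT32, the CPU and GPUDNN-to-GPU fall-backs). A compact sketch of that precedence, with plain enums in place of the phi types; all names here are illustrative, not the real API:

#include <optional>
#include <string>

enum class Backend { UNDEFINED, CPU, GPU, GPUDNN };
enum class DataType { UNDEFINED, FLOAT32, FLOAT64 };

struct KernelKeySketch {
  Backend backend;
  DataType dtype;
};

// Mirrors the resolution order in GetKernelKey: YAML-declared info wins,
// input-derived info fills the gaps, and per-op fall-backs run last.
KernelKeySketch ResolveKernelKey(const std::string& op_name,
                                 std::optional<Backend> yaml_backend,
                                 std::optional<DataType> yaml_dtype,
                                 std::optional<DataType> first_input_dtype,
                                 Backend place_backend) {
  KernelKeySketch key{Backend::UNDEFINED, DataType::UNDEFINED};
  if (yaml_backend) key.backend = *yaml_backend;  // from op YAML info
  if (yaml_dtype) key.dtype = *yaml_dtype;
  if (key.dtype == DataType::UNDEFINED && first_input_dtype)
    key.dtype = *first_input_dtype;  // derived from input tensors
  if (key.backend == Backend::UNDEFINED)
    key.backend = place_backend;  // fall back to the target place
  if (op_name == "pd_op.load_combine") key.dtype = DataType::FLOAT32;
  // NeedFallBackCpu / NeedFallBackFromGPUDNN2GPU-style adjustments go here.
  return key;
}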
- if (op->operand_source(i).GetDefiningOp()->name() == "pd.data") { + if (op->operand_source(i).GetDefiningOp()->name() == "pd_op.data") { auto data_op = op->operand_source(i).GetDefiningOp(); auto data_place = data_op->attributes() .at("place") @@ -733,7 +590,7 @@ phi::KernelKey GetKernelKey( auto combine_op = op->operand_source(i).GetDefiningOp(); for (size_t j = 0; j < combine_op->num_operands(); ++j) { if (combine_op->operand_source(j).GetDefiningOp()->name() == - "pd.data") { + "pd_op.data") { auto data_op = combine_op->operand_source(j).GetDefiningOp(); auto data_place = data_op->attributes() .at("place") @@ -774,409 +631,587 @@ phi::KernelKey GetKernelKey( } phi::KernelKey res(kernel_backend, kernel_layout, kernel_data_type); - return res; -} -std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, - phi::Place place) { - if (VLOG_IS_ON(2)) { - std::stringstream ss; - prog->Print(ss); - VLOG(2) << "Program after lowering to kernel pass : " << ss.str(); + if (op->name() == "pd_op.load_combine") { + res.set_dtype(phi::DataType::FLOAT32); + } + if (NeedFallBackCpu((op), kernel_fn_str, res)) { + res.set_backend(phi::Backend::CPU); } - auto program = std::make_unique(ir::IrContext::Instance()); - - auto block = prog->block(); - - ir::IrContext* ctx = ir::IrContext::Instance(); - ctx->GetOrRegisterDialect(); - ctx->GetOrRegisterDialect(); - - std::unordered_map map_op_pair; - std::unordered_map map_value_pair; - - std::string phi_kernel_op_name = paddle::dialect::PhiKernelOp::name(); - ir::OpInfo phi_kernel_op_info = ctx->GetRegisteredOpInfo(phi_kernel_op_name); - std::string legacy_kernel_op_name = paddle::dialect::LegacyKernelOp::name(); - ir::OpInfo legacy_kernel_op_info = - ctx->GetRegisteredOpInfo(legacy_kernel_op_name); + if (NeedFallBackFromGPUDNN2GPU(op, res)) { + res.set_backend(phi::Backend::GPU); + } - auto skip_feed_names = GetSkipFeedNames(block); + return res; +} - for (auto op_item : *block) { - VLOG(6) << "op name " << op_item->name(); - if ((op_item->name() == "pd.feed") && - SkipFeedOp(op_item, skip_feed_names)) { - continue; - } +pir::OpResult GetNewInput( + const pir::Value cur_in, + const std::unordered_map& map_value_pair, + const int index, + const std::string op_name) { + PADDLE_ENFORCE_EQ( + map_value_pair.count(cur_in), + true, + phi::errors::PreconditionNotMet( + "[%d]'s input of [%s] op MUST be in map pair", index, op_name)); + auto new_in = map_value_pair.at(cur_in); + return new_in; +} - if (SpecialOpList.count(op_item->name())) { - DealWithSpecialBuiltinOps( - op_item, program.get(), &map_op_pair, &map_value_pair, ctx); - continue; +void HandleForSpecialOp( + pir::Operation* op_item, + pir::Program* program, + pir::IrContext* ctx, + std::unordered_map* map_op_pair, + std::unordered_map* map_value_pair) { + std::vector vec_inputs; + std::vector op_output_types; + if (op_item->name() == "builtin.combine") { + // Copy op inputs + std::vector vec_inner_types; + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + vec_inputs.push_back(new_in); + vec_inner_types.push_back(new_in.type()); + } } + // Copy op output type - // Lower from PaddleDialect to KernelDialect - paddle::dialect::OpYamlInfoInterface op_info_interface = - op_item->dyn_cast(); - - std::unique_ptr op_info_parser(nullptr); - if (op_info_interface) { - op_info_parser = - 
std::make_unique(op_info_interface.GetOpInfo()); - } + pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types); + op_output_types.push_back(t1); + } - std::string kernel_fn_str; - if (op_info_parser != nullptr) { - kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0]; - } + if (op_item->name() == "builtin.slice") { + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + vec_inputs.push_back(new_in); - if (op_item->name() == "pd.add_n_" || - op_item->name() == "pd.add_n_with_kernel") { - if (op_item->result(0).type().isa()) { - kernel_fn_str = "add_n_sr"; + if (new_in.type().isa()) { + auto vec_types = new_in.type().dyn_cast().data(); + auto index = op_item->attributes() + .at("index") + .dyn_cast() + .data(); + op_output_types.push_back(vec_types[index]); + } else { + PADDLE_THROW( + phi::errors::Unimplemented("only support vector type for now")); + } } } + } - auto kernel_key = - GetKernelKey(op_item, place, map_value_pair, op_info_parser.get()); - VLOG(6) << "kernel type " << kernel_key; + if (op_item->name() == "builtin.split") { + if (op_item->num_operands() > 0) { + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + auto new_in = GetNewInput(cur_in, *map_value_pair, i, op_item->name()); + vec_inputs.push_back(new_in); - if (op_item->name() == "pd.load_combine") { - kernel_key.set_dtype(phi::DataType::FLOAT32); - } - if (NeedFallBackCpu((op_item), kernel_fn_str, kernel_key)) { - kernel_key.set_backend(phi::Backend::CPU); + if (new_in.type().isa()) { + auto vec_types = new_in.type().dyn_cast().data(); + for (uint64_t idx = 0; idx < vec_types.size(); idx++) { + op_output_types.push_back(vec_types[idx]); + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("only support vector type for now")); + } + } } + } - if (NeedFallBackFromGPUDNN2GPU(op_item, kernel_key)) { - kernel_key.set_backend(phi::Backend::GPU); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(op_item->name()); + // Generate new op + pir::Operation* op = pir::Operation::Create( + vec_inputs, op_item->attributes(), op_output_types, op_info); + program->block()->push_back(op); + (*map_op_pair)[op_item] = op; + // only deal with single output + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + (*map_value_pair)[op_item->result(i)] = op->result(i); } + } + VLOG(6) << "Deep copy a new builtin op: " << op_item->name(); +} - // only for single output - // need update new kernel key layout and data tyep - - std::vector op_output_types; - if (op_item->num_results() > 0) { - auto phi_kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( - kernel_fn_str, kernel_key); - auto args_def = phi_kernel.args_def(); - auto output_defs = args_def.output_defs(); - if (!UnchangeOutputOps.count(op_item->name()) && - !IsLegacyOp(op_item->name())) { - PADDLE_ENFORCE_EQ( - op_item->num_results(), - output_defs.size(), - phi::errors::PreconditionNotMet( - "op [%s] kernel output args defs should equal op outputs", - op_item->name())); - } +std::vector BuildOpOutputType(pir::Operation* op_item, + const std::string& kernel_fn_str, + const phi::KernelKey& kernel_key, + pir::IrContext* ctx) { + if (op_item->num_results() == 0) { + return {}; + } + std::vector 
op_output_types; + auto phi_kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( + kernel_fn_str, kernel_key); + auto args_def = phi_kernel.args_def(); + auto output_defs = args_def.output_defs(); + if (!UnchangeOutputOps.count(op_item->name()) && + !IsLegacyOp(op_item->name())) { + PADDLE_ENFORCE_EQ( + op_item->num_results(), + output_defs.size(), + phi::errors::PreconditionNotMet( + "op [%s] kernel output args defs should equal op outputs", + op_item->name())); + } - for (size_t i = 0; i < op_item->num_results(); ++i) { - phi::Place out_place = phi::TransToPhiPlace(kernel_key.backend()); + for (size_t i = 0; i < op_item->num_results(); ++i) { + phi::Place out_place = phi::TransToPhiPlace(kernel_key.backend()); - phi::DataType out_phi_dtype = phi::DataType::UNDEFINED; - if ((!UnchangeOutputOps.count(op_item->name())) && - (!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) { - out_place = phi::TransToPhiPlace(output_defs[i].backend); - out_phi_dtype = output_defs[i].dtype; - } + phi::DataType out_phi_dtype = phi::DataType::UNDEFINED; + if ((!UnchangeOutputOps.count(op_item->name())) && + (!IsLegacyOp(op_item->name())) && phi_kernel.IsValid()) { + out_place = phi::TransToPhiPlace(output_defs[i].backend); + out_phi_dtype = output_defs[i].dtype; + } - auto result_type = op_item->result(i).type(); - if (!result_type) { - op_output_types.push_back(result_type); - } else if (result_type.isa() || - result_type.isa()) { - op_output_types.push_back( - BuildOutputType(result_type, out_place, out_phi_dtype, ctx)); - } else if (result_type.isa()) { - std::vector vec_inner_types; - auto base_types = result_type.dyn_cast().data(); - for (auto& base_type : base_types) { - if (base_type) { - if (base_type.isa()) { - vec_inner_types.push_back( - BuildOutputType(base_type, out_place, out_phi_dtype, ctx)); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "only support dense tensor in vector type for now")); - } - } else { - // NOTE(phlrain), kernel not support a nullptr in output - ir::Type fp32_dtype = ir::Float32Type::get(ctx); - phi::DDim dims = {}; - phi::DataLayout data_layout = phi::DataLayout::NCHW; - phi::LoD lod = {{}}; - size_t offset = 0; - auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( - ctx, fp32_dtype, dims, data_layout, lod, offset); - auto allocated_dense_tensor_dtype = - paddle::dialect::AllocatedDenseTensorType::get( - ctx, out_place, dense_tensor_dtype); - vec_inner_types.push_back(allocated_dense_tensor_dtype); - } + auto result_type = op_item->result(i).type(); + if (!result_type) { + op_output_types.push_back(result_type); + } else if (result_type.isa() || + result_type.isa()) { + op_output_types.push_back( + BuildOutputType(result_type, out_place, out_phi_dtype, ctx)); + } else if (result_type.isa()) { + std::vector vec_inner_types; + auto base_types = result_type.dyn_cast().data(); + for (auto& base_type : base_types) { + if (base_type) { + if (base_type.isa()) { + vec_inner_types.push_back( + BuildOutputType(base_type, out_place, out_phi_dtype, ctx)); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "only support dense tensor in vector type for now")); } - - ir::Type t1 = ir::VectorType::get(ctx, vec_inner_types); - op_output_types.push_back(t1); } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Result type only support DenseTensorType, SelectedRowType and " - "VectorType")); + // NOTE(phlrain), kernel not support a nullptr in output + pir::Type fp32_dtype = pir::Float32Type::get(ctx); + phi::DDim dims = {}; + phi::DataLayout 
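// (The placeholder being built here stands in for a null element of a
//  vector result: kernels cannot emit a nullptr output, so the pass
//  materializes an empty fp32 DenseTensor type -- empty dims, NCHW layout,
//  empty LoD, offset 0 -- and wraps it in an AllocatedDenseTensorType at
//  the kernel's out_place.)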
data_layout = phi::DataLayout::NCHW; + phi::LoD lod = {{}}; + size_t offset = 0; + auto dense_tensor_dtype = paddle::dialect::DenseTensorType::get( + ctx, fp32_dtype, dims, data_layout, lod, offset); + auto allocated_dense_tensor_dtype = + paddle::dialect::AllocatedDenseTensorType::get( + ctx, out_place, dense_tensor_dtype); + vec_inner_types.push_back(allocated_dense_tensor_dtype); } } + + pir::Type t1 = pir::VectorType::get(ctx, vec_inner_types); + op_output_types.push_back(t1); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Result type only support DenseTensorType, SelectedRowType and " + "VectorType")); } + } - // constuct input - std::vector vec_inputs; - if (op_item->num_operands() > 0) { - for (size_t i = 0; i < op_item->num_operands(); ++i) { - auto cur_in = op_item->operand_source(i); - if (!cur_in) { - vec_inputs.emplace_back(); - continue; + return op_output_types; +} + +std::vector BuildOpInputList( + pir::Operation* op_item, + const std::string& kernel_fn_str, + const phi::KernelKey& kernel_key, + const phi::Place place, + const OpYamlInfoParser* op_info_parser, + pir::IrContext* ctx, + std::unordered_map* map_op_pair, + std::unordered_map* map_value_pair, + pir::Program* program) { + if (op_item->num_operands() == 0) { + return {}; + } + + std::vector vec_inputs; + + for (size_t i = 0; i < op_item->num_operands(); ++i) { + auto cur_in = op_item->operand_source(i); + if (!cur_in) { + vec_inputs.emplace_back(); + continue; + } + PADDLE_ENFORCE_EQ( + map_value_pair->count(cur_in), + true, + phi::errors::PreconditionNotMet( + "[%d]'s input of [%s] op MUST in map pair", i, op_item->name())); + auto new_in = map_value_pair->at(cur_in); + + auto new_in_type = new_in.type(); + + auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( + kernel_fn_str, kernel_key); + + bool check_place_transfer = + (op_item->name() == "builtin.set_parameter") || + (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))); + + if (check_place_transfer) { + if (new_in_type.isa()) { + // allocated type + auto in_place = + new_in_type.dyn_cast().place(); + + // get input args def type + auto args_def = kernel.args_def(); + auto input_defs = args_def.input_defs(); + + auto dst_backend = GetDstBackend(op_item->name(), + place, + op_info_parser, + kernel.InputAt(i).backend, + i); + + bool need_trans = + (in_place.GetType() != phi::AllocationType::UNDEFINED) && + (paddle::experimental::NeedTransformPlace( + in_place, dst_backend, {})); + if (need_trans) { + VLOG(6) << "need trans from " << in_place << " to " + << kernel_key.backend(); + // build memcopy op + auto out_place = phi::TransToPhiPlace(dst_backend); + auto new_in_alloc_type = + new_in_type.dyn_cast(); + auto out_type = dialect::AllocatedDenseTensorType::get( + ctx, + out_place, + new_in_alloc_type.dtype(), + new_in_alloc_type.dims(), + new_in_alloc_type.data_layout(), + new_in_alloc_type.lod(), + new_in_alloc_type.offset()); + new_in = AddPlaceTransferOp( + new_in, out_type, in_place, out_place, kernel_key, program); } - PADDLE_ENFORCE_EQ(map_value_pair.count(cur_in), - true, - phi::errors::PreconditionNotMet( - "[%d]'s input of [%s] op MUST in map pair", - i, - op_item->name())); - auto new_in = map_value_pair.at(cur_in); - - auto new_in_type = new_in.type(); - - auto& kernel = phi::KernelFactory::Instance().SelectKernelWithGPUDNN( - kernel_fn_str, kernel_key); - - bool check_place_transfer = - (op_item->name() == "builtin.set_parameter") || - (kernel.IsValid() && (!UnchangeOutputOps.count(op_item->name()))); - - if 
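// (check_place_transfer, defined just above: builtin.set_parameter is
//  always checked for a placement mismatch; any other op is checked only
//  when a valid kernel was selected and the op is not in UnchangeOutputOps,
//  i.e. its inputs are expected to live where the kernel runs.)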
(check_place_transfer) { - if (new_in_type.isa()) { - // allocated type - auto in_place = - new_in_type.dyn_cast() - .place(); + } else if (new_in_type.isa()) { + // [ todo need update here, support combine data transfomer] + // deal with pre combine op + auto pre_define_op = cur_in.GetDefiningOp(); + + if (pre_define_op->name() == "builtin.combine") { + std::vector inner_inputs; + std::vector types_in_vec; + bool is_trans = false; + for (size_t j = 0; j < pre_define_op->num_operands(); ++j) { + auto in_i = map_value_pair->at(pre_define_op->operand_source(j)); + auto in_i_type = in_i.type(); + phi::Place place; + if (in_i_type.isa()) { + place = in_i_type.dyn_cast() + .place(); + } else if (in_i_type.isa()) { + place = in_i_type.dyn_cast() + .place(); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "builtin.combine Input type only support " + "VectorType and " + "VectorType")); + } // get input args def type auto args_def = kernel.args_def(); auto input_defs = args_def.input_defs(); - auto dst_backend = GetDstBackend(op_item->name(), - place, - op_info_parser.get(), - kernel.InputAt(i).backend, - i); - bool need_trans = - (in_place.GetType() != phi::AllocationType::UNDEFINED) && + (place.GetType() != phi::AllocationType::UNDEFINED) && + (op_info_parser != nullptr && + !op_info_parser->IsTensorAttribute(i)) && (paddle::experimental::NeedTransformPlace( - in_place, dst_backend, {})); + place, kernel.InputAt(i).backend, {})); if (need_trans) { - VLOG(6) << "need trans from " << in_place << " to " + VLOG(6) << "need trans from " << place << " to " << kernel_key.backend(); // build memcopy op - auto out_place = phi::TransToPhiPlace(dst_backend); - auto new_in_alloc_type = - new_in_type.dyn_cast(); - auto out_type = dialect::AllocatedDenseTensorType::get( - ctx, - out_place, - new_in_alloc_type.dtype(), - new_in_alloc_type.dims(), - new_in_alloc_type.data_layout(), - new_in_alloc_type.lod(), - new_in_alloc_type.offset()); - new_in = AddPlaceTransferOp(new_in, - out_type, - in_place, - out_place, - kernel_key, - program.get()); - } - } else if (new_in_type.isa()) { - // [ todo need update here, support combine data transfomer] - // deal with pre combine op - auto pre_define_op = cur_in.GetDefiningOp(); - - if (pre_define_op->name() == "builtin.combine") { - std::vector inner_inputs; - std::vector types_in_vec; - bool is_trans = false; - for (size_t j = 0; j < pre_define_op->num_operands(); ++j) { - auto in_i = map_value_pair.at(pre_define_op->operand_source(j)); - auto in_i_type = in_i.type(); - phi::Place place; - if (in_i_type.isa()) { - place = - in_i_type.dyn_cast() - .place(); - } else if (in_i_type - .isa()) { - place = - in_i_type.dyn_cast() - .place(); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "builtin.combine Input type only support " - "VectorType and " - "VectorType")); - } - - // get input args def type - auto args_def = kernel.args_def(); - auto input_defs = args_def.input_defs(); - - bool need_trans = - (place.GetType() != phi::AllocationType::UNDEFINED) && - (op_info_parser != nullptr && - !op_info_parser->IsTensorAttribute(i)) && - (paddle::experimental::NeedTransformPlace( - place, kernel.InputAt(i).backend, {})); - if (need_trans) { - VLOG(6) << "need trans from " << place << " to " - << kernel_key.backend(); - // build memcopy op - auto out_place = - phi::TransToPhiPlace(kernel.InputAt(i).backend); - - ir::Type out_type; - if (in_i_type.isa()) { - out_type = dialect::AllocatedDenseTensorType::get( - ctx, - out_place, - pre_define_op->operand_source(j) - 
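// (For a VectorType input the transfer runs per element: each operand of
//  the upstream builtin.combine is compared against the backend the kernel
//  expects for input i, a memcpy op is inserted for mismatched elements,
//  and -- as the following hunk shows -- a fresh pir::CombineOp is rebuilt
//  from the transferred values so the consumer sees a consistent vector.)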
.type() - .dyn_cast()); - } else if (in_i_type - .isa()) { - out_type = dialect::AllocatedSelectedRowsType::get( - ctx, - out_place, - pre_define_op->operand_source(j) - .type() - .dyn_cast()); - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "builtin.combine Input type only support " - "VectorType and " - "VectorType")); - } - - in_i = AddPlaceTransferOp(in_i, - out_type, - place, - out_place, - kernel_key, - program.get()); - - is_trans = true; - } - - inner_inputs.push_back(in_i); - types_in_vec.push_back(in_i.type()); - } - if (is_trans) { - // Add combine op - std::string combine_op_name(ir::CombineOp::name()); - ir::OpInfo op_info = ctx->GetRegisteredOpInfo(combine_op_name); - - ir::Type target_vec_type = - ir::VectorType::get(ctx, types_in_vec); - ir::Operation* operation = ir::Operation::Create( - inner_inputs, {}, {target_vec_type}, op_info); - - new_in = operation->result(0); - program->block()->push_back(operation); + auto out_place = phi::TransToPhiPlace(kernel.InputAt(i).backend); + pir::Type out_type; + if (in_i_type.isa()) { + out_type = dialect::AllocatedDenseTensorType::get( + ctx, + out_place, + pre_define_op->operand_source(j) + .type() + .dyn_cast()); + } else if (in_i_type.isa()) { + out_type = dialect::AllocatedSelectedRowsType::get( + ctx, + out_place, + pre_define_op->operand_source(j) + .type() + .dyn_cast()); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "builtin.combine Input type only support " + "VectorType and " + "VectorType")); } + in_i = AddPlaceTransferOp( + in_i, out_type, place, out_place, kernel_key, program); + + is_trans = true; } - } else if (new_in_type.isa()) { - // do nothing here - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "only support allocated dense tensor type for now")); + inner_inputs.push_back(in_i); + types_in_vec.push_back(in_i.type()); + } + if (is_trans) { + // Add combine op + std::string combine_op_name(pir::CombineOp::name()); + pir::OpInfo op_info = ctx->GetRegisteredOpInfo(combine_op_name); + + pir::Type target_vec_type = pir::VectorType::get(ctx, types_in_vec); + pir::Operation* operation = pir::Operation::Create( + inner_inputs, {}, {target_vec_type}, op_info); + + new_in = operation->result(0); + program->block()->push_back(operation); } } - vec_inputs.push_back(new_in); + + } else if (new_in_type.isa()) { + // do nothing here + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "only support allocated dense tensor type for now")); } } + vec_inputs.push_back(new_in); + } + + return vec_inputs; +} + +void AddShadowFeed( + const phi::Place& place, + pir::Operation* op_item, + pir::Operation* kernel_op, + pir::Program* program, + pir::IrContext* ctx, + std::unordered_map* map_op_pair, + std::unordered_map* map_value_pair) { + bool feed_op_add_shadow_feed = + (op_item->name() == "pd_op.feed") && platform::is_gpu_place(place); + bool data_op_add_shadow_feed = + (op_item->name() == "pd_op.data") && platform::is_gpu_place(place) && + (kernel_op->attributes() + .at("place") + .dyn_cast() + .data() + .GetType() == phi::AllocationType::UNDEFINED); + bool add_shadow_feed = feed_op_add_shadow_feed || data_op_add_shadow_feed; + if (add_shadow_feed) { + // if shadow data op place not gpu,add shadow feed op + phi::KernelKey shadow_key{ + phi::Backend::GPU, + phi::DataLayout::ANY, + TransToPhiDataType( + op_item->result(0).type().dyn_cast().dtype())}; + std::unordered_map attr_map{ + {"op_name", pir::StrAttribute::get(ctx, "pd_op.shadow_feed")}, + {"kernel_name", pir::StrAttribute::get(ctx, "shadow_feed")}, + 
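// (AddShadowFeed fires in exactly two cases, per the booleans above: a
//  pd_op.feed lowered on a GPU place, or a pd_op.data whose "place"
//  attribute is still UNDEFINED while the pass targets GPU. The injected
//  pd_op.shadow_feed kernel copies the host tensor to the device so
//  downstream GPU kernels see device memory.)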
{"kernel_key", dialect::KernelAttribute::get(ctx, shadow_key)}}; + + auto out_type = paddle::dialect::AllocatedDenseTensorType::get( + ctx, + phi::TransToPhiPlace(shadow_key.backend()), + op_item->result(0).type().dyn_cast()); - std::unordered_map op_attribute{ - {"op_name", ir::StrAttribute::get(ctx, op_item->name())}, - {"kernel_name", ir::StrAttribute::get(ctx, kernel_fn_str)}, - {"kernel_key", dialect::KernelAttribute::get(ctx, kernel_key)}}; - auto op_attr_map = op_item->attributes(); + pir::OpInfo phi_kernel_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::PhiKernelOp::name()); + pir::Operation* shadow_op = pir::Operation::Create( + {kernel_op->result(0)}, attr_map, {out_type}, phi_kernel_op_info); - for (auto& map_item : op_attr_map) { - op_attribute.emplace(map_item.first, map_item.second); + (*map_op_pair)[op_item] = shadow_op; + program->block()->push_back(shadow_op); + if (op_item->num_results() > 0) { + for (size_t i = 0; i < shadow_op->num_results(); ++i) { + (*map_value_pair)[op_item->result(i)] = shadow_op->result(i); + } } + } +} + +std::unique_ptr GetOpYamlInfoParser(pir::Operation* op) { + paddle::dialect::OpYamlInfoInterface op_info_interface = + op->dyn_cast(); + + std::unique_ptr op_info_parser(nullptr); + if (op_info_interface) { + op_info_parser = + std::make_unique(op_info_interface.GetOpInfo()); + } + + return op_info_parser; +} + +std::string GetKernelFnStr(const OpYamlInfoParser* op_info_parser, + pir::Operation* op_item) { + std::string kernel_fn_str; + if (op_info_parser != nullptr) { + kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0]; + } - if (op_item->HasTrait()) { - op_attribute.emplace("is_inplace", ir::BoolAttribute::get(ctx, true)); + if (op_item->name() == "pd_op.add_n_" || + op_item->name() == "pd_op.add_n_with_kernel") { + if (op_item->result(0).type().isa()) { + kernel_fn_str = "add_n_sr"; } + } + return kernel_fn_str; +} - ir::Operation* op; - if (dialect::IsLegacyOp(op_item->name())) { - op = ir::Operation::Create( - vec_inputs, op_attribute, op_output_types, legacy_kernel_op_info); - } else { - op = ir::Operation::Create( - vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); +pir::Operation* BuildPhiKernelOp( + const std::string& kernel_fn_str, + const phi::KernelKey& kernel_key, + const std::vector& vec_inputs, + const std::vector& op_output_types, + pir::Operation* op_item, + pir::Program* program, + pir::IrContext* ctx, + std::unordered_map* map_op_pair, + std::unordered_map* map_value_pair) { + std::unordered_map op_attribute{ + {"op_name", pir::StrAttribute::get(ctx, op_item->name())}, + {"kernel_name", pir::StrAttribute::get(ctx, kernel_fn_str)}, + {"kernel_key", dialect::KernelAttribute::get(ctx, kernel_key)}}; + auto op_attr_map = op_item->attributes(); + + for (auto& map_item : op_attr_map) { + op_attribute.emplace(map_item.first, map_item.second); + } + + if (op_item->HasTrait()) { + op_attribute.emplace("is_inplace", pir::BoolAttribute::get(ctx, true)); + } + + pir::OpInfo phi_kernel_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::PhiKernelOp::name()); + + pir::OpInfo legacy_kernel_op_info = + ctx->GetRegisteredOpInfo(paddle::dialect::LegacyKernelOp::name()); + pir::Operation* op; + if (dialect::IsLegacyOp(op_item->name())) { + op = pir::Operation::Create( + vec_inputs, op_attribute, op_output_types, legacy_kernel_op_info); + } else { + op = pir::Operation::Create( + vec_inputs, op_attribute, op_output_types, phi_kernel_op_info); + } + + (*map_op_pair)[op_item] = op; + + // only deal with single 
output + if (op_item->num_results() > 0) { + for (size_t i = 0; i < op_item->num_results(); ++i) { + (*map_value_pair)[op_item->result(i)] = op->result(i); } + } + program->block()->push_back(op); - map_op_pair[op_item] = op; + return op; +} - // only deal with single output - if (op_item->num_results() > 0) { - for (size_t i = 0; i < op_item->num_results(); ++i) { - map_value_pair[op_item->result(i)] = op->result(i); - } +std::unique_ptr PdOpLowerToKernelPass(pir::Program* prog, + phi::Place place) { + if (VLOG_IS_ON(2)) { + std::stringstream ss; + prog->Print(ss); + VLOG(2) << "Program after lowering to kernel pass : " << ss.str(); + } + + auto program = std::make_unique(pir::IrContext::Instance()); + + auto block = prog->block(); + + pir::IrContext* ctx = pir::IrContext::Instance(); + ctx->GetOrRegisterDialect(); + ctx->GetOrRegisterDialect(); + + std::unordered_map map_op_pair; + std::unordered_map map_value_pair; + + auto skip_feed_names = GetSkipFeedNames(block); + + for (auto op_item : *block) { + VLOG(6) << "op name " << op_item->name(); + if ((op_item->name() == "pd_op.feed") && + SkipFeedOp(op_item, skip_feed_names)) { + continue; } - program->block()->push_back(op); - bool feed_op_add_shadow_feed = - (op_item->name() == "pd.feed") && platform::is_gpu_place(place); - bool data_op_add_shadow_feed = (op_item->name() == "pd.data") && - platform::is_gpu_place(place) && - (op->attributes() - .at("place") - .dyn_cast() - .data() - .GetType() != phi::AllocationType::GPU); - bool add_shadow_feed = feed_op_add_shadow_feed || data_op_add_shadow_feed; - if (add_shadow_feed) { - // if shadow data op place not gpu,add shadow feed op - phi::KernelKey shadow_key{ - phi::Backend::GPU, - phi::DataLayout::ANY, - TransToPhiDataType( - op_item->result(0).type().dyn_cast().dtype())}; - std::unordered_map attr_map{ - {"op_name", ir::StrAttribute::get(ctx, "pd.shadow_feed")}, - {"kernel_name", ir::StrAttribute::get(ctx, "shadow_feed")}, - {"kernel_key", dialect::KernelAttribute::get(ctx, shadow_key)}}; - - auto out_type = paddle::dialect::AllocatedDenseTensorType::get( - ctx, - phi::TransToPhiPlace(shadow_key.backend()), - op_item->result(0).type().dyn_cast()); - - ir::Operation* shadow_op = ir::Operation::Create( - {op->result(0)}, attr_map, {out_type}, phi_kernel_op_info); - - map_op_pair[op_item] = shadow_op; - program->block()->push_back(shadow_op); - if (op_item->num_results() > 0) { - for (size_t i = 0; i < shadow_op->num_results(); ++i) { - map_value_pair[op_item->result(i)] = shadow_op->result(i); - } - } + // HandleSpecialOp + if (SpecialLowerOps.count(op_item->name())) { + HandleForSpecialOp( + op_item, program.get(), ctx, &map_op_pair, &map_value_pair); + continue; } + + // Lower from PaddleDialect to KernelDialect + + auto op_info_parser = GetOpYamlInfoParser(op_item); + + auto kernel_fn_str = GetKernelFnStr(op_info_parser.get(), op_item); + + auto kernel_key = GetKernelKey( + op_item, place, kernel_fn_str, map_value_pair, op_info_parser.get()); + VLOG(6) << "kernel type " << kernel_key; + + // build output type + auto op_output_types = + BuildOpOutputType(op_item, kernel_fn_str, kernel_key, ctx); + + // build input + auto vec_inputs = BuildOpInputList(op_item, + kernel_fn_str, + kernel_key, + place, + op_info_parser.get(), + ctx, + &map_op_pair, + &map_value_pair, + program.get()); + + // build op + pir::Operation* op = BuildPhiKernelOp(kernel_fn_str, + kernel_key, + vec_inputs, + op_output_types, + op_item, + program.get(), + ctx, + &map_op_pair, + &map_value_pair); + + AddShadowFeed( + 
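// Per-op flow of the rewritten loop above, now a straight pipeline:
//   1. skip pd_op.feed ops that match the skip-feed list
//   2. HandleForSpecialOp: deep-copy builtin.combine / slice / split
//   3. GetOpYamlInfoParser + GetKernelFnStr + GetKernelKey
//   4. BuildOpOutputType  -> lowered result types
//   5. BuildOpInputList   -> lowered operands, inserting memcpy ops
//   6. BuildPhiKernelOp   -> phi kernel / legacy kernel operation
//   7. AddShadowFeed      -> host-to-device copy for feed/data on GPU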
place, op_item, op, program.get(), ctx, &map_op_pair, &map_value_pair); } + if (VLOG_IS_ON(2)) { std::stringstream ss1; program->Print(ss1); @@ -1184,6 +1219,5 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, } return program; } - } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h similarity index 83% rename from paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h rename to paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h index 3e4848720f4ce..acf839391b8c5 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h +++ b/paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h @@ -13,14 +13,14 @@ // limitations under the License. #pragma once -#include "paddle/ir/core/program.h" #include "paddle/phi/common/place.h" +#include "paddle/pir/core/program.h" namespace paddle { namespace dialect { -std::unique_ptr PdOpLowerToKernelPass( - ir::Program* prog, phi::Place place = phi::CPUPlace()); +std::unique_ptr PdOpLowerToKernelPass( + pir::Program* prog, phi::Place place = phi::CPUPlace()); } // namespace dialect } // namespace paddle diff --git a/paddle/fluid/ir/transforms/transform_general_functions.cc b/paddle/fluid/pir/transforms/transform_general_functions.cc similarity index 74% rename from paddle/fluid/ir/transforms/transform_general_functions.cc rename to paddle/fluid/pir/transforms/transform_general_functions.cc index 587c0cdaacd1d..6da131ee5e0c0 100644 --- a/paddle/fluid/ir/transforms/transform_general_functions.cc +++ b/paddle/fluid/pir/transforms/transform_general_functions.cc @@ -12,36 +12,38 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/transforms/transform_general_functions.h" +#include "paddle/fluid/pir/transforms/transform_general_functions.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/parameter.h" -#include "paddle/ir/core/program.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/parameter.h" +#include "paddle/pir/core/program.h" -namespace ir { +namespace pir { -std::pair GetParameterFromValue(ir::Value value) { - ir::GetParameterOp op = value.GetDefiningOp()->dyn_cast(); +std::pair GetParameterFromValue( + pir::Value value) { + pir::GetParameterOp op = + value.GetDefiningOp()->dyn_cast(); PADDLE_ENFORCE_NOT_NULL( op, phi::errors::InvalidArgument( "Value must be a weight from a GetParameter op.")); - ir::Program* program = op->GetParentProgram(); + pir::Program* program = op->GetParentProgram(); PADDLE_ENFORCE_NOT_NULL( program, phi::errors::InvalidArgument("Program should not be null.")); std::string name = op->attributes() .at(op.attributes_name[0]) - .dyn_cast() + .dyn_cast() .AsString(); - ir::Parameter* param = program->GetParameter(name); + pir::Parameter* param = program->GetParameter(name); PADDLE_ENFORCE_NOT_NULL( param, phi::errors::InvalidArgument("Parameter should not be null.")); return {name, param}; } -const phi::DDim& GetShapeFromValue(ir::Value value) { +const phi::DDim& GetShapeFromValue(pir::Value value) { // TODO(dev): Support other types like DenseTensor. 
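// Usage sketch for the three helpers in this file (assuming `value` comes
// from an existing pir::Program; the structured binding is illustrative):
//
//   auto [name, param]    = pir::GetParameterFromValue(value);  // weight lookup
//   const phi::DDim& dims = pir::GetShapeFromValue(value);      // tensor dims
//   pir::Type dtype       = pir::GetDataTypeFromValue(value);   // element type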
PADDLE_ENFORCE_EQ( value.type().isa(), @@ -50,7 +52,7 @@ const phi::DDim& GetShapeFromValue(ir::Value value) { return value.type().dyn_cast().dims(); } -ir::Type GetDataTypeFromValue(ir::Value value) { +pir::Type GetDataTypeFromValue(pir::Value value) { // TODO(dev): Support other types like DenseTensor. PADDLE_ENFORCE_EQ( value.type().isa(), @@ -75,4 +77,4 @@ Operation* GetFirstUseOperationForOutput(Operation* op, uint32_t index) { return op->result(index).first_use().owner(); } -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/ir/transforms/transform_general_functions.h b/paddle/fluid/pir/transforms/transform_general_functions.h similarity index 76% rename from paddle/fluid/ir/transforms/transform_general_functions.h rename to paddle/fluid/pir/transforms/transform_general_functions.h index b086af090f7a1..77c790235b832 100644 --- a/paddle/fluid/ir/transforms/transform_general_functions.h +++ b/paddle/fluid/pir/transforms/transform_general_functions.h @@ -14,45 +14,45 @@ #pragma once -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/parameter.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/value.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/parameter.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/value.h" -namespace ir { +namespace pir { /** * @brief Get the [name, parameter] pair of pararmeter from a value. * * @note The value must be a output of a GetParameterOp. * - * @param ir::Value + * @param pir::Value * - * @return std::pair + * @return std::pair */ -std::pair GetParameterFromValue(ir::Value value); +std::pair GetParameterFromValue(pir::Value value); /** * @brief Get tensor's shape from a value. * - * @param ir::Value + * @param pir::Value * * @return const phi::DDim& */ -const phi::DDim& GetShapeFromValue(ir::Value value); +const phi::DDim& GetShapeFromValue(pir::Value value); /** * @brief Get tensor's data type from a value. * - * @param ir::Value + * @param pir::Value * - * @return ir::Type + * @return pir::Type */ -ir::Type GetDataTypeFromValue(ir::Value value); +pir::Type GetDataTypeFromValue(pir::Value value); /** * @brief Get an operation that defines the specific input of the operation. @@ -75,4 +75,4 @@ Operation* GetDefiningOpForInput(Operation* op, uint32_t index); */ Operation* GetFirstUseOperationForOutput(Operation* op, uint32_t index); -} // namespace ir +} // namespace pir diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 9343c8ddf7781..eae360c146df5 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -63,11 +63,7 @@ limitations under the License. */ #endif PHI_DECLARE_int32(paddle_num_threads); -PADDLE_DEFINE_EXPORTED_int32( - multiple_of_cupti_buffer_size, - 1, - "Multiple of the CUPTI device buffer size. 
If the timestamps have " - "been dropped when you are profiling, try increasing this value."); +PHI_DECLARE_int32(multiple_of_cupti_buffer_size); namespace paddle { namespace framework { diff --git a/paddle/fluid/primitive/backend/CMakeLists.txt b/paddle/fluid/primitive/backend/CMakeLists.txt index deabc1f19d9b5..d352880871121 100644 --- a/paddle/fluid/primitive/backend/CMakeLists.txt +++ b/paddle/fluid/primitive/backend/CMakeLists.txt @@ -12,4 +12,4 @@ set(static_backend_files cc_library( primitive_backend_static_experimental SRCS ${static_backend_files} - DEPS pd_dialect_api) + DEPS pd_op_dialect_api) diff --git a/paddle/fluid/primitive/backend/manual/manual_static_backend.cc b/paddle/fluid/primitive/backend/manual/manual_static_backend.cc index de39a58473337..7d96b4ddfecc2 100644 --- a/paddle/fluid/primitive/backend/manual/manual_static_backend.cc +++ b/paddle/fluid/primitive/backend/manual/manual_static_backend.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/primitive/backend/manual/manual_backend.h" #include "paddle/fluid/primitive/primitive/primitive.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" diff --git a/paddle/fluid/primitive/codegen/gen.py b/paddle/fluid/primitive/codegen/gen.py index 43eb5005f0f52..815f41e6fdb03 100644 --- a/paddle/fluid/primitive/codegen/gen.py +++ b/paddle/fluid/primitive/codegen/gen.py @@ -16,7 +16,6 @@ import hashlib import pathlib import sys -from typing import Dict, List import jinja2 import yaml @@ -28,10 +27,11 @@ ) import filters as op_gen_filters import tests_utils as op_gen_tests +from parse_utils import to_named_dict -# import from paddle/fluid/ir/dialect/op_generator/api_gen.py +# import from paddle/fluid/pir/dialect/op_generator/api_gen.py sys.path.append( - str(pathlib.Path(__file__).resolve().parents[2] / 'ir/dialect/op_generator') + str(pathlib.Path(__file__).resolve().parents[2] / 'pir/dialect/op_generator') ) # fmt: on @@ -61,9 +61,21 @@ 'rsqrt_grad', 'slice_grad', 'transpose_grad', + 'square_grad', 'dropout_grad', + 'cast_grad', + 'slice_double_grad', + 'layer_norm_grad', + 'embedding_grad', + 'add_n_grad', + 'scale_grad', ] -VJP_COMPS = ['divide_grad', 'sum_grad', 'gelu_grad'] + + +PRIM_VJP = ['divide_grad', 'sum_grad'] # vjp list of primitive op +CUSTOM_VJP = ['gelu_grad'] # custom vjp list of composite op +VJP_COMPS = PRIM_VJP + CUSTOM_VJP + BACKENDS = [ 'add_n', 'mean', @@ -129,7 +141,14 @@ 'roll', 'scatter', 'scatter_nd_add', + 'square_grad', 'dropout_grad', + 'slice', + 'layer_norm_grad', + 'embedding_grad', + 'add_n_grad', + 'sqrt', + 'uniform', ] @@ -219,21 +238,6 @@ def save(content: str, path: pathlib.Path): print(f"Generate source file {path}") -def to_compat_dict(items: List[Dict]) -> Dict[str, Dict]: - compat_dict = {} - for item in items: - name = item["op"] - compat_dict[name] = item - return compat_dict - - -def to_apis_dict(apis): - apis_dict = {} - for api in apis: - apis_dict[api['name']] = api - return apis_dict - - def get_inplace_api(apis): inplace_apis = [] for api in apis: @@ -271,7 +275,7 @@ def extend_compat_info(apis, compats): attr['typename'] ) or op_gen_tests.is_intarray(attr['typename']): attr["support_tensor"] = False - apis_dict = to_apis_dict(apis) + apis_dict = to_named_dict(apis) for compat_item in compats: fwd_op_name = compat_item["op"] if fwd_op_name not in apis_dict: @@ 
-322,6 +326,31 @@ def extend_compat_info(apis, compats): return apis +def process_backward_invoke_info(apis): + apis_dict = to_named_dict(apis) + for api in apis: + if api['is_fwd']: + continue + if 'invoke' in api and api['invoke']['func'] in apis_dict: + args = api['invoke']['args'].split(',') + args = [arg.strip() for arg in args] + attrs_dict = to_named_dict(api['attrs']) + inputs_dict = to_named_dict(api['inputs']) + arg_inputs = [] + arg_attrs = [] + for arg in args: + if arg in inputs_dict: + arg_inputs.append(arg) + elif arg in attrs_dict and attrs_dict[arg].get( + "support_tensor", False + ): + arg_inputs.append(arg + '_') + else: + arg_attrs.append(arg) + args = arg_inputs + arg_attrs + api['invoke']['args'] = ', '.join(args) + + def gen( prim_path: pathlib.Path, fwd_path: pathlib.Path, @@ -369,6 +398,7 @@ def gen( ] apis = extend_compat_info(apis, compats) apis = apis + get_inplace_api(apis) + process_backward_invoke_info(apis) render( templates_dir, destination_dir, diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 index 3bbd00d967b83..663467af25a97 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_backend.h.j2 @@ -7,6 +7,7 @@ #include #include "paddle/phi/api/include/tensor.h" +#include "paddle/utils/optional.h" namespace paddle { diff --git a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 index 8e004c22eeeb5..48292d27243e6 100644 --- a/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/backend/generated/generated_static_backend.cc.j2 @@ -2,7 +2,7 @@ // Auto Generated, DO NOT EDIT! 
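For orientation, the macros below expand to wrappers of roughly the following shape. This is a hand-written sketch for a hypothetical unary op ("tanh" is illustrative only), not the template's literal output; it assumes the LazyTensor::value() accessor introduced elsewhere in this patch.

template <>
Tensor tanh<LazyTensor>(const Tensor& x) {
  // unwrap the lazy tensor into the pir value it records
  pir::OpResult x_res = std::static_pointer_cast<LazyTensor>(x.impl())
                            ->value()
                            .dyn_cast<pir::OpResult>();
  // trace the op into the program via the dialect API
  auto op_res = paddle::dialect::tanh(x_res);
  // re-wrap the result so callers keep working with Tensor
  Tensor out(std::make_shared<LazyTensor>(op_res));
  return out;
}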
#include "paddle/fluid/primitive/backend/generated/generated_backend.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/primitive/primitive/primitive.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" @@ -17,22 +17,69 @@ template <> {{common.ret(outputs)}} {{name}}({{common.params(inputs, attrs, mutable_attribute_as_inputs, False)}}) {%- endmacro -%} -{% macro body(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False) %} - {%- set output_names = [] -%} - {%- for o in outputs -%} {%- do output_names.append(o.name) -%} {%-endfor-%} +{%- macro prepare_ir_api_inputs(inputs)-%} {%- for input in inputs -%} - {% if input.typename=='Tensor[]' %} - std::vector {{input.name}}_res({{input.name}}.size()); - std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_res.begin(), [](const Tensor& t) { - return std::static_pointer_cast(t.impl())->getValue().dyn_cast(); + {% if input.typename=='Tensor[]' and not input.optional %} +std::vector {{input.name}}_res({{input.name}}.size()); +std::transform({{input.name}}.begin(), {{input.name}}.end(), {{input.name}}_res.begin(), [](const Tensor& t) { + return std::static_pointer_cast(t.impl())->value().dyn_cast(); +}); + {% elif input.typename=='Tensor[]' and input.optional %} +std::vector {{input.name}}_res({{input.name}}.size()); +if({{input.name}}) { + std::transform({{input.name}}.get().begin(), {{input.name}}.get().end(), {{input.name}}_res.begin(), [](const Tensor& t) { + return std::static_pointer_cast(t.impl())->value().dyn_cast(); }); +} + {% elif input.typename=='Tensor' and not input.optional %} +pir::OpResult {{input.name}}_res = std::static_pointer_cast({{input.name}}.impl())->value().dyn_cast(); {% else %} - ir::OpResult {{input.name}}_res = std::static_pointer_cast({{input.name}}.impl())->getValue().dyn_cast(); +pir::OpResult {{input.name}}_res; +if({{input.name}}) { + {{input.name}}_res = std::static_pointer_cast({{input.name}}.get().impl())->value().dyn_cast(); +} {% endif %} {% endfor %} - {%- for attr in attrs -%} +{%- endmacro -%} + +{%- macro get_static_backend_outputs(outputs)-%} + {%- if outputs|length == 1 -%} + {%- if outputs[0].typename == 'Tensor' -%} +Tensor {{outputs[0].name}}(std::make_shared(op_res)); +return {{outputs[0].name}}; + {%- elif outputs[0].typename == 'Tensor[]' -%} +std::vector {{outputs[0].name}}(op_res.size()); +std::transform(op_res.begin(), op_res.end(), {{outputs[0].name}}.begin(), [](const pir::OpResult& res) { +return Tensor(std::make_shared(res)); + }); +return {{outputs[0].name}}; + {%- else -%} {#- render nothing -#} + {%- endif -%} + {%- elif outputs|length > 1 -%} + {%- for i in range(outputs|length) %} +auto op_res_{{i}} = std::get<{{i}}>(op_res); + {% if outputs[i].typename == 'Tensor' %} +Tensor {{outputs[i].name}}(std::make_shared(op_res_{{i}})); + {% elif outputs[i].typename == 'Tensor[]' %} +std::vector {{outputs[i].name}}(op_res_{{i}}.size()); +std::transform(op_res_{{i}}.begin(), op_res_{{i}}.end(), {{outputs[i].name}}.begin(), [](const pir::OpResult& res) { +return Tensor(std::make_shared(res)); + }); + {% else %} {#- render nothing -#} + {% endif %} + {% endfor -%} +return std::make_tuple({%- for i in range(outputs|length) -%}{{outputs[i].name}}{%- if i!=outputs|length - 1 -%}, {% endif -%}{%- endfor -%}); + {%- else -%} {#- render nothing -#} + {%- endif -%} +{%- endmacro -%} + +{% macro body(name, inputs, outputs, attrs, mutable_attribute_as_inputs=False) %} + {%- set 
output_names = [] -%} + {%- for o in outputs -%} {%- do output_names.append(o.name) -%} {%-endfor-%} +{{prepare_ir_api_inputs(inputs)}} + {%- for attr in attrs %} {% if mutable_attribute_as_inputs and attr is mutable_attribute %} - ir::OpResult {{attr.name}}_res = std::static_pointer_cast({{attr.name~'_'}}.impl())->getValue().dyn_cast(); +pir::OpResult {{attr.name}}_res = std::static_pointer_cast({{attr.name~'_'}}.impl())->value().dyn_cast(); {% endif %} {% endfor %} {%- set input_names = [] -%} @@ -52,48 +99,25 @@ template <> {%- do attr_names.append(common.phi2ir_attr(i)) -%} {%- endif -%} {% endfor %} - auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}); - {% if outputs|length == 1 %} - {% if outputs[0].typename == 'Tensor' %} - Tensor {{outputs[0].name}}(std::make_shared(op_res)); - return {{outputs[0].name}}; - {% elif outputs[0].typename == 'Tensor[]' %} - std::vector {{outputs[0].name}}(op_res.size()); - std::transform(op_res.begin(), op_res.end(), {{outputs[0].name}}.begin(), [](const ir::OpResult& res) { - return Tensor(std::make_shared(res)); - }); - return {{outputs[0].name}}; - {% else %} {#- render nothing -#} - {% endif %} - {% elif outputs|length > 1 %} - {% for i in range(outputs|length) %} - auto op_res_{{i}} = std::get<{{i}}>(op_res); - {% if outputs[i].typename == 'Tensor' %} - Tensor {{outputs[i].name}}(std::make_shared(op_res_{{i}})); - {% elif outputs[i].typename == 'Tensor[]' %} - std::vector {{outputs[i].name}}(op_res_{{i}}.size()); - std::transform(op_res_{{i}}.begin(), op_res_{{i}}.end(), {{outputs[i].name}}.begin(), [](const ir::OpResult& res) { - return Tensor(std::make_shared(res)); - }); - {% else %} {#- render nothing -#} - {% endif %} - {% endfor %} - return std::make_tuple({% for i in range(outputs|length) %}{{outputs[i].name}}{%- if i!=outputs|length - 1 -%}, {% endif %}{% endfor %}); - {% else %} {#- render nothing -#} - {% endif %} -{% endmacro %} +auto op_res = paddle::dialect::{{name}}({{common.args(input_names, attr_names)}}); +{{get_static_backend_outputs(outputs)}} +{%- endmacro %} {% for api in apis %} {% if api.name in backend_white_list %} {% set api_outputs = api.outputs | trip_intermediate %} {{sig(api.name, api.inputs, api_outputs, api.attrs)}} { + {% filter indent(2, True) %} {{body(api.name, api.inputs, api_outputs, api.attrs)}} + {% endfilter %} } {% if api.attrs is exist_mutable_attribute %} {{sig(api.name, api.inputs, api_outputs, api.attrs, True)}} { + {% filter indent(2, True) %} {{body(api.name, api.inputs, api_outputs, api.attrs, True)}} + {% endfilter %} } {% endif %} diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 index 6d69433737633..67485bdd5a5cd 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.cc.j2 @@ -2,15 +2,16 @@ // Auto Generated, DO NOT EDIT! 
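The net effect of the changes to the generated vjp bodies below is easiest to see on a concrete expansion. A rough, hand-written sketch for a hypothetical single-output grad op follows (name, signature, and the backend call form are illustrative):

std::vector<std::vector<Tensor>> tanh_vjp(
    const Tensor& out,
    const Tensor& grad_out,
    const std::vector<std::vector<bool>>& stop_gradients) {
  std::vector<std::vector<Tensor>> vjp_res(1, std::vector<Tensor>(1));
  // every result is stored unconditionally now ...
  auto op_res = backend::tanh_grad<LazyTensor>(out, grad_out);
  vjp_res[0][0] = op_res;
  // ... and stop_gradient masking happens once, at the end
  vjp_res = ConstructVjpResultByStopGradients(vjp_res, stop_gradients);
  return vjp_res;
}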
#include "paddle/fluid/primitive/rule/vjp/generated/generated_vjp.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_attribute.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_attribute.h" #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/fluid/primitive/backend/backend.h" #include "paddle/fluid/primitive/rule/vjp/details.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" #include "paddle/fluid/primitive/utils/utils.h" -#include "paddle/ir/core/operation.h" +#include "paddle/pir/core/operation.h" #include "paddle/phi/core/flags.h" +#include "paddle/utils/optional.h" PHI_DECLARE_string(tensor_operants_mode); @@ -33,23 +34,23 @@ if (paddle::prim::StaticCompositeContext::Instance().IsBwdPrimEnabled()) { } {% else %} {{body_unprim(api)}} - {% endif %} + {%- endif %} return vjp_res; -{% endmacro %} +{%- endmacro -%} {% macro get_mutable_attribute(attrs, api_name) %} {% for i in attrs %} {%- if i is mutable_attribute -%} -auto* {{i.name}}_define_op = std::static_pointer_cast({{i.name~'_'}}.impl())->getValue().dyn_cast().GetDefiningOp(); +auto* {{i.name}}_define_op = std::static_pointer_cast({{i.name~'_'}}.impl())->value().dyn_cast().GetDefiningOp(); {% if i.typename is scalar %} -if({{i.name}}_define_op->name() != "pd.full") { +if({{i.name}}_define_op->name() != "pd_op.full") { PADDLE_THROW(platform::errors::Unimplemented( "We don't support dynamic tensors attribute {{i.name}} for {{api_name}} composite " "for now. ")); } auto {{i.name}} = {{i.name}}_define_op->attribute("value").dyn_cast().data(); {% elif i.typename is intarray %} -if({{i.name}}_define_op->name() != "pd.full_int_array"){ +if({{i.name}}_define_op->name() != "pd_op.full_int_array"){ PADDLE_THROW(platform::errors::Unimplemented( "We don't support dynamic tensors attribute {{i.name}} for {{api_name}} composite " "for now. ")); @@ -62,6 +63,7 @@ auto {{i.name}} = {{i.name}}_define_op->attribute("value").dyn_castattribute("value").dyn_cast({{api.invoke.args}}); + {% else %} auto op_res = backend::{{api.name}}({{common.args(input_names, attr_names)}}); + {% endif %} {% set outputs = api.outputs|trip_intermediate %} {#- ignore intermediate output -#} {% if outputs|length > 1 %} {% for i in range(outputs|length) %} -auto out{{i}} = std::get<{{i}}>(op_res); {% if outputs[i].typename=='Tensor' %} -vjp_res[{{i}}][0] = !stop_gradients[{{i}}][0] ? out{{i}} : vjp_res[{{i}}][0]; +vjp_res[{{i}}][0] = std::get<{{i}}>(op_res); {% else %} -for (size_t i=0; i< stop_gradients[{{i}}].size(); i++ ) { - vjp_res[{{i}}][i] = !stop_gradients[{{i}}][i] ? out{{i}}[i] : vjp_res[{{i}}][i]; -} +vjp_res[{{i}}] = std::get<{{i}}>(op_res); {% endif %} {% endfor %} {% elif outputs|length == 1 %} {% if outputs[0].typename=='Tensor' %} -vjp_res[0][0] = !stop_gradients[0][0] ? op_res : vjp_res[0][0]; +vjp_res[0][0] = op_res; {% else %} -for (size_t i=0; i< stop_gradients[0].size(); i++ ) { - vjp_res[0][i] = !stop_gradients[0][i] ? 
op_res[i] : vjp_res[0][i]; -} +vjp_res[0] = op_res; {% endif %} {% else %} {#- render nothing -#} {% endif %} +vjp_res = ConstructVjpResultByStopGradients(vjp_res, stop_gradients); {% endmacro %} {% macro body_prim(api) %} @@ -120,7 +122,7 @@ details::{{api.composite.func_name}}({{api.composite.func_args}}); {{sig(api.name, backward_api.name, backward_api.inputs, backward_api.attrs, backward_api.outputs)}} { {% filter indent(2, True) %} {{body(backward_api)}} - {% endfilter %} + {% endfilter -%} } {% endif %} diff --git a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 index b9e758aaa73ff..7f403661fea05 100644 --- a/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 +++ b/paddle/fluid/primitive/codegen/templates/rule/vjp/generated/generated_vjp.h.j2 @@ -4,7 +4,7 @@ #pragma once #include "paddle/fluid/primitive/primitive/primitive.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/value.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" diff --git a/paddle/fluid/primitive/rule/vjp/CMakeLists.txt b/paddle/fluid/primitive/rule/vjp/CMakeLists.txt index 3d6906bb33ca5..4b790fd07900b 100644 --- a/paddle/fluid/primitive/rule/vjp/CMakeLists.txt +++ b/paddle/fluid/primitive/rule/vjp/CMakeLists.txt @@ -5,4 +5,4 @@ cc_library( primitive_vjp_experimental SRCS ${VJP_SRCS} DEPS primitive_backend_static_experimental static_global_utils - primitive_static_utils_experimental pd_dialect_core) + primitive_static_utils_experimental pd_op_dialect_core) diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc index c56ac5c5f79ab..a882f78c52018 100644 --- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc +++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.cc @@ -15,13 +15,13 @@ // Auto Generated, DO NOT EDIT! #include "paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" #include "paddle/fluid/prim/utils/static/static_global_utils.h" #include "paddle/fluid/primitive/backend/backend.h" #include "paddle/fluid/primitive/rule/vjp/details.h" #include "paddle/fluid/primitive/type/lazy_tensor.h" #include "paddle/fluid/primitive/utils/utils.h" -#include "paddle/ir/core/operation.h" +#include "paddle/pir/core/operation.h" namespace paddle { namespace primitive {} // namespace primitive diff --git a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h index 0fffd6ba31a4c..35810f6d652ca 100644 --- a/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h +++ b/paddle/fluid/primitive/rule/vjp/manual/manual_vjp.h @@ -15,9 +15,9 @@ #pragma once #include "paddle/fluid/primitive/primitive/primitive.h" -#include "paddle/ir/core/value.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/int_array.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace primitive { diff --git a/paddle/fluid/primitive/type/lazy_tensor.h b/paddle/fluid/primitive/type/lazy_tensor.h index bb0af2ef374ca..cde6ece54b163 100644 --- a/paddle/fluid/primitive/type/lazy_tensor.h +++ b/paddle/fluid/primitive/type/lazy_tensor.h @@ -13,12 +13,12 @@ // limitations under the License. 
 #pragma once

-#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h"
-#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h"
-#include "paddle/ir/core/value.h"
+#include "paddle/fluid/pir/dialect/operator/ir/op_type.h"
+#include "paddle/fluid/pir/dialect/operator/utils/utils.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/extended_tensor.h"
 #include "paddle/phi/core/utils/data_type.h"
+#include "paddle/pir/core/value.h"

 namespace paddle {
 namespace primitive {
@@ -26,7 +26,7 @@ namespace primitive {
 class LazyTensor : public phi::ExtendedTensor,
                    public phi::TypeInfoTraits<phi::TensorBase, LazyTensor> {
  public:
-  explicit LazyTensor(ir::Value value)
+  explicit LazyTensor(pir::Value value)
       : value_(value),
         dims_(
             value.type().dyn_cast<paddle::dialect::DenseTensorType>().dims()) {
 }
@@ -41,14 +41,16 @@ class LazyTensor : public phi::ExtendedTensor,
         value_.type().dyn_cast<paddle::dialect::DenseTensorType>().dtype());
   }

-  ir::Value getValue() const { return value_; }
+  pir::Value value() const { return value_; }

   const phi::Place& place() const override { return place_; }

   bool initialized() const override { return value_.impl() != nullptr; }

+  void set_empty_type() { value_.set_type(pir::Type()); }
+
  private:
-  ir::Value value_;
+  pir::Value value_;
   mutable phi::DDim dims_;
   phi::Place place_;
 };
diff --git a/paddle/fluid/primitive/utils/static_utils.cc b/paddle/fluid/primitive/utils/static_utils.cc
index 40cbbc8d21e89..21b970561d7c9 100644
--- a/paddle/fluid/primitive/utils/static_utils.cc
+++ b/paddle/fluid/primitive/utils/static_utils.cc
@@ -21,5 +21,48 @@ void set_output(const paddle::Tensor& x_tmp, paddle::Tensor* x) {
   x->set_impl(x_tmp.impl());
 }

+/**
+ * @brief Set outputs that need no gradient in the new IR.
+ *
+ * In the new IR, a None type expresses that a value
+ * is not available.
+ * Some outputs in vjp are marked as unnecessary
+ * by stop_gradient being True. Therefore the
+ * types of those unnecessary outputs will
+ * be set to None.
+ * + */ +void SetOutputWithNoGrads( + const std::vector>& outputs, + const std::vector>& stop_gradients) { + for (size_t i = 0; i < outputs.size(); ++i) { + for (size_t j = 0; j < outputs[i].size(); ++j) { + if (stop_gradients[i][j]) { + std::static_pointer_cast(outputs[i][j].impl()) + ->set_empty_type(); + } + } + } +} + +std::vector> ConstructVjpResultByStopGradients( + const std::vector>& outputs, + const std::vector>& stop_gradients) { + SetOutputWithNoGrads(outputs, stop_gradients); + std::vector> vjp_results(outputs.size()); + for (size_t i = 0; i < outputs.size(); ++i) { + vjp_results[i].reserve(outputs[i].size()); + for (size_t j = 0; j < outputs[i].size(); ++j) { + if (stop_gradients[i][j]) { + // Use Tensor's impl is nullptr to indicate it has no gradient + vjp_results[i].emplace_back(Tensor()); + } else { + vjp_results[i].emplace_back(outputs[i][j]); + } + } + } + return vjp_results; +} + } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/primitive/utils/utils.h b/paddle/fluid/primitive/utils/utils.h index e1765357aa9f8..3a5205c256130 100644 --- a/paddle/fluid/primitive/utils/utils.h +++ b/paddle/fluid/primitive/utils/utils.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/primitive/type/lazy_tensor.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/ddim.h" @@ -87,5 +88,12 @@ static phi::DDim get_reduce_dims(const phi::DDim& x_dims, return get_reduce_dims_from_out(out_dims, x_dims); } +void SetOutputWithNoGrads(const std::vector>& outputs, + const std::vector>& stop_gradients); + +std::vector> ConstructVjpResultByStopGradients( + const std::vector>& outputs, + const std::vector>& stop_gradients); + } // namespace primitive } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 30cb90a5d2042..6c0c0fb4f81f2 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -39,10 +39,10 @@ set(PYBIND_DEPS phi_utils phi phi_kernel_adaptor - pd_dialect + pd_op_dialect program_translator pd_inplace_pass - ir + pir new_profiler jit_layer jit_property @@ -344,7 +344,7 @@ if(WITH_PYTHON) add_custom_command( OUTPUT ${op_impl_path}/ir.dll COMMAND ${CMAKE_COMMAND} -E copy ${IR_LIB} ${op_impl_path} - DEPENDS ir) + DEPENDS pir) list(APPEND EAGER_OP_IMPL_DEPS ${op_impl_path}/ir.dll) endif() diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index 8cf3a4dbbab07..27d6a75ba0736 100644 --- a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -14,6 +14,7 @@ #include #include +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" @@ -381,6 +382,44 @@ void BindAutoParallel(py::module *m) { } return self.InferForward(ctx); }) + .def("infer_forward", // for op that have vector argument + [](const phi::distributed::SpmdRule &self, + const std::vector> &input_ranges, + const std::vector &input_specs, + const std::vector &attrs) { + /* + to distingish between single tensor argument and vector argument of + one tensor: start - end == 0: single tensor start - end == 1: + vector containing one tensor input_ranges: [(0, 0), (1, 3), (3, 4)] + + input_specs: [t0, t1, t2, t3] --> t0, [t1, t2], [t3] + */ + phi::distributed::InferSpmdContext ctx; + paddle::small_vector + ins; + for (auto &range : input_ranges) { + if (range.second - range.first == 0) { + auto &in = input_specs.at(range.first); + 
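// (input_ranges describes how the flat input_specs list maps onto the op's
//  arguments: a range with start == end denotes a single-tensor argument
//  taken at input_specs[start], while a non-empty range [start, end) is
//  packed into one vector-of-tensors argument via EmplaceBackInputs -- so
//  [(0, 0), (1, 3), (3, 4)] over [t0, t1, t2, t3] yields t0, [t1, t2], [t3].)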
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor( + phi::make_ddim(in.shape()), in.dist_attr())); + } else { + int start = range.first; + int end = range.second; + ins.reserve(end - start); + for (int i = start; i < end; ++i) { + auto &in = input_specs.at(i); + ins.emplace_back(phi::distributed::DistMetaTensor( + phi::make_ddim(in.shape()), in.dist_attr())); + } + ctx.EmplaceBackInputs(ins); + ins.clear(); + } + } + for (auto &attr : attrs) { + ctx.EmplaceBackAttr(attr); + } + return self.InferForward(ctx); + }) .def("infer_backward", [](const phi::distributed::SpmdRule &self, const std::vector &input_specs, @@ -399,6 +438,44 @@ void BindAutoParallel(py::module *m) { ctx.EmplaceBackAttr(attr); } return self.InferBackward(ctx); + }) + .def("infer_backward", // for op that have vector argument + [](const phi::distributed::SpmdRule &self, + const std::vector> &input_ranges, + const std::vector &input_specs, + const std::vector &attrs) { + /* + to distingish between single tensor argument and vector argument of + one tensor: start - end == 0: single tensor start - end == 1: + vector containing one tensor input_ranges: [(0, 0), (1, 3), (3, 4)] + + input_specs: [t0, t1, t2, t3] --> t0, [t1, t2], [t3] + */ + phi::distributed::InferSpmdContext ctx; + paddle::small_vector + ins; + for (auto &range : input_ranges) { + if (range.second - range.first == 0) { + auto &in = input_specs.at(range.first); + ctx.EmplaceBackInput(phi::distributed::DistMetaTensor( + phi::make_ddim(in.shape()), in.dist_attr())); + } else { + int start = range.first; + int end = range.second; + ins.reserve(end - start); + for (int i = start; i < end; ++i) { + auto &in = input_specs.at(i); + ins.emplace_back(phi::distributed::DistMetaTensor( + phi::make_ddim(in.shape()), in.dist_attr())); + } + ctx.EmplaceBackInputs(ins); + ins.clear(); + } + } + for (auto &attr : attrs) { + ctx.EmplaceBackAttr(attr); + } + return self.InferBackward(ctx); }); py::class_(*m, "DistTensorSpec") diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 2b8969e1b8181..2a6c639735a2b 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -98,23 +98,22 @@ void BindCudaStream(py::module *m_ptr) { The handle of the CUDA stream. Parameters: - device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. - If device is None or negative integer, device will be the current device. - If device is positive integer, it must less than the device count. Default: None. - - priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). - If priority is None, the priority is 2(normal). Default: None. + device(paddle.CUDAPlace()|int|None, optional): The device which wanted to allocate the stream. + If device is None or negative integer, device will be the current device. + If device is positive integer, it must less than the device count. Default: None. + priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). + If priority is None, the priority is 2(normal). Default: None. Examples: - .. code-block:: python + .. 
code-block:: python - # required: gpu - import paddle - s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - s2 = paddle.device.cuda.Stream(0, 1) - s3 = paddle.device.cuda.Stream() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + >>> s2 = paddle.device.cuda.Stream(0, 1) + >>> s3 = paddle.device.cuda.Stream() - )DOC") + )DOC") #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def( "wait_event", @@ -122,21 +121,20 @@ void BindCudaStream(py::module *m_ptr) { self.WaitEvent(event.GetRawCudaEvent()); }, R"DOC( - Makes all future work submitted to stream wait for all work captured in event. - - Parameters: - event(CUDAEvent): The event to wait on. + Makes all future work submitted to stream wait for all work captured in event. - Examples: - .. code-block:: python + Parameters: + event(CUDAEvent): The event to wait on. - # required: gpu - import paddle - s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - event = paddle.device.cuda.Event() - s.wait_event(event) + Examples: + .. code-block:: python - )DOC") + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + >>> event = paddle.device.cuda.Event() + >>> s.wait_event(event) + )DOC") .def( "wait_stream", [](phi::CUDAStream &self, phi::CUDAStream &stream) { @@ -145,53 +143,53 @@ void BindCudaStream(py::module *m_ptr) { self.WaitEvent(event.GetRawCudaEvent()); }, R"DOC( - Synchronizes with the given stream. + Synchronizes with the given stream. - Parameters: - stream(CUDAStream): The stream to synchronize with. + Parameters: + stream(CUDAStream): The stream to synchronize with. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: gpu - import paddle - s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - s2 = paddle.device.cuda.Stream(0, 1) - s1.wait_stream(s2) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> s1 = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + >>> s2 = paddle.device.cuda.Stream(0, 1) + >>> s1.wait_stream(s2) - )DOC") + )DOC") .def( "query", [](phi::CUDAStream &self) { return self.Query(); }, R"DOC( - Return the status whether if all operations in stream have completed. + Return the status whether if all operations in stream have completed. - Returns: A boolean value. + Returns: A boolean value. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: gpu - import paddle - s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - is_done = s.query() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + >>> is_done = s.query() - )DOC") + )DOC") .def( "synchronize", [](phi::CUDAStream &self) { self.Synchronize(); }, R"DOC( - Waits for stream tasks to complete. + Waits for stream tasks to complete. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: gpu - import paddle - s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) - s.synchronize() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> s = paddle.device.cuda.Stream(paddle.CUDAPlace(0), 1) + >>> s.synchronize() - )DOC") + )DOC") .def( "record_event", [](phi::CUDAStream &self, paddle::platform::CudaEvent *event) { @@ -202,24 +200,24 @@ void BindCudaStream(py::module *m_ptr) { return event; }, R"DOC( - Record a CUDA event in the stream. + Record a CUDA event in the stream. - Parameters: - event(CUDAEvent, optional): The event to be record. 
.def_property_readonly( "cuda_stream", @@ -228,21 +226,21 @@ void BindCudaStream(py::module *m_ptr) { return reinterpret_cast(self.raw_stream()); }, R"DOC( - retrun the raw cuda stream of type cudaStream_t as type int. + Return the raw cuda stream of type cudaStream_t as type int. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: gpu - import paddle - import ctypes - cuda_stream = paddle.device.cuda.current_stream().cuda_stream - print(cuda_stream) + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> import ctypes + >>> cuda_stream = paddle.device.cuda.current_stream().cuda_stream + >>> print(cuda_stream) - ptr = ctypes.c_void_p(cuda_stream) # convert back to void* - print(ptr) + >>> ptr = ctypes.c_void_p(cuda_stream) # convert back to void* + >>> print(ptr) - )DOC") + )DOC") .def_property_readonly("place", [](phi::CUDAStream &self) { return platform::CUDAPlace(self.place()); @@ -322,18 +320,18 @@ void BindCudaStream(py::module *m_ptr) { The handle of the CUDA event. Parameters: - enable_timing(bool, optional): Whether the event will measure time. Default: False. - blocking(bool, optional): Whether the wait() func will be blocking. Default: False; - interprocess(bool, optional): Whether the event can be shared between processes. Default: False. + enable_timing(bool, optional): Whether the event will measure time. Default: False. + blocking(bool, optional): Whether the wait() function will be blocking. Default: False. + interprocess(bool, optional): Whether the event can be shared between processes. Default: False. Examples: - .. code-block:: python + .. code-block:: python - # required: gpu - import paddle - event = paddle.device.cuda.Event() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> event = paddle.device.cuda.Event() - )DOC") + )DOC")
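A short construction sketch for the flags documented above; the keyword names follow the parameter list, which is an assumption about the binding's argument names:

.. code-block:: python

    >>> # doctest: +REQUIRES(env:GPU)
    >>> import paddle
    >>> # an event usable for timing; wait() will not busy-block, no IPC sharing
    >>> e = paddle.device.cuda.Event(enable_timing=True, blocking=False, interprocess=False)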
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def( "record", @@ -347,17 +345,18 @@ Records the event in the given stream. Parameters: - stream(CUDAStream, optional): The handle of CUDA stream. If None, the stream is the current stream. Default: None. + stream(CUDAStream, optional): The handle of CUDA stream. If None, the stream is the current stream. Default: None. Examples: - .. code-block:: python + .. code-block:: python - # required: gpu - import paddle - event = paddle.device.cuda.Event() - event.record() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> event = paddle.device.cuda.Event() + >>> event.record() - )DOC", + )DOC", py::arg("stream") = nullptr) .def( "query", [](paddle::platform::CudaEvent &self) { @@ -368,14 +367,15 @@ Returns: A boolean which indicates all work currently captured by the event has been completed. Examples: - .. code-block:: python - # required: gpu - import paddle - event = paddle.device.cuda.Event() - is_done = event.query() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> event = paddle.device.cuda.Event() + >>> is_done = event.query() - )DOC") + )DOC") .def( "synchronize", [](paddle::platform::CudaEvent &self) { self.Synchronize(); }, @@ -383,14 +383,15 @@ Waits for an event to complete. Examples: - .. code-block:: python + .. code-block:: python - # required: gpu - import paddle - event = paddle.device.cuda.Event() - event.synchronize() + >>> # doctest: +REQUIRES(env:GPU) + >>> import paddle + >>> paddle.device.set_device('gpu') + >>> event = paddle.device.cuda.Event() + >>> event.synchronize() - )DOC") + )DOC") #endif .def( "__init__", diff --git a/paddle/fluid/pybind/custom_device_py.cc b/paddle/fluid/pybind/custom_device_py.cc index 0f0caa7fcdd0f..15415a86db422 100644 --- a/paddle/fluid/pybind/custom_device_py.cc +++ b/paddle/fluid/pybind/custom_device_py.cc @@ -110,29 +110,26 @@ void BindCustomDevicePy(py::module *m_ptr) { The handle of the custom device stream. Parameters: - device(paddle.CustomPlace()|str): The device which wanted to allocate the stream. - - device_id(int, optional): The id of the device which wanted to allocate the stream. - If device is None or negative integer, device will be the current device. - If device is positive integer, it must less than the device count. Default: None. - - priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). - If priority is None, the priority is 2(normal). Default: None. - - blocking(int|None, optional): Whether the stream is executed synchronously. Default: False. + device(paddle.CustomPlace()|str): The device where the stream is allocated. + device_id(int, optional): The id of the device where the stream is allocated. + If device is None or a negative integer, the current device is used. + If device is a positive integer, it must be less than the device count. Default: None. + priority(int|None, optional): The priority of stream. The priority can be 1(high) or 2(normal). + If priority is None, the priority is 2(normal). Default: None. + blocking(int|None, optional): Whether the stream is executed synchronously. Default: False. Examples: - .. code-block:: python + .. code-block:: python - # required: custom_device - import paddle - s3 = paddle.device.custom.Stream('custom_cpu') - s2 = paddle.device.custom.Stream('custom_cpu', 0) - s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu')) - s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1) - s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1, True) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> s3 = paddle.device.custom.Stream('custom_cpu') + >>> s2 = paddle.device.custom.Stream('custom_cpu', 0) + >>> s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu')) + >>> s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1) + >>> s1 = paddle.device.custom.Stream(paddle.CustomPlace('custom_cpu'), 1, True) - )DOC") + )DOC") .def( "__init__", [](phi::stream::Stream &self, @@ -196,22 +193,22 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - Makes all future work submitted to stream wait for all work captured in event. + Makes all future work submitted to stream wait for all work captured in event.
- Parameters: - event(CustomDeviceEvent): The event to wait on. + Parameters: + event(CustomDeviceEvent): The event to wait on. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - s = paddle.device.custom.Stream(place) - event = paddle.device.custom.Event(place) - s.wait_event(event) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> s = paddle.device.custom.Stream(place) + >>> event = paddle.device.custom.Event(place) + >>> s.wait_event(event) - )DOC") + )DOC") .def( "wait_stream", [](const phi::stream::Stream &self, phi::stream::Stream *other) { @@ -227,22 +224,22 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - Synchronizes with the given stream. + Synchronizes with the given stream. - Parameters: - stream(CUDAStream): The stream to synchronize with. + Parameters: + stream(CustomDeviceStream): The stream to synchronize with. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - s1 = paddle.device.custom.Stream(place) - s2 = paddle.device.custom.Stream(place) - s1.wait_stream(s2) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> s1 = paddle.device.custom.Stream(place) + >>> s2 = paddle.device.custom.Stream(place) + >>> s1.wait_stream(s2) - )DOC") + )DOC") .def( "query", [](const phi::stream::Stream &self) { @@ -255,20 +252,21 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - Return the status whether if all operations in stream have completed. + Return whether all operations in the stream have completed. - Returns: A boolean value. + Returns: + A boolean value. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - s = paddle.device.custom.Stream(place) - is_done = s.query() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> s = paddle.device.custom.Stream(place) + >>> is_done = s.query() - )DOC") + )DOC") .def( "synchronize", [](const phi::stream::Stream &self) { @@ -281,18 +279,18 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - Waits for stream tasks to complete. + Waits for stream tasks to complete. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - s = paddle.device.custom.Stream(place) - s.synchronize() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> s = paddle.device.custom.Stream(place) + >>> s.synchronize() - )DOC") + )DOC") .def( "record_event", [](const phi::stream::Stream &self, phi::event::Event *event) { @@ -310,25 +308,25 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - Record an event in the stream. + Record an event in the stream. - Parameters: - event(CustomDeviceEvent, optional): The event to be record. If event is None, a new event is created. - Default: None. + Parameters: + event(CustomDeviceEvent, optional): The event to be recorded. If event is None, a new event is created. + Default: None. - Returns: - The record event. + Returns: + The recorded event. - Examples: - ..
code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - s = paddle.device.custom.Stream(place) - event = s.record_event() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> s = paddle.device.custom.Stream(place) + >>> event = s.record_event() - )DOC", + )DOC", py::arg("event") = nullptr) .def_property_readonly( "raw_stream", @@ -343,21 +341,21 @@ void BindCustomDevicePy(py::module *m_ptr) { #endif }, R"DOC( - return the raw stream of type CustomDeviceStream as type int. + Return the raw stream of type CustomDeviceStream as type int. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # required: custom_device - import paddle - import ctypes - stream = paddle.device.custom.current_stream().raw_stream - print(stream) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> import ctypes + >>> stream = paddle.device.custom.current_stream().raw_stream + >>> print(stream) - ptr = ctypes.c_void_p(stream) # convert back to void* - print(ptr) + >>> ptr = ctypes.c_void_p(stream) # convert back to void* + >>> print(ptr) - )DOC") + )DOC") .def_property_readonly("place", [](const phi::stream::Stream &self) { #ifdef PADDLE_WITH_CUSTOM_DEVICE return reinterpret_cast(self.GetPlace()); @@ -373,27 +371,23 @@ void BindCustomDevicePy(py::module *m_ptr) { The handle of the custom device event. Parameters: - device(paddle.CustomPlace()|str): The device which wanted to allocate the stream. - - device_id(int, optional): The id of the device which wanted to allocate the stream. - If device is None or negative integer, device will be the current device. - If device is positive integer, it must less than the device count. Default: None. - - enable_timing(bool, optional): Whether the event will measure time. Default: False. - - blocking(bool, optional): Whether the wait() func will be blocking. Default: False; - - interprocess(bool, optional): Whether the event can be shared between processes. Default: False. + device(paddle.CustomPlace()|str): The device where the stream is allocated. + device_id(int, optional): The id of the device where the stream is allocated. + If device is None or a negative integer, the current device is used. + If device is a positive integer, it must be less than the device count. Default: None. + enable_timing(bool, optional): Whether the event will measure time. Default: False. + blocking(bool, optional): Whether the wait() function will be blocking. Default: False. + interprocess(bool, optional): Whether the event can be shared between processes. Default: False. Examples: - .. code-block:: python + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - event = paddle.device.custom.Event(place) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> event = paddle.device.custom.Event(place) - )DOC") + )DOC")
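An end-to-end sketch tying the custom-device stream and event bindings together; as in the surrounding examples, it assumes a device plugin registered as 'custom_cpu':

.. code-block:: python

    >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE)
    >>> import paddle
    >>> place = paddle.CustomPlace('custom_cpu', 0)
    >>> s1 = paddle.device.custom.Stream(place)
    >>> s2 = paddle.device.custom.Stream(place)
    >>> e = s1.record_event()  # capture the current tail of s1
    >>> s2.wait_event(e)       # order s2 behind the captured work
    >>> s2.synchronize()
    >>> assert s1.query() and s2.query()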
.def( "__init__", [](phi::event::Event &self, @@ -483,18 +477,18 @@ void BindCustomDevicePy(py::module *m_ptr) { Records the event in the given stream. Parameters: - stream(CustomDeviceStream, optional): The handle of custom device stream. If None, the stream is the current stream. Default: None. + stream(CustomDeviceStream, optional): The handle of custom device stream. If None, the stream is the current stream. Default: None. Examples: - .. code-block:: python + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - event = paddle.device.custom.Event(place) - event.record() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> event = paddle.device.custom.Event(place) + >>> event.record() - )DOC") + )DOC") .def( "query", [](const phi::event::Event &self) { @@ -509,18 +503,19 @@ R"DOC( Queries the event's status. - Returns: A boolean which indicates all work currently captured by the event has been completed. + Returns: + A boolean which indicates all work currently captured by the event has been completed. Examples: - .. code-block:: python + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - event = paddle.device.cuda.Event(place) - is_done = event.query() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> event = paddle.device.custom.Event(place) + >>> is_done = event.query() - )DOC") + )DOC") .def( "synchronize", [](const phi::event::Event &self) { @@ -536,15 +531,15 @@ Waits for an event to complete. Examples: - .. code-block:: python + .. code-block:: python - # required: custom_device - import paddle - place = paddle.CustomPlace('custom_cpu', 0) - event = paddle.device.custom.Event(place) - event.synchronize() + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> event = paddle.device.custom.Event(place) + >>> event.synchronize() - )DOC") + )DOC") .def_property_readonly( "raw_event", [](const phi::event::Event &self) { @@ -558,23 +553,23 @@ #endif }, R"DOC( - return the raw event of type CustomDeviceEvent as type int. + Return the raw event of type CustomDeviceEvent as type int. - Examples: - .. code-block:: python + Examples: + ..
code-block:: python - # required: custom_device - import paddle - import ctypes - place = paddle.CustomPlace('custom_cpu', 0) - event = paddle.device.custom.Event(place) - raw_event = event.raw_event - print(raw_event) + >>> # doctest: +REQUIRES(env:CUSTOM_DEVICE) + >>> import paddle + >>> import ctypes + >>> place = paddle.CustomPlace('custom_cpu', 0) + >>> event = paddle.device.custom.Event(place) + >>> raw_event = event.raw_event + >>> print(raw_event) - ptr = ctypes.c_void_p(raw_event) # convert back to void* - print(ptr) + >>> ptr = ctypes.c_void_p(raw_event) # convert back to void* + >>> print(ptr) - )DOC") + )DOC") .def_property_readonly("place", [](const phi::event::Event &self) { #ifdef PADDLE_WITH_CUSTOM_DEVICE return reinterpret_cast(self.GetPlace()); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index d03a20537eee6..e63790a65dfc8 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -64,6 +64,7 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/api/include/tensor_operants.h" +#include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_string(tensor_operants_mode); @@ -549,12 +550,34 @@ static PyObject* eager_api_run_custom_op(PyObject* self, continue; } if (paddle::framework::detail::IsDuplicableVar(input)) { - ctx.EmplaceBackInputs(std::move(CastPyArg2VectorOfTensor(obj, i + 1))); + std::vector tensors = + std::move(CastPyArg2VectorOfTensor(obj, i + 1)); + for (auto& tensor : tensors) { + if (tensor.initialized() && tensor.is_dense_tensor() && + !std::dynamic_pointer_cast(tensor.impl()) + ->meta() + .is_contiguous()) { + tensor.set_impl(std::make_shared( + std::move(paddle::experimental::Trans2Contiguous( + *(std::dynamic_pointer_cast( + tensor.impl())))))); + } + } + ctx.EmplaceBackInputs(std::move(tensors)); VLOG(7) << "Custom operator add input " << input << " to CustomOpKernelContext. Add vector size = " << ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first; } else { - ctx.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, i + 1))); + paddle::Tensor tensor = std::move(CastPyArg2Tensor(obj, i + 1)); + if (tensor.initialized() && tensor.is_dense_tensor() && + !std::dynamic_pointer_cast(tensor.impl()) + ->meta() + .is_contiguous()) { + tensor.set_impl(std::make_shared( + std::move(paddle::experimental::Trans2Contiguous(*( + std::dynamic_pointer_cast(tensor.impl())))))); + } + ctx.EmplaceBackInput(std::move(tensor)); VLOG(7) << "Custom operator add input " << input << " to CustomOpKernelContext. 
Add Tensor for general case."; } diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index 1deb20fbf9b88..1c40ce4275c42 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -21,7 +21,7 @@ namespace paddle { namespace pybind { -static PyObject *eager_api_run_program(PyObject *self, +static PyObject *eager_api_run_program(PyObject *self, // TOREMOVE PyObject *args, PyObject *kwargs) { PyThreadState *tstate = nullptr; @@ -61,11 +61,58 @@ static PyObject *eager_api_run_program(PyObject *self, } } +static PyObject *newir_eager_api_run_program(PyObject *self, + PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto X = GetTensorListFromArgs("run_program", "X", args, 0, true); + auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true); + auto OutScope = + GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); + auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); + framework::AttributeMap attrs; + // TODO(zengjinle): support CUDA Graph on eager mode + VLOG(1) << "Start NewIR ConstructAttrMapFromPyArgs"; + + ConstructAttrMapForRunProgram( + "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); + + VLOG(1) << "Finish NewIR ConstructAttrMapFromPyArgs"; + tstate = PyEval_SaveThread(); + newir_run_program_ad_func(X, Params, Out, OutScope, DOut, attrs); + PyEval_RestoreThread(tstate); + tstate = nullptr; + Py_RETURN_NONE; + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < run_program > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } catch (...) 
{ + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + static PyMethodDef CustomEagerMethods[] = { {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, + {"newir_run_program", + (PyCFunction)(void (*)(void))newir_eager_api_run_program, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4c1fb0b431070..4046ef525bfd6 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -1665,14 +1665,17 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, // use inplace set_value_ operator if (value_tensor.initialized() && (self->tensor.dtype() != value_tensor.dtype())) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } if (self->tensor.dtype() != value_tensor.dtype()) { value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 95d86f544c4bf..84418058aa9f5 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/eager_utils.h" #include -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/value.h" // Avoid a problem with copysign defined in pyconfig.h on Windows. #ifdef copysign #undef copysign @@ -138,6 +138,25 @@ bool PyObject_CheckIROpResult(PyObject* obj) { return PyObject_TypeCheck(obj, g_ir_opresult_pytype); } +bool PyObject_CheckIRVectorOfOpResult(PyObject* obj) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + // if obj is [], parse it as std::vector + if (len == 0) { + return false; + } + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (!PyObject_CheckIROpResult(item)) { + return false; + } + } + return true; + } else { + return false; + } +} bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos) { if (obj == Py_None) { return false; // To be compatible with QA integration testing. 
Some @@ -888,13 +907,13 @@ PyObject* ToPyObject(const phi::DenseTensor* value) { return obj.ptr(); } -PyObject* ToPyObject(const ir::OpResult& value) { +PyObject* ToPyObject(const pir::OpResult& value) { auto obj = ::pybind11::cast(value); obj.inc_ref(); return obj.ptr(); } -PyObject* ToPyObject(const std::vector& value) { +PyObject* ToPyObject(const std::vector& value) { PyObject* result = PyList_New((Py_ssize_t)value.size()); for (size_t i = 0; i < value.size(); i++) { @@ -1485,13 +1504,13 @@ paddle::experimental::Scalar CastNumpy2Scalar(PyObject* obj, } } -ir::OpResult CastPyArg2OpResult(PyObject* obj, - const std::string& op_type, - size_t arg_pos) { +pir::OpResult CastPyArg2OpResult(PyObject* obj, + const std::string& op_type, + size_t arg_pos) { if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) { - return ::pybind11::handle(obj).cast(); + return ::pybind11::handle(obj).cast(); } else if (obj == nullptr || obj == Py_None) { - return ir::OpResult(); + return pir::OpResult(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s(): argument (position %d) must be " @@ -1502,17 +1521,17 @@ ir::OpResult CastPyArg2OpResult(PyObject* obj, } } -std::vector CastPyArg2VectorOfOpResult(PyObject* obj, - const std::string& op_type, - size_t arg_pos) { - std::vector result_list; +std::vector CastPyArg2VectorOfOpResult( + PyObject* obj, const std::string& op_type, size_t arg_pos) { + std::vector result_list; if (PyList_Check(obj)) { Py_ssize_t len = PyList_Size(obj); PyObject* item = nullptr; for (Py_ssize_t i = 0; i < len; i++) { item = PyList_GetItem(obj, i); if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) { - result_list.emplace_back(::pybind11::handle(item).cast()); + result_list.emplace_back( + ::pybind11::handle(item).cast()); } else if (item == Py_None) { continue; } else { @@ -1531,7 +1550,8 @@ std::vector CastPyArg2VectorOfOpResult(PyObject* obj, for (Py_ssize_t i = 0; i < len; i++) { item = PyTuple_GetItem(obj, i); if (PyObject_TypeCheck(item, g_ir_opresult_pytype)) { - result_list.emplace_back(::pybind11::handle(item).cast()); + result_list.emplace_back( + ::pybind11::handle(item).cast()); } else if (item == Py_None) { continue; } else { @@ -1545,7 +1565,7 @@ std::vector CastPyArg2VectorOfOpResult(PyObject* obj, } } } else if (PyObject_TypeCheck(obj, g_ir_opresult_pytype)) { - return {::pybind11::handle(obj).cast()}; + return {::pybind11::handle(obj).cast()}; } else if (obj == Py_None) { return {}; } else { @@ -1697,7 +1717,6 @@ paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, arg_pos + 1, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT } - // Fake a IntArray return paddle::experimental::IntArray({1}); } diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index ad7ec2d42c437..ba2368c9b6bb2 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -29,7 +29,6 @@ typedef SSIZE_T ssize_t; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/jit/function.h" #include "paddle/fluid/platform/place.h" -#include "paddle/ir/core/value.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/int_array.h" @@ -38,6 +37,7 @@ typedef SSIZE_T ssize_t; #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/selected_rows.h" +#include "paddle/pir/core/op_result.h" #include "paddle/utils/pybind.h" #include "pybind11/pybind11.h" #include 
"pybind11/stl.h" @@ -57,6 +57,7 @@ bool PyObject_CheckLongOrConvertToLong(PyObject** obj); bool PyObject_CheckFloatOrConvertToFloat(PyObject** obj); bool PyObject_CheckStr(PyObject* obj); bool PyObject_CheckIROpResult(PyObject* obj); +bool PyObject_CheckIRVectorOfOpResult(PyObject* obj); bool CastPyArg2AttrBoolean(PyObject* obj, ssize_t arg_pos); int CastPyArg2AttrInt(PyObject* obj, ssize_t arg_pos); int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); @@ -75,12 +76,11 @@ std::vector CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); std::vector CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos); std::vector CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos); std::vector CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos); -ir::OpResult CastPyArg2OpResult(PyObject* obj, - const std::string& op_type, - size_t arg_pos); -std::vector CastPyArg2VectorOfOpResult(PyObject* obj, - const std::string& op_type, - size_t arg_pos); +pir::OpResult CastPyArg2OpResult(PyObject* obj, + const std::string& op_type, + size_t arg_pos); +std::vector CastPyArg2VectorOfOpResult( + PyObject* obj, const std::string& op_type, size_t arg_pos); std::vector> CastPyArg2VectorOfVectorOfSize_t( PyObject* obj, size_t arg_pos); framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, @@ -131,8 +131,8 @@ PyObject* ToPyObject(const paddle::framework::Vocab& value); PyObject* ToPyObject(std::shared_ptr grad_node); -PyObject* ToPyObject(const ir::OpResult& value); -PyObject* ToPyObject(const std::vector& value); +PyObject* ToPyObject(const pir::OpResult& value); +PyObject* ToPyObject(const std::vector& value); class PyTensorHook : public egr::TensorHook { public: diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1690d738a2c60..66f24b6f03fc3 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -245,6 +245,8 @@ paddle_infer::PlaceType ToPaddleInferPlace( return paddle_infer::PlaceType::kGPU; } else if (allocation_type == phi::AllocationType::XPU) { return paddle_infer::PlaceType::kXPU; + } else if (allocation_type == phi::AllocationType::CUSTOM) { + return paddle_infer::PlaceType::kCUSTOM; } else { return paddle_infer::PlaceType::kCPU; } @@ -975,19 +977,19 @@ void BindAnalysisConfig(py::module *m) { .def("disable_mkldnn_fc_passes", &AnalysisConfig::DisableMkldnnFcPasses, R"DOC( - Disable Mkldnn FC - Args: + Disable Mkldnn FC + Returns: None. - Returns: - None. - Examples: - .. code-block:: python - from paddle.inference import Config - - config = Config("") - config.enable_mkldnn() - config.disable_mkldnn_fc_passes() - )DOC") + + Examples: + .. 
code-block:: python + + >>> from paddle.inference import Config + + >>> config = Config("") + >>> config.enable_mkldnn() + >>> config.disable_mkldnn_fc_passes() + )DOC") #endif .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index 4dc36fe785ecc..465a8719b3c7f 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -25,38 +25,40 @@ #include "paddle/fluid/pybind/pybind_variant_caster.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/op_yaml_info.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/api_builder.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_dialect.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_type.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/utils/utils.h" -#include "paddle/fluid/ir/transforms/inplace_pass.h" #include "paddle/fluid/ir_adaptor/translator/translate.h" #include "paddle/fluid/ir_adaptor/translator/utils.h" -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/value.h" -#include "paddle/ir/pass/pass.h" -#include "paddle/ir/pass/pass_manager.h" -#include "paddle/ir/pass/pass_registry.h" -#include "paddle/ir/transforms/dead_code_elimination_pass.h" +#include "paddle/fluid/pir/dialect/operator/interface/op_yaml_info.h" +#include "paddle/fluid/pir/dialect/operator/ir/api_builder.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" +#include "paddle/fluid/pir/dialect/operator/ir/op_type.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" +#include "paddle/fluid/pir/dialect/operator/utils/utils.h" +#include "paddle/fluid/pir/transforms/inplace_pass.h" #include "paddle/phi/core/enforce.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/value.h" +#include "paddle/pir/pass/pass.h" +#include "paddle/pir/pass/pass_manager.h" +#include "paddle/pir/pass/pass_registry.h" +#include "paddle/pir/transforms/dead_code_elimination_pass.h" #include "pybind11/stl.h" namespace py = pybind11; -using ir::Block; -using ir::Operation; -using ir::OpOperand; -using ir::OpResult; -using ir::Pass; -using ir::PassManager; -using ir::Program; -using ir::Type; -using ir::Value; using paddle::dialect::APIBuilder; using paddle::dialect::DenseTensorType; +using pir::Block; +using pir::Operation; +using pir::OpOperand; +using pir::OpResult; +using pir::Pass; +using pir::PassManager; +using pir::Program; +using pir::Type; +using pir::Value; using pybind11::return_value_policy; USE_PASS(dead_code_elimination); @@ -69,6 +71,26 @@ PyTypeObject *g_ir_opresult_pytype = nullptr; void BindOpsAPI(pybind11::module *module); +inline int64_t GetProgramInt64Attr(const std::shared_ptr &program, + const std::string &attr_name, + int64_t default_value = 0) { + auto op = program->module_op(); + if (op->HasAttribute(attr_name)) { + auto val = op->attribute(attr_name).dyn_cast().data(); + return val; + } else { + return default_value; + } +} + +inline void SetProgramInt64Attr(std::shared_ptr program, + const std::string &attr_name, + int64_t value) { + auto op = program->module_op(); + op->set_attribute( + attr_name, 
pir::Int64Attribute::get(pir::IrContext::Instance(), value)); +} + void BindProgram(py::module *m) { py::class_> program(*m, "Program", R"DOC( Create Python Program. Program is an abstraction of model structure, divided into @@ -111,27 +133,42 @@ void BindProgram(py::module *m) { print("start up program is: {}".format(startup_program)) )DOC"); program - .def( - "__init__", - [](Program &self) { new (&self) Program(ir::IrContext::Instance()); }) + .def("__init__", + [](Program &self) { + new (&self) Program(pir::IrContext::Instance()); + }) .def("__str__", [](const std::shared_ptr &self) { std::ostringstream print_stream; self->Print(print_stream); return print_stream.str(); }) + .def("__repr__", + [](const std::shared_ptr &self) { + std::ostringstream print_stream; + self->Print(print_stream); + return print_stream.str(); + }) .def("parameters_num", [](const std::shared_ptr &self) { return self->parameters_num(); }) .def( - "block", + "global_block", [](std::shared_ptr self) { return self->block(); }, return_value_policy::reference) .def( - "block", + "global_block", [](const std::shared_ptr &self) { return self->block(); }, - return_value_policy::reference); + return_value_policy::reference) + .def_property( + "random_seed", + [](const std::shared_ptr &self) { + return GetProgramInt64Attr(self, "random_seed", 0); + }, + [](std::shared_ptr self, int64_t random_seed) { + SetProgramInt64Attr(self, "random_seed", random_seed); + }); } void BindBlock(py::module *m) { @@ -143,8 +180,10 @@ void BindBlock(py::module *m) { use `Program.block()` to get a block. )DOC"); block.def("front", &Block::front, return_value_policy::reference) - .def("get_parent_program", - [](Block &self) { return self.GetParentOp()->GetParentProgram(); }) + .def_property_readonly( + "program", + [](Block &self) { return self.GetParentOp()->GetParentProgram(); }, + return_value_policy::reference) .def_property_readonly( "ops", [](Block &self) -> py::list { @@ -169,7 +208,26 @@ void BindBlock(py::module *m) { Returns: None - )DOC"); + )DOC") + .def("all_parameters", [](Block &self) -> py::list { + py::list param_list; + for (auto iter = self.begin(); iter != self.end(); iter++) { + auto op = *iter; + if (op->HasAttribute(kAttrIsPersisable)) { + auto attrs = op->attribute(kAttrIsPersisable) + .dyn_cast() + .AsVector(); + for (uint32_t i = 0; i < attrs.size(); i++) { + bool is_persistable = + attrs[i].dyn_cast().data(); + if (is_persistable) { + param_list.append(op->result(i)); + } + } + } + } + return param_list; + }); } void BindOperation(py::module *m) { @@ -284,10 +342,10 @@ void BindValue(py::module *m) { .def("__eq__", &Value::operator==) .def("__eq__", [](Value &self, OpResult &other) { - return self.impl() == other.value_impl(); + return self.impl() == other.Value::impl(); }) .def("__hash__", - [](const Value &self) { return std::hash{}(self); }); + [](const Value &self) { return std::hash{}(self); }); } void BindOpOperand(py::module *m) { @@ -311,37 +369,36 @@ void BindOpOperand(py::module *m) { .def("owner", &OpOperand::owner, return_value_policy::reference); } -bool GetStopGradient(const OpResult &self) { +bool GetOpResultBoolAttr(const OpResult &self, const std::string &attr_name) { auto *defining_op = self.owner(); - if (defining_op->HasAttribute(kAttrStopGradients)) { - auto stop_gradients = defining_op->attribute(kAttrStopGradients) - .dyn_cast() - .AsVector(); - return stop_gradients[self.GetResultIndex()] - .dyn_cast() - .data(); + if (defining_op->HasAttribute(attr_name)) { + auto attrs = 
defining_op->attribute(attr_name) + .dyn_cast() + .AsVector(); + return attrs[self.GetResultIndex()].dyn_cast().data(); } else { return false; } } -void SetStopGradient(const OpResult &self, bool stop_gradient) { +void SetOpResultBoolAttr(const OpResult &self, + const std::string &attr_name, + bool value) { auto *defining_op = self.owner(); - std::vector stop_gradients; - if (defining_op->HasAttribute(kAttrStopGradients)) { - stop_gradients = defining_op->attribute(kAttrStopGradients) - .dyn_cast() - .AsVector(); + std::vector attrs; + if (defining_op->HasAttribute(attr_name)) { + attrs = defining_op->attribute(attr_name) + .dyn_cast() + .AsVector(); } else { - stop_gradients = std::vector( + attrs = std::vector( defining_op->num_results(), - ir::BoolAttribute::get(ir::IrContext::Instance(), false)); + pir::BoolAttribute::get(pir::IrContext::Instance(), false)); } - stop_gradients[self.GetResultIndex()] = - ir::BoolAttribute::get(ir::IrContext::Instance(), stop_gradient); + attrs[self.GetResultIndex()] = + pir::BoolAttribute::get(pir::IrContext::Instance(), value); defining_op->set_attribute( - kAttrStopGradients, - ir::ArrayAttribute::get(ir::IrContext::Instance(), stop_gradients)); + attr_name, pir::ArrayAttribute::get(pir::IrContext::Instance(), attrs)); } void BindOpResult(py::module *m) { @@ -356,24 +413,98 @@ void BindOpResult(py::module *m) { op_result.def("__eq__", &OpResult::operator==) .def("__eq__", [](OpResult &self, Value &other) { - return self.value_impl() == other.impl(); + return self.Value::impl() == other.impl(); + }) + .def("__neg__", + [](OpResult &self) { + return paddle::dialect::scale(self, -1.0, 0.0, true); + }) + .def("__add__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::add(self, other); + }) + .def("__sub__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::subtract(self, other); + }) + .def("__mul__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::multiply(self, other); + }) + .def("__truediv__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::divide(self, other); + }) + .def("__lt__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::less_than(self, other); + }) + .def("__le__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::less_equal(self, other); + }) + .def("__gt__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::greater_than(self, other); + }) + .def("__ge__", + [](OpResult &self, OpResult &other) { + return paddle::dialect::greater_equal(self, other); }) .def("__hash__", [](OpResult &self) { - return std::hash{}(self.dyn_cast()); + return std::hash{}(self.dyn_cast()); }) .def("get_defining_op", &OpResult::GetDefiningOp, return_value_policy::reference) + .def_property_readonly( + "block", + [](OpResult &self) { return self.GetDefiningOp()->GetParent(); }, + return_value_policy::reference) + .def_property_readonly( + "name", + [](OpResult &self) { + if (self.GetDefiningOp()->name() == "builtin.get_parameter") { + auto param_name = self.GetDefiningOp() + ->attributes() + .at("parameter_name") + .dyn_cast() + .AsString(); + return param_name; + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Currently, we can only get name of OpResult that is " + "persistable")); + } + }) .def("first_use", &OpResult::first_use, return_value_policy::reference) .def("has_one_use", &Value::HasOneUse) .def("use_empty", &OpResult::use_empty) .def("type", &OpResult::type) + .def("is_dense_tensor_type", + [](OpResult &self) { + if 
(self.type().isa()) { + return true; + } else { + return false; + } + }) .def_property( "stop_gradient", - [](OpResult &self) { return GetStopGradient(self); }, + [](OpResult &self) { + return GetOpResultBoolAttr(self, kAttrStopGradients); + }, [](OpResult &self, bool stop_gradient) { - SetStopGradient(self, stop_gradient); + SetOpResultBoolAttr(self, kAttrStopGradients, stop_gradient); + }) + .def_property( + "is_persistable", + [](OpResult &self) { + return GetOpResultBoolAttr(self, kAttrIsPersisable); + }, + [](OpResult &self, bool is_persistable) { + SetOpResultBoolAttr(self, kAttrIsPersisable, is_persistable); }) .def_property( "shape", @@ -417,7 +548,324 @@ void BindType(py::module *m) { }); } +Operation *BuildOpFrom( + const Operation *to_copy_op, + std::unordered_map &value_map) { // NOLINT + pir::OperationArgument to_create_argument(to_copy_op->info()); + to_create_argument.attributes = to_copy_op->attributes(); + + auto origin_results = to_copy_op->results(); + std::transform(origin_results.begin(), + origin_results.end(), + std::back_inserter(to_create_argument.output_types), + [](const pir::OpResult &r) { + // OpResult -> OpType + return r.type(); + }); + + // transform by value_map dict. + auto origin_operands = to_copy_op->operands(); + std::transform(origin_operands.begin(), + origin_operands.end(), + std::back_inserter(to_create_argument.inputs), + [&value_map](const pir::OpOperand &operand) { + // Operand -> OpResult + return value_map[operand.source()].impl(); + }); + auto *cloned_op = Operation::Create(std::move(to_create_argument)); + + // update the mapping of value_map. std::transform is a map(func, zip()). + std::vector tmp; + std::transform(origin_results.begin(), + origin_results.end(), + cloned_op->results().begin(), + std::back_inserter(tmp), // NOLINT, just a placeholder. + [&value_map](const OpResult &a, const OpResult &b) { // NOLINT + value_map[a.Value::impl()] = b.Value::impl(); + return 1; + }); + return cloned_op; +} + +std::shared_ptr ProgramClone(const Program &program) { + // Limitation of this function: + // 1. don't support Parameters. + // 2. don't support Regions in operator. 
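// A usage sketch for reviewers (hypothetical driver code, not part of this patch):
//   pir::Program src(pir::IrContext::Instance());
//   /* ... populate src ... */
//   auto cloned = ProgramClone(src);  // deep-copies every op into a fresh Program
// Each cloned op gets fresh result Values; value_map records old -> new so that
// operands of later ops are remapped onto the cloned producers.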
+ pir::IrContext *ctx = pir::IrContext::Instance(); + auto cloned_program = std::make_shared(ctx); + std::unordered_map value_map; + for (auto &op : *program.block()) { + auto *cloned_op = BuildOpFrom(op, value_map); + cloned_program->block()->push_back(cloned_op); + } + return cloned_program; +} + +std::list::const_iterator list_offset(const Block *block, + int start_idx) { + auto it = block->begin(); + while (start_idx--) ++it; + return it; +} + +template +void range_block_do(const Block *block, std::vector range, F fn) { + for (auto it = list_offset(block, range[0]); + it != list_offset(block, range[1]); + ++it) { + fn(*it); + } +} + +std::vector AnalysisMiddleVariable( + const Program &program, + const std::vector &forward_inputs, + const std::vector &forward_range, + const std::vector &backward_range) { + std::vector middle_values; + + std::unordered_set backward_inputs; + std::unordered_set x_or_param(forward_inputs.begin(), + forward_inputs.end()); + range_block_do( + program.block(), backward_range, [&backward_inputs](Operation *op) { + for (auto &t : op->operands()) { + backward_inputs.insert(t.source()); + } + }); + + range_block_do( + program.block(), + forward_range, + [&middle_values, &backward_inputs, &x_or_param](Operation *op) { + for (auto &t : op->results()) { + auto v = Value(t.Value::impl()); + if (backward_inputs.count(v) && !x_or_param.count(v)) + middle_values.push_back(v); + } + }); + return middle_values; +} + +void mapping_value(const std::vector &origin, + const std::unordered_map &value_map, + std::vector &out) { // NOLINT + std::transform(origin.begin(), + origin.end(), + std::back_inserter(out), + [&value_map](const pir::Value &v) { + if (v.impl() == nullptr) return Value(nullptr); + return value_map.at(v); + }); +} + +using SplitedProgram = std::vector>; +using SplitedAttribute = std::map>; +using SplitedResult = std::pair; + +pir::OpResult FakeOpResult() { + // create a fake OpResult to simplify `ForwardBackwardSplit`.
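// A null-impl OpResult acts as a positional placeholder: callers can pass it
// where a grad value is absent so that ForwardBackwardSplit keeps its argument
// slots aligned, and the helpers below all check `impl() == nullptr` before use.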
+ return pir::OpResult(nullptr); +} + +SplitedResult ForwardBackwardSplit( + const Program &program, + const std::vector &op_result_forward_inputs, + const std::vector &op_result_forward_outputs, + const std::vector &op_result_forward_inputs_grads, + const std::vector &op_result_forward_outputs_grads, + const std::vector &forward_range, + const std::vector &backward_range) { + // transform OpResult -> Value + VLOG(1) << "Start Prepare data structures."; + std::vector forward_inputs, forward_outputs, forward_inputs_grads, + forward_outputs_grads; + + auto op_result_to_value = [](const pir::OpResult &r) { + if (r.impl() == nullptr) return Value(nullptr); + return Value(r.Value::impl()); + }; + + std::transform(op_result_forward_inputs.begin(), + op_result_forward_inputs.end(), + std::back_inserter(forward_inputs), + op_result_to_value); + std::transform(op_result_forward_outputs.begin(), + op_result_forward_outputs.end(), + std::back_inserter(forward_outputs), + op_result_to_value); + std::transform(op_result_forward_inputs_grads.begin(), + op_result_forward_inputs_grads.end(), + std::back_inserter(forward_inputs_grads), + op_result_to_value); + std::transform(op_result_forward_outputs_grads.begin(), + op_result_forward_outputs_grads.end(), + std::back_inserter(forward_outputs_grads), + op_result_to_value); + + std::vector forward_in_out_values; + for (auto &v : std::vector *>( + {&forward_inputs, &forward_outputs})) { + forward_in_out_values.insert( + forward_in_out_values.end(), v->begin(), v->end()); + } + + std::vector fx, fp, fm, fo, bx, bp, bm, bo_g, bx_g, bp_g, bo; + pir::IrContext *ctx = pir::IrContext::Instance(); + auto forward_program = std::make_shared(ctx); + auto backward_program = std::make_shared(ctx); + auto middle_values = AnalysisMiddleVariable( + program, forward_in_out_values, forward_range, backward_range); + std::unordered_map forward_value_map; + std::unordered_map backward_value_map; + pir::Builder backward_builder = pir::Builder(ctx, backward_program->block()); + + // forward program construct. + VLOG(1) << "Before Forward Construct."; + range_block_do(program.block(), + forward_range, + [&forward_value_map, &forward_program](Operation *op) { + auto *cloned_op = BuildOpFrom(op, forward_value_map); + forward_program->block()->push_back(cloned_op); + }); + VLOG(1) << "After Forward Construct."; + + // backward program construct. + // Step1. insert data op for inputs_values and middle_values + int counter = 0; + auto create_data_fn = [&backward_builder, &backward_value_map, &counter]( + const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto value_type = v.type().dyn_cast(); + auto dtype = paddle::dialect::TransToPhiDataType(value_type.dtype()); + auto shape = phi::vectorize(value_type.dims()); + auto place = phi::CPUPlace(); // TODO(xiongkun): how to get default places.
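// Every value the backward program consumes (forward outputs, inputs, middles,
// and output grads) is re-declared below as a data op named "input_0",
// "input_1", ...; backward_value_map then lets the cloned backward ops read
// these feeds in place of the original forward-program values.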
+ + paddle::dialect::DataOp op = + backward_builder.Build( + std::string("input_") + std::to_string(counter), + shape, + dtype, + place); + counter += 1; + backward_value_map[v] = op->results()[0].Value::impl(); + }; + + auto create_output_fn_forward = [&ctx, + &forward_value_map, + &counter, + &forward_program](const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name()); + pir::AttributeMap attribute_map = { + {"parameter_name", + pir::StrAttribute::get( + ctx, std::string("output_") + std::to_string(counter))}, + }; + pir::Operation *operation = pir::Operation::Create( + {OpResult(forward_value_map[v].impl())}, attribute_map, {}, op_info); + forward_program->block()->push_back(operation); + counter += 1; + }; + + auto create_output_fn_backward = [&ctx, + &backward_value_map, + &counter, + &backward_program](const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name()); + pir::AttributeMap attribute_map = { + {"parameter_name", + pir::StrAttribute::get( + ctx, std::string("output_") + std::to_string(counter))}, + }; + pir::Operation *operation = + pir::Operation::Create({OpResult(backward_value_map.at(v).impl())}, + attribute_map, + {}, + op_info); + backward_program->block()->push_back(operation); + counter += 1; + }; + + counter = 0; + std::for_each(forward_outputs.begin(), forward_outputs.end(), create_data_fn); + std::for_each(forward_inputs.begin(), forward_inputs.end(), create_data_fn); + std::for_each(middle_values.begin(), middle_values.end(), create_data_fn); + std::for_each(forward_outputs_grads.begin(), + forward_outputs_grads.end(), + create_data_fn); + VLOG(1) << "After create pd.data for backward program."; + + counter = 0; + std::for_each( + middle_values.begin(), middle_values.end(), create_output_fn_forward); + std::for_each( + forward_outputs.begin(), forward_outputs.end(), create_output_fn_forward); + + VLOG(1) << "After call create_output_fn"; + // Step2. copy backward ops . + range_block_do(program.block(), + backward_range, + [&backward_value_map, &backward_program](Operation *op) { + auto *cloned_op = BuildOpFrom(op, backward_value_map); + backward_program->block()->push_back(cloned_op); + }); + VLOG(1) << "After call backward copy"; + counter = 0; + std::for_each(forward_inputs_grads.begin(), + forward_inputs_grads.end(), + create_output_fn_backward); + // TODO(xiongkun): add forward parameter grads. + + VLOG(1) << "forward_value_map.size() is " << forward_value_map.size(); + VLOG(1) << "backward_value_map.size() is " << backward_value_map.size(); + std::ostringstream print_stream; + print_stream << "ForwardProgram is :\n"; + forward_program->Print(print_stream); + print_stream << "BackwardProgram is:\n"; + backward_program->Print(print_stream); + VLOG(1) << "Splited Program (fwd | bwd): \n" << print_stream.str(); + + // construct all attributes we needed. 
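// Key to the attribute map assembled below (f* = forward program, b* =
// backward program): fx/bx forward inputs, fm/bm middle values, fo/bo forward
// outputs, bo_g grads of forward outputs, bx_g grads of forward inputs, and
// fp/bp/bp_g parameter slots, which stay empty until parameter support lands
// (see the TODO above).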
+ + mapping_value(middle_values, forward_value_map, fm); // write 'fm' + mapping_value(middle_values, backward_value_map, bm); // write 'bm' + mapping_value(forward_inputs, forward_value_map, fx); // write 'fx' + mapping_value(forward_inputs, backward_value_map, bx); // write 'bx' + mapping_value(forward_outputs, forward_value_map, fo); // write 'fo' + mapping_value( + forward_inputs_grads, backward_value_map, bx_g); // write 'bx_g' + mapping_value( + forward_outputs_grads, backward_value_map, bo_g); // write 'bo_g' + mapping_value(forward_outputs, backward_value_map, bo); // write 'bo' + + std::map> attr = {{"fx", fx}, + {"fp", fp}, + {"fm", fm}, + {"fo", fo}, + {"bx", bx}, + {"bp", bp}, + {"bm", bm}, + {"bo_g", bo_g}, + {"bx_g", bx_g}, + {"bp_g", bp_g}, + {"bo", bo}}; + std::vector> programs = {forward_program, + backward_program}; + return std::make_pair(programs, attr); +} + void BindUtils(pybind11::module *m) { + m->def("program_clone", ProgramClone); + m->def("program_split", ForwardBackwardSplit); + m->def("fake_op_result", FakeOpResult); m->def("set_global_program", [](Program *program) { APIBuilder::Instance().SetProgram(program); }); m->def("set_insertion_point", @@ -427,8 +875,8 @@ m->def("reset_insertion_point_to_end", []() { APIBuilder::Instance().ResetInsertionPointToEnd(); }); m->def("register_paddle_dialect", []() { - ir::IrContext::Instance() - ->GetOrRegisterDialect(); + pir::IrContext::Instance() + ->GetOrRegisterDialect(); }); m->def( "translate_to_new_ir", @@ -476,7 +924,7 @@ m->def( "check_unregistered_ops", [](const framework::ProgramDesc &legacy_program) { - ir::IrContext *ctx = ir::IrContext::Instance(); + pir::IrContext *ctx = pir::IrContext::Instance(); return paddle::translator::CheckUnregisteredOperation(ctx, legacy_program); }, @@ -516,13 +964,13 @@ void BindPassManager(pybind11::module *m) { .def( "__init__", [](PassManager &self, uint8_t opt_level) { - new (&self) PassManager(ir::IrContext::Instance(), opt_level); + new (&self) PassManager(pir::IrContext::Instance(), opt_level); }, py::arg("opt_level") = 2) .def("add_pass", [](PassManager &self, const std::string &pass_name) { self.AddPass( - std::move(ir::PassRegistry::Instance().Get(pass_name))); + std::move(pir::PassRegistry::Instance().Get(pass_name))); }) .def("passes", [](PassManager &self) { diff --git a/paddle/fluid/pybind/jit.cc b/paddle/fluid/pybind/jit.cc index a0e130f40cf64..69b32fca9cd75 100644 --- a/paddle/fluid/pybind/jit.cc +++ b/paddle/fluid/pybind/jit.cc @@ -258,12 +258,22 @@ static PyObject *_custom_eval_frame(PyThreadState *tstate, // Re-enable custom behavior eval_frame_callback_set(callback); VLOG(7) << "Start eval new frame and code."; - auto out = eval_custom_code(tstate, frame, code, throw_flag); + PyObject *out; + if (reinterpret_cast(code) != Py_None) { + out = eval_custom_code(tstate, frame, code, throw_flag); + } else { + out = eval_frame_default(tstate, frame, throw_flag); + } Py_DECREF(result); Py_DECREF(code); return out; } else { - auto out = eval_custom_code(tstate, frame, code, throw_flag); + PyObject *out; + if (reinterpret_cast(code) != Py_None) { + out = eval_custom_code(tstate, frame, code, throw_flag); + } else { + out = eval_frame_default(tstate, frame, throw_flag); + } // Re-enable custom behavior eval_frame_callback_set(callback); Py_DECREF(result); diff --git a/paddle/fluid/pybind/manual_static_op_function.h b/paddle/fluid/pybind/manual_static_op_function.h new file mode 100644 index
0000000000000..ff365e63bb652 --- /dev/null +++ b/paddle/fluid/pybind/manual_static_op_function.h @@ -0,0 +1,89 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/pir/dialect/operator/ir/manual_api.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/op_function_common.h" +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/enforce.h" + +namespace paddle { + +namespace pybind { +static PyObject *static_api_get_parameter(PyObject *self, + PyObject *args, + PyObject *kwargs) { + try { + VLOG(6) << "Add get_parameter op into program"; + VLOG(8) << "args count: " << (PyTuple_Size(args) / 2); + + // Parse Attributes + PyObject *name_obj = PyTuple_GET_ITEM(args, 0); + std::string name = CastPyArg2String(name_obj, "name", 0); + PyObject *dtype_obj = PyTuple_GET_ITEM(args, 1); + phi::DataType dtype = CastPyArg2DataTypeDirectly(dtype_obj, "dtype", 1); + PyObject *shape_obj = PyTuple_GET_ITEM(args, 2); + phi::IntArray shape = CastPyArg2IntArray(shape_obj, "shape", 2); + // Call ir static api + auto static_api_out = + paddle::dialect::get_parameter(name, dtype, shape.GetData()); + + return ToPyObject(static_api_out); + } catch (...) { + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + +static PyObject *static_api_set_parameter(PyObject *self, + PyObject *args, + PyObject *kwargs) { + try { + VLOG(6) << "Add set_parameter op into program"; + VLOG(8) << "args count: " << (PyTuple_Size(args) / 2); + + // Get OpResult from args + PyObject *parameter_obj = PyTuple_GET_ITEM(args, 0); + auto parameter = CastPyArg2OpResult(parameter_obj, "parameter", 0); + + // Parse Attributes + PyObject *name_obj = PyTuple_GET_ITEM(args, 1); + std::string name = CastPyArg2String(name_obj, "name", 1); + // Call ir static api + paddle::dialect::set_parameter(parameter, name); + + Py_RETURN_NONE; + } catch (...) 
{ + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + +static PyMethodDef ManualOpsAPI[] = { + {"set_parameter", + (PyCFunction)(void (*)(void))static_api_set_parameter, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for set_parameter."}, + {"get_parameter", + (PyCFunction)(void (*)(void))static_api_get_parameter, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for get_parameter."}, + {nullptr, nullptr, 0, nullptr}}; + +} // namespace pybind + +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 266578615e352..a1e22b94ce192 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -34,6 +34,8 @@ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/phi/common/complex.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace pybind { @@ -829,6 +831,54 @@ void CastPyArg2AttrBlock(PyObject* obj, attrs[key] = reinterpret_cast(vh[0]); } +void CastPyArg2AttrIRBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + VLOG(1) << "After Process pir::Block*"; + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)obj; // NOLINT + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + attrs[key] = reinterpret_cast<::pir::Block*&>(vh[0]); +} + +void CastPyArg2AttrValues(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + std::vector<::pir::Value> results; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + // TODO(xiongkun): judge OpResult or Value; + item = PyList_GetItem(obj, i); + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)item; // NOLINT + void** vh = inst->simple_layout ? 
inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + ::pir::OpResult* opresult = reinterpret_cast<::pir::OpResult*>(vh[0]); + if (opresult->impl() == nullptr) { + results.emplace_back(pir::Value(nullptr)); + } else { + results.emplace_back(pir::Value(opresult->Value::impl())); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "a list of OpResult or Value, but got %s", + op_type, + arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + attrs[key] = results; + VLOG(1) << "Pybind: Cast " << results.size() << " Values finished."; +} + void ConstructAttrMapFromPyArgs( const std::string& op_type, PyObject* args, @@ -847,6 +897,7 @@ PyObject* obj = nullptr; for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + VLOG(1) << "Start Process " << arg_pos; Py_ssize_t key_len; const char* key_ptr; obj = PyTuple_GET_ITEM(args, arg_pos); @@ -862,6 +913,7 @@ } std::string key(key_ptr, (size_t)key_len); // NOLINT + VLOG(1) << "Start Process " << key; auto iter = attr_type_map->find(key); if (iter == attr_type_map->end()) { continue; } @@ -921,6 +973,77 @@ } } +void ConstructAttrMapForRunProgram( + const std::string& op_type, + PyObject* args, + ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs) { // NOLINT + PADDLE_ENFORCE_EQ((attr_end - attr_start) % 2, + 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even " + "but attr_start = %d, attr_end = %d.", + attr_start, + attr_end)); + + PyObject* obj = nullptr; + for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + VLOG(1) << "Start Process " << arg_pos; + Py_ssize_t key_len; + const char* key_ptr; + obj = PyTuple_GET_ITEM(args, arg_pos); + if (PyObject_CheckString(obj)) { + key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be str, but got " + "%s", + op_type, + arg_pos, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + std::string key(key_ptr, (size_t)key_len); // NOLINT + VLOG(1) << "Start Process " << key; + obj = PyTuple_GET_ITEM(args, arg_pos + 1); + + if (std::set({"cuda_graph_capture_mode"}).count(key)) { + CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"global_block", + "forward_global_block", + "backward_global_block"}) + .count(key)) { + CastPyArg2AttrIRBlock(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"is_test", "use_interpretorcore"}) + .count(key)) { + CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"start_op_index", + "end_op_index", + "program_id", + "cuda_graph_pool_id"}) + .count(key)) { + CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"fx", + "fp", + "fm", + "fo", + "bx", + "bp", + "bm", + "bo_g", + "bx_g", + "bp_g", + "bo"}) + .count(key)) { + CastPyArg2AttrValues(obj, attrs, key, op_type, arg_pos); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s is not defined in this function.", key)); // NOLINT + } + } +} + unsigned long GetUnsignedLongFromArgs( // NOLINT const std::string& op_type, const std::string& arg_name, diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index a3f4960bbd58b..2d02dd6fb784d 100644 ---
a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -194,6 +194,13 @@ void ConstructAttrMapFromPyArgs( ssize_t attr_end, paddle::framework::AttributeMap& attrs); // NOLINT +void ConstructAttrMapForRunProgram( + const std::string& op_type, + PyObject* args, + ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs); // NOLINT + unsigned long GetUnsignedLongFromArgs( // NOLINT const std::string& op_type, const std::string& arg_name, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 056c4b0daadfc..9d1cd87280179 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -195,17 +195,18 @@ limitations under the License. */ #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/nan_inf_utils.h" #include "paddle/fluid/imperative/layout_autotune.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/interface/vjp.h" +#include "paddle/fluid/pir/dialect/operator/interface/vjp.h" +#include "paddle/fluid/pir/dialect/operator/trait/custom_vjp.h" #include "paddle/fluid/prim/utils/eager/eager_tensor_operants.h" #include "paddle/fluid/prim/utils/static/static_tensor_operants.h" #include "paddle/fluid/pybind/eager_utils.h" -#include "paddle/ir/core/program.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/include/operants_manager.h" #include "paddle/phi/api/include/tensor_operants.h" #include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/autotune/cache.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "paddle/pir/core/program.h" #include "pybind11/stl.h" PHI_DECLARE_bool(use_mkldnn); @@ -676,7 +677,7 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { string::join_strings(ops, ','))); } -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static int GetNCCLVersion() { #if NCCL_VERSION_CODE >= 2304 int ver; @@ -692,19 +693,19 @@ static int GetNCCLVersion() { void BindVjp(pybind11::module *m) { m->def( "call_vjp", - [](ir::Operation &fwd_op, - const std::vector> &out_grads, + [](pir::Operation &fwd_op, + const std::vector> &out_grads, const std::vector> &stop_gradients) { py::list res; - ir::IrContext *ctx = ir::IrContext::Instance(); - ir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); auto vjp_interface_impl = fwd_op_info.GetInterfaceImpl(); if (vjp_interface_impl == nullptr) { PADDLE_THROW(phi::errors::InvalidArgument( "The vjp function is not registered in %s op ", fwd_op.name())); } - std::vector> vjp_res = + std::vector> vjp_res = vjp_interface_impl->vjp_(&fwd_op, out_grads, stop_gradients); PADDLE_ENFORCE_EQ( stop_gradients.size(), @@ -743,14 +744,29 @@ void BindVjp(pybind11::module *m) { return res; }); - m->def("has_vjp", [](ir::Operation &fwd_op) { - ir::IrContext *ctx = ir::IrContext::Instance(); - ir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); + m->def("has_vjp", [](pir::Operation &fwd_op) { + pir::IrContext *ctx = pir::IrContext::Instance(); + pir::OpInfo fwd_op_info = ctx->GetRegisteredOpInfo(fwd_op.name()); auto vjp_interface_impl = fwd_op_info.GetInterfaceImpl(); if (vjp_interface_impl == nullptr) return false; return true; }); + + m->def( + "has_custom_vjp", + [](pir::Operation &op) -> py::bool_ { + return op.info().HasTrait(); + }, + R"DOC( + Return whether an op has custom vjp rules. 
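The `has_custom_vjp` binding above lost its template argument to angle-bracket stripping; a plausible reconstruction of it, assuming the trait declared in the newly included custom_vjp.h is named `CustomVjpTrait`, is:

```cpp
// Sketch only: CustomVjpTrait is inferred from the custom_vjp.h include;
// the stripped call is assumed to be op.info().HasTrait<...>().
void BindHasCustomVjp(pybind11::module *m) {
  m->def("has_custom_vjp", [](pir::Operation &op) -> bool {
    // True iff the op's OpInfo registered the custom-vjp trait.
    return op.info().HasTrait<paddle::dialect::CustomVjpTrait>();
  });
}
```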
+ + Args: + op (pir::Operation): op to be checked + + Returns: + out (bool): True means that the op has custom vjp rules, False means it does not. + )DOC"); } PYBIND11_MODULE(libpaddle, m) { BindImperative(&m); @@ -872,7 +888,7 @@ PYBIND11_MODULE(libpaddle, m) { }); #endif -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) m.def("nccl_version", &GetNCCLVersion); #endif @@ -1239,11 +1255,15 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. code-block:: python - # create tensor from a scope and set value to it. - param = scope.var('Param').get_tensor() - param_array = np.full((height, row_numel), 5.0).astype("float32") - param.set(param_array, place) + >>> import paddle + >>> import numpy as np + >>> scope = paddle.static.global_scope() + >>> place = paddle.CPUPlace() + >>> # create tensor from a scope and set value to it. + >>> param = scope.var('Param').get_tensor() + >>> param_array = np.full((10, 12), 5.0).astype("float32") + >>> param.set(param_array, place) )DOC"); g_framework_scope_pytype = reinterpret_cast(_Scope.ptr()); _Scope @@ -1983,7 +2003,7 @@ All parameter, weight, gradient are variables in Paddle. py::init< const std::vector> &, const std::unordered_map> &>(), + std::shared_ptr<::pir::Program>> &>(), py::arg("job_list"), py::arg("type_to_ir_program")) .def("job_list", &framework::interpreter::Plan::JobList) @@ -2148,9 +2168,8 @@ All parameter, weight, gradient are variables in Paddle. Examples: .. code-block:: python - import paddle.base as base - - arr = base.LoDTensorArray() + >>> import paddle + >>> arr = paddle.framework.core.LoDTensorArray() )DOC"); g_framework_lodtensorarray_pytype = reinterpret_cast(pylodtensorarray.ptr()); @@ -2190,15 +2209,15 @@ All parameter, weight, gradient are variables in Paddle. None. Examples: - .. code-block:: python + .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - arr = base.LoDTensorArray() - t = base.LoDTensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - arr.append(t) + >>> arr = paddle.framework.core.LoDTensorArray() + >>> t = paddle.framework.core.LoDTensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> arr.append(t) )DOC") .def( "_move_to_list", diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index b3edc9575223d..95e217365be3d 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -393,11 +393,11 @@ void BindTensor(pybind11::module &m) { // NOLINT Examples: .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) )DOC") .def( @@ -411,14 +411,15 @@ void BindTensor(pybind11::module &m) { // NOLINT Examples: - .. code-block:: python + .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - print(t.shape()) # [5, 30] + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> print(t.shape()) + [5, 30] )DOC") .def("_to_dlpack", [](phi::DenseTensor &self) { @@ -515,15 +516,16 @@ void BindTensor(pybind11::module &m) { // NOLINT None. Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> t.set_lod([[0, 2, 5]]) + >>> print(t.lod()) + [[0, 2, 5]] )DOC") .def( "set_recursive_sequence_lengths", @@ -564,16 +566,18 @@ void BindTensor(pybind11::module &m) { // NOLINT None. Examples: - .. code-block:: python - - import paddle.base as base - import numpy as np - - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] - print(t.lod()) # [[0, 2, 5]] + .. code-block:: python + + >>> import paddle + >>> import numpy as np + + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> t.set_recursive_sequence_lengths([[2, 3]]) + >>> print(t.recursive_sequence_lengths()) + [[2, 3]] + >>> print(t.lod()) + [[0, 2, 5]] )DOC") .def( "lod", @@ -592,15 +596,16 @@ void BindTensor(pybind11::module &m) { // NOLINT list[list[int]]: The lod of the Tensor. Examples: - .. code-block:: python + .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - t.set_lod([[0, 2, 5]]) - print(t.lod()) # [[0, 2, 5]] + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> t.set_lod([[0, 2, 5]]) + >>> print(t.lod()) + [[0, 2, 5]] )DOC") // Set above comments of set_lod. .def( @@ -621,15 +626,16 @@ void BindTensor(pybind11::module &m) { // NOLINT list[list[int]]: The recursive sequence lengths. Examples: - .. code-block:: python + .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_lengths()) # [[2, 3]] + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> t.set_recursive_sequence_lengths([[2, 3]]) + >>> print(t.recursive_sequence_lengths()) + [[2, 3]] )DOC") .def( "has_valid_recursive_sequence_lengths", @@ -645,15 +651,16 @@ void BindTensor(pybind11::module &m) { // NOLINT bool: Whether the LoD is valid. Examples: - .. code-block:: python + .. code-block:: python - import paddle.base as base - import numpy as np + >>> import paddle + >>> import numpy as np - t = base.Tensor() - t.set(np.ndarray([5, 30]), base.CPUPlace()) - t.set_recursive_sequence_lengths([[2, 3]]) - print(t.has_valid_recursive_sequence_lengths()) # True + >>> t = paddle.framework.core.Tensor() + >>> t.set(np.ndarray([5, 30]), paddle.CPUPlace()) + >>> t.set_recursive_sequence_lengths([[2, 3]]) + >>> print(t.has_valid_recursive_sequence_lengths()) + True )DOC") .def("_as_type", [](const phi::DenseTensor &self, @@ -773,12 +780,12 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor dims, lod information, device index. Examples: - .. code-block:: python + .. 
code-block:: python - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() + >>> import paddle + >>> tensor = paddle.ones([3,3]) + >>> metainfo = tensor.value().get_tensor()._share_cuda() )DOC") .def("_new_shared_cuda", [](py::tuple t) { @@ -819,13 +826,13 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor dims, lod information, device index. Examples: - .. code-block:: python + .. code-block:: python - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_cuda() - tensor_from_shared = paddle.to_tensor(paddle.base.core.LoDTensor._new_shared_cuda(metainfo)) + >>> import paddle + >>> tensor = paddle.ones([3,3]) + >>> metainfo = tensor.value().get_tensor()._share_cuda() + >>> tensor_from_shared = paddle.to_tensor(paddle.base.core.LoDTensor._new_shared_cuda(metainfo)) )DOC") #endif .def("_share_filename", @@ -896,12 +903,12 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor dims and lod imformation. Examples: - .. code-block:: python + .. code-block:: python - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() + >>> import paddle + >>> tensor = paddle.ones([3,3]) + >>> metainfo = tensor.value().get_tensor()._share_filename() )DOC") .def("_new_shared_filename", [](py::tuple t) { // __setstate__ @@ -940,13 +947,13 @@ void BindTensor(pybind11::module &m) { // NOLINT tensor dims and lod information. Examples: - .. code-block:: python + .. code-block:: python - import paddle - tensor = paddle.ones([3,3]) - metainfo = tensor.value().get_tensor()._share_filename() - tensor_from_shared = paddle.to_tensor(paddle.base.core.LoDTensor._new_shared_filename(metainfo)) + >>> import paddle + >>> tensor = paddle.ones([3,3]) + >>> metainfo = tensor.value().get_tensor()._share_filename() + >>> tensor_from_shared = paddle.to_tensor(paddle.base.core.LoDTensor._new_shared_filename(metainfo)) )DOC") .def("_shared_incref", [](phi::DenseTensor &self) { diff --git a/paddle/ir/core/CMakeLists.txt b/paddle/ir/core/CMakeLists.txt deleted file mode 100644 index 138b102fcbd89..0000000000000 --- a/paddle/ir/core/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(NEWIR_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/ir") -set(NEWIR_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/ir") - -file(GLOB IR_SRCS "*.cc") - -file(GLOB IR_PARSER_SRCS "parser/*.cc") - -list(APPEND IR_SRCS ${IR_PARSER_SRCS}) - -ir_library(ir_core SRCS ${IR_SRCS} DEPS ddim) diff --git a/paddle/ir/core/op_base.h b/paddle/ir/core/op_base.h deleted file mode 100644 index 0a491795d4eed..0000000000000 --- a/paddle/ir/core/op_base.h +++ /dev/null @@ -1,249 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
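The deleted header below (relocated from paddle/ir to paddle/pir by this PR) defines the CRTP bases for ops, traits, and interfaces. A minimal sketch of how a trait built on `OpTraitBase` is queried, using a hypothetical `ReadOnlyTrait` and the post-rename `pir::` spelling:

```cpp
// Hypothetical trait for illustration; real traits are listed in an op's
// trait tuple and registered via GetTraitSet().
struct ReadOnlyTrait : public pir::OpTraitBase<ReadOnlyTrait> {
  explicit ReadOnlyTrait(pir::Operation *op) : OpTraitBase(op) {}
};

void Inspect(pir::Operation *op) {
  // dyn_cast yields an engaged handle only if op->HasTrait<ReadOnlyTrait>().
  if (auto t = ReadOnlyTrait::dyn_cast(op)) {
    VLOG(6) << t->name() << " is read-only";
  }
}
```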
- -#pragma once -#include - -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/utils.h" - -namespace ir { - -class IR_API InterfaceValue { - public: - template - static InterfaceValue get() { - InterfaceValue val; - val.type_id_ = TypeId::get(); - val.model_ = malloc(sizeof(typename T::template Model)); - if (val.model_ == nullptr) { - throw("Alloc memory for interface failed."); - } - static_assert(std::is_trivially_destructible< - typename T::template Model>::value, - "interface models must be trivially destructible"); - new (val.model_) typename T::template Model(); - return val; - } - TypeId type_id() const { return type_id_; } - void *model() const { return model_; } - - InterfaceValue() = default; - explicit InterfaceValue(TypeId type_id) : type_id_(type_id) {} - InterfaceValue(const InterfaceValue &) = delete; - InterfaceValue(InterfaceValue &&) noexcept; - InterfaceValue &operator=(const InterfaceValue &) = delete; - InterfaceValue &operator=(InterfaceValue &&) noexcept; - ~InterfaceValue(); - void swap(InterfaceValue &&val) { - using std::swap; - swap(type_id_, val.type_id_); - swap(model_, val.model_); - } - - /// - /// \brief Comparison operations. - /// - inline bool operator<(const InterfaceValue &other) const { - return type_id_ < other.type_id_; - } - - private: - TypeId type_id_; - void *model_{nullptr}; -}; - -class IR_API OpBase { - public: - explicit OpBase(Operation *operation = nullptr) : operation_(operation) {} - - Operation *operation() const { - IR_ENFORCE(operation_, "Can't use operation() in a null op."); - return operation_; - } - - explicit operator bool() const { return operation_ != nullptr; } - - operator Operation *() const { return operation(); } - - Operation *operator->() const { return operation(); } - - IrContext *ir_context() const { return operation()->ir_context(); } - - uint32_t num_results() const { return operation()->num_results(); } - - uint32_t num_operands() const { return operation()->num_operands(); } - - const AttributeMap &attributes() const { return operation()->attributes(); } - - Value operand_source(uint32_t index) const { - return operation()->operand_source(index); - } - - OpResult result(uint32_t index) const { return operation()->result(index); } - - ir::Attribute attribute(const std::string &name) { - return operation()->attribute(name); - } - - template - T attribute(const std::string &name) { - return operation()->attribute(name); - } - - private: - Operation *operation_; // Not owned -}; - -/// -/// \brief OpTrait -/// -template -class OpTraitBase : public OpBase { - public: - explicit OpTraitBase(Operation *op) : OpBase(op) {} - - static TypeId GetTraitId() { return TypeId::get(); } - - static ConcreteTrait dyn_cast(Operation *op) { - if (op && op->HasTrait()) { - return ConcreteTrait(op); - } - return ConcreteTrait(nullptr); - } -}; - -/// -/// \brief OpInterface -/// -template -class OpInterfaceBase : public OpBase { - public: - explicit OpInterfaceBase(Operation *op) : OpBase(op) {} - - static TypeId GetInterfaceId() { return TypeId::get(); } - - static ConcreteInterface dyn_cast(Operation *op) { - if (op && op->HasInterface()) { - return ConcreteInterface( - op, op->info().GetInterfaceImpl()); - } - return ConcreteInterface(nullptr, nullptr); - } -}; - -template -class ConstructInterfacesOrTraits { - public: - /// Construct method for interfaces. 
- static InterfaceValue *interface(InterfaceValue *p_interface) { - (void)std::initializer_list{ - 0, (PlacementConstrctInterface(p_interface), 0)...}; - return p_interface; - } - - /// Construct method for traits. - static TypeId *trait(TypeId *p_trait) { - (void)std::initializer_list{ - 0, (PlacementConstrctTrait(p_trait), 0)...}; - return p_trait; - } - - private: - /// Placement new interface. - template - static void PlacementConstrctInterface( - InterfaceValue *&p_interface) { // NOLINT - p_interface->swap(InterfaceValue::get()); - VLOG(6) << "New a interface: id[" - << (p_interface->type_id()).AsOpaquePointer() << "]."; - ++p_interface; - } - - /// Placement new trait. - template - static void PlacementConstrctTrait(ir::TypeId *&p_trait) { // NOLINT - *p_trait = TypeId::get(); - VLOG(6) << "New a trait: id[" << p_trait->AsOpaquePointer() << "]."; - ++p_trait; - } -}; - -/// Specialized for tuple type. -template -class ConstructInterfacesOrTraits> { - public: - /// Construct method for interfaces. - static InterfaceValue *interface(InterfaceValue *p_interface) { - return ConstructInterfacesOrTraits::interface( - p_interface); - } - - /// Construct method for traits. - static TypeId *trait(TypeId *p_trait) { - return ConstructInterfacesOrTraits::trait(p_trait); - } -}; - -template -class Op : public OpBase { - public: - using OpBase::OpBase; - - using TraitList = - typename Filter>::Type; - - using InterfaceList = - typename Filter>::Type; - - static ConcreteOp dyn_cast(Operation *op) { - if (op && op->info().id() == TypeId::get()) { - return ConcreteOp(op); - } - return ConcreteOp(nullptr); - } - - static bool classof(const Operation *op) { - return op && op->info().id() == TypeId::get(); - } - - static std::vector GetInterfaceMap() { - constexpr size_t interfaces_num = std::tuple_size::value; - std::vector interfaces_map(interfaces_num); - ConstructInterfacesOrTraits::interface( - interfaces_map.data()); - return interfaces_map; - } - - static std::vector GetTraitSet() { - constexpr size_t traits_num = std::tuple_size::value; - std::vector trait_set(traits_num); - auto p_first_trait = trait_set.data(); - ConstructInterfacesOrTraits::trait(p_first_trait); - return trait_set; - } - static constexpr bool HasNoDataMembers() { - class EmptyOp : public Op {}; - return sizeof(ConcreteOp) == sizeof(EmptyOp); - } - - static void VerifyInvariants(Operation *op) { - static_assert(HasNoDataMembers(), - "Op class shouldn't define new data members"); - op->dyn_cast().Verify(); - } -}; - -} // namespace ir diff --git a/paddle/ir/core/type.h b/paddle/ir/core/type.h deleted file mode 100644 index f27503b3731f4..0000000000000 --- a/paddle/ir/core/type.h +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
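type.h, deleted below, keeps `Type` as a value-semantic handle over a uniqued `TypeStorage*`, so comparison, hashing, and isa/dyn_cast all reduce to pointer operations. A short sketch of what that enables (`DenseTensorType` stands in for any registered subtype and is hypothetical here):

```cpp
#include <unordered_map>

void TypeHandleDemo(ir::Type a, ir::Type b) {
  if (a == b) {
    // Same uniqued storage, hence the same type instance.
  }
  std::unordered_map<ir::Type, int> seen;  // enabled by std::hash<ir::Type>
  ++seen[a];
  // dyn_cast is engaged only when the storage's TypeId matches the subtype.
  if (auto dense = a.dyn_cast<ir::DenseTensorType>()) {  // hypothetical subtype
    (void)dense;
  }
}
```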
- -#pragma once - -#include - -#include "paddle/ir/core/cast_utils.h" -#include "paddle/ir/core/type_id.h" - -namespace ir { -class TypeStorage; -class AbstractType; -class IrContext; -class Dialect; -/// -/// \brief Unified interface of the Type class. Derivation of all Type classes -/// only derives interfaces, not members. For example, DenseTensorType, -/// Float32Type, etc. are all derived classes of Type, but no new member -/// variables will be added. -/// -class IR_API Type { - public: - using Storage = TypeStorage; - - Type() = default; - - Type(const Storage *storage) // NOLINT - : storage_(const_cast(storage)) {} - - Type(const Type &other) = default; - - Type &operator=(const Type &other) = default; - - /// - /// \brief Some operators are overloaded. - /// - bool operator==(Type other) const { return storage_ == other.storage_; } - - bool operator!=(Type other) const { return storage_ != other.storage_; } - - explicit operator bool() const { return storage_; } - - bool operator!() const { return storage_ == nullptr; } - - /// - /// \brief Some type attribute acquisition interfaces. - /// - TypeId type_id(); - - const AbstractType &abstract_type(); - - const Storage *storage() const { return storage_; } - - Dialect &dialect() const; - - IrContext *ir_context() const; - - /// - /// \brief Methods for type judgment and cast. - /// - static bool classof(Type) { return true; } - - template - bool isa() const { - return ir::isa(*this); - } - - template - U dyn_cast() const { - return ir::dyn_cast(*this); - } - - void Print(std::ostream &os) const; - - static Type Parse(std::istream &is, IrContext *ctx); - - /// - /// \brief Enable hashing Type. - /// - friend struct std::hash; - - protected: - const Storage *storage_{nullptr}; -}; - -IR_API std::ostream &operator<<(std::ostream &os, Type type); - -} // namespace ir - -/// -/// \brief This class represents the base of a type interface. -/// - -// template -// class TypeInterface : public ir::DialectInterface { -// public: -// using Base = TypeInterface; -// using DialectInterfaceBase = ir::DialectInterface; -// using DialectInterfaceBase::Base; - -// private: -// /// Returns the impl interface instance for the given type. -// static typename InterfaceBase::Concept *getInterfaceFor(Type type) { -// return type.getAbstractType().getInterface(); -// } - -// /// Allow access to 'getInterfaceFor'. -// friend InterfaceBase; -// }; - -namespace std { -/// -/// \brief Enable hashing Type. -/// -template <> -struct hash { - std::size_t operator()(const ir::Type &obj) const { - return std::hash()(obj.storage_); - } -}; -} // namespace std diff --git a/paddle/ir/core/value.cc b/paddle/ir/core/value.cc deleted file mode 100644 index c652ef23a6dde..0000000000000 --- a/paddle/ir/core/value.cc +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
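value.cc, deleted below, maintains each Value's uses as an intrusive singly linked list of OpOperandImpls, so use iteration and bulk retargeting need no side tables. A sketch against the public API it implements:

```cpp
#include <cassert>
#include <cstddef>

void RetargetUses(ir::Value old_val, ir::Value new_val) {
  // Walks the use-def chain and re-links every operand to new_val.
  old_val.ReplaceAllUsesWith(new_val);
  assert(old_val.use_empty());
}

size_t CountUses(ir::Value v) {
  size_t n = 0;  // mirrors Value::use_count()
  for (auto it = v.use_begin(); it != v.use_end(); ++it) ++n;
  return n;
}
```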
- -#include "paddle/ir/core/value.h" - -#include - -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/value_impl.h" - -#define CHECK_NULL_IMPL(class_name, func_name) \ - IR_ENFORCE(impl_, \ - "impl_ pointer is null when call func:" #func_name \ - " , in class: " #class_name ".") - -#define CHECK_OPOPEREND_NULL_IMPL(func_name) \ - CHECK_NULL_IMPL(OpOpernad, func_name) - -#define CHECK_VALUE_NULL_IMPL(func_name) CHECK_NULL_IMPL(Value, func_name) - -#define CHECK_OPRESULT_NULL_IMPL(func_name) CHECK_NULL_IMPL(OpResult, func_name) -namespace ir { - -// Operand -OpOperand::OpOperand(const detail::OpOperandImpl *impl) - : impl_(const_cast(impl)) {} - -OpOperand &OpOperand::operator=(const OpOperand &rhs) { - if (this == &rhs) return *this; - impl_ = rhs.impl_; - return *this; -} - -OpOperand &OpOperand::operator=(const detail::OpOperandImpl *impl) { - if (this->impl_ == impl) return *this; - impl_ = const_cast(impl); - return *this; -} -OpOperand::operator bool() const { return impl_ && impl_->source(); } - -OpOperand OpOperand::next_use() const { - CHECK_OPOPEREND_NULL_IMPL(next_use); - return impl_->next_use(); -} - -Value OpOperand::source() const { - CHECK_OPOPEREND_NULL_IMPL(source); - return impl_->source(); -} - -Type OpOperand::type() const { return source().type(); } - -void OpOperand::set_source(Value value) { - CHECK_OPOPEREND_NULL_IMPL(set_source); - impl_->set_source(value); -} - -Operation *OpOperand::owner() const { - CHECK_OPOPEREND_NULL_IMPL(owner); - return impl_->owner(); -} - -void OpOperand::RemoveFromUdChain() { - CHECK_OPOPEREND_NULL_IMPL(RemoveFromUdChain); - return impl_->RemoveFromUdChain(); -} - -// Value -Value::Value(const detail::ValueImpl *impl) - : impl_(const_cast(impl)) {} - -bool Value::operator==(const Value &other) const { - return impl_ == other.impl_; -} - -bool Value::operator!=(const Value &other) const { - return impl_ != other.impl_; -} - -bool Value::operator!() const { return impl_ == nullptr; } - -Value::operator bool() const { return impl_; } - -ir::Type Value::type() const { - CHECK_VALUE_NULL_IMPL(type); - return impl_->type(); -} - -void Value::set_type(ir::Type type) { - CHECK_VALUE_NULL_IMPL(set_type); - impl_->set_type(type); -} - -Operation *Value::GetDefiningOp() const { - if (auto result = dyn_cast()) return result.owner(); - return nullptr; -} - -std::string Value::PrintUdChain() { - CHECK_VALUE_NULL_IMPL(PrintUdChain); - return impl()->PrintUdChain(); -} - -Value::UseIterator Value::use_begin() const { - return ir::OpOperand(first_use()); -} - -Value::UseIterator Value::use_end() const { return Value::UseIterator(); } - -OpOperand Value::first_use() const { - CHECK_VALUE_NULL_IMPL(first_use); - return impl_->first_use(); -} - -bool Value::use_empty() const { return !first_use(); } - -bool Value::HasOneUse() const { - CHECK_VALUE_NULL_IMPL(HasOneUse); - return impl_->HasOneUse(); -} - -size_t Value::use_count() const { - size_t count = 0; - for (auto it = use_begin(); it != use_end(); ++it) count++; - return count; -} - -void Value::ReplaceUsesWithIf( - Value new_value, - const std::function &should_replace) const { - for (auto it = use_begin(); it != use_end();) { - if (should_replace(*it)) { - (it++)->set_source(new_value); - } - } -} - -void Value::ReplaceAllUsesWith(Value new_value) const { - for (auto it = use_begin(); it != use_end();) { - (it++)->set_source(new_value); - } -} - -// OpResult -bool OpResult::classof(Value value) { - return value && ir::isa(value.impl()); -} - 
-Operation *OpResult::owner() const { - CHECK_OPRESULT_NULL_IMPL(owner); - return impl()->owner(); -} - -uint32_t OpResult::GetResultIndex() const { - CHECK_OPRESULT_NULL_IMPL(GetResultIndex); - return impl()->GetResultIndex(); -} - -detail::OpResultImpl *OpResult::impl() const { - return reinterpret_cast(impl_); -} - -bool OpResult::operator==(const OpResult &other) const { - return impl_ == other.impl_; -} - -detail::ValueImpl *OpResult::value_impl() const { - IR_ENFORCE(impl_, "Can't use value_impl() interface while value is null."); - return impl_; -} - -uint32_t OpResult::GetValidInlineIndex(uint32_t index) { - uint32_t max_inline_index = - ir::detail::OpResultImpl::GetMaxInlineResultIndex(); - return index <= max_inline_index ? index : max_inline_index; -} - -// details -namespace detail { -ir::Operation *OpOperandImpl::owner() const { return owner_; } - -ir::detail::OpOperandImpl *OpOperandImpl::next_use() { return next_use_; } - -ir::Value OpOperandImpl::source() const { return source_; } - -void OpOperandImpl::set_source(Value source) { - RemoveFromUdChain(); - if (!source) { - return; - } - source_ = source; - InsertToUdChain(); -} - -OpOperandImpl::OpOperandImpl(ir::Value source, ir::Operation *owner) - : source_(source), owner_(owner) { - if (!source) { - return; - } - InsertToUdChain(); -} - -void OpOperandImpl::InsertToUdChain() { - prev_use_addr_ = source_.impl()->first_use_addr(); - next_use_ = source_.impl()->first_use(); - if (next_use_) { - next_use_->prev_use_addr_ = &next_use_; - } - source_.impl()->set_first_use(this); -} - -void OpOperandImpl::RemoveFromUdChain() { - if (!source_) return; - if (!prev_use_addr_) return; - if (prev_use_addr_ == source_.impl()->first_use_addr()) { - /// NOTE: In ValueImpl, first_use_offseted_by_index_ use lower three bits - /// storage index information, so need to be updated using the set_first_use - /// method here. - source_.impl()->set_first_use(next_use_); - } else { - *prev_use_addr_ = next_use_; - } - if (next_use_) { - next_use_->prev_use_addr_ = prev_use_addr_; - } - next_use_ = nullptr; - prev_use_addr_ = nullptr; - source_ = nullptr; -} - -OpOperandImpl::~OpOperandImpl() { RemoveFromUdChain(); } - -uint32_t ValueImpl::index() const { - uint32_t index = - reinterpret_cast(first_use_offseted_by_index_) & 0x07; - if (index < 6) return index; - return reinterpret_cast(const_cast(this)) - ->GetResultIndex(); -} - -std::string ValueImpl::PrintUdChain() { - std::stringstream result; - result << "Value[" << this << "] -> "; - OpOperandImpl *tmp = first_use(); - if (tmp) { - result << "OpOperand[" << reinterpret_cast(tmp) << "] -> "; - while (tmp->next_use() != nullptr) { - result << "OpOperand[" << reinterpret_cast(tmp->next_use()) - << "] -> "; - tmp = tmp->next_use(); - } - } - result << "nullptr"; - return result.str(); -} - -uint32_t OpResultImpl::GetResultIndex() const { - if (const auto *outline_result = ir::dyn_cast(this)) { - return outline_result->GetResultIndex(); - } - return ir::dyn_cast(this)->GetResultIndex(); -} - -OpResultImpl::~OpResultImpl() { assert(use_empty()); } - -ir::Operation *OpResultImpl::owner() const { - // For inline result, pointer offset index to obtain the address of op. - if (const auto *result = ir::dyn_cast(this)) { - result += result->GetResultIndex() + 1; - return reinterpret_cast( - const_cast(result)); - } - // For outline result, pointer offset outline_index to obtain the address of - // maximum inline result. 
- const OpOutlineResultImpl *outline_result = - (const OpOutlineResultImpl *)(this); - outline_result += - (outline_result->outline_index_ - GetMaxInlineResultIndex()); - // The offset of the maximum inline result distance op is - // GetMaxInlineResultIndex. - const auto *inline_result = - reinterpret_cast(outline_result); - inline_result += (GetMaxInlineResultIndex() + 1); - return reinterpret_cast( - const_cast(inline_result)); -} -} // namespace detail -} // namespace ir diff --git a/paddle/ir/core/value_impl.h b/paddle/ir/core/value_impl.h deleted file mode 100644 index 14a7b4d63f5d3..0000000000000 --- a/paddle/ir/core/value_impl.h +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/ir/core/value.h" - -namespace ir { -static const uint32_t OUTLINE_OP_RESULT_INDEX = 6; - -class Operation; - -namespace detail { -/// -/// \brief OpOperandImpl -/// -class OpOperandImpl { - public: - ir::Operation *owner() const; - - ir::detail::OpOperandImpl *next_use(); - - ir::Value source() const; - - void set_source(Value value); - - /// Remove this op_operand from the current use list. - void RemoveFromUdChain(); - - ~OpOperandImpl(); - - friend ir::Operation; - - private: - OpOperandImpl(ir::Value source, ir::Operation *owner); - - // Insert self to the UD chain holded by source_; - // It is not safe. So set private. - void InsertToUdChain(); - - ir::detail::OpOperandImpl *next_use_ = nullptr; - - ir::detail::OpOperandImpl **prev_use_addr_ = nullptr; - - ir::Value source_; - - ir::Operation *const owner_ = nullptr; -}; - -/// -/// \brief ValueImpl is the base class of all derived Value classes such as -/// OpResultImpl. This class defines all the information and usage interface in -/// the IR Value. Each Value include three attributes: -/// (1) type: ir::Type; (2) UD-chain of value: OpOperandImpl*, first op_operand -/// address with offset of this value; (3) index: the position where the output -/// list of the parent operator. -/// -class alignas(8) ValueImpl { - public: - /// - /// \brief Interface functions of "type_" attribute. - /// - ir::Type type() const { return type_; } - - void set_type(ir::Type type) { type_ = type; } - - /// - /// \brief Interface functions of "first_use_offseted_by_index_" attribute. - /// - uint32_t index() const; - - OpOperandImpl *first_use() const { - return reinterpret_cast( - reinterpret_cast(first_use_offseted_by_index_) & (~0x07)); - } - - void set_first_use(OpOperandImpl *first_use) { - uint32_t offset = index(); - first_use_offseted_by_index_ = reinterpret_cast( - reinterpret_cast(first_use) + offset); - VLOG(4) << "The index of this value is " << offset - << ". 
Offset and set first use: " << first_use << " -> " - << first_use_offseted_by_index_ << "."; - } - - OpOperandImpl **first_use_addr() { return &first_use_offseted_by_index_; } - - bool use_empty() const { return first_use() == nullptr; } - - bool HasOneUse() const { - return (first_use() != nullptr) && (first_use()->next_use() == nullptr); - } - - std::string PrintUdChain(); - - protected: - /// - /// \brief Only can be constructed by derived classes such as OpResultImpl. - /// - explicit ValueImpl(ir::Type type, uint32_t index) { - if (index > OUTLINE_OP_RESULT_INDEX) { - throw("The value of index must not exceed 6"); - } - type_ = type; - first_use_offseted_by_index_ = reinterpret_cast( - reinterpret_cast(nullptr) + index); - VLOG(4) << "Construct a ValueImpl whose's index is " << index - << ". The offset first_use address is: " - << first_use_offseted_by_index_; - } - - /// - /// \brief Attribute1: Type of value. - /// - ir::Type type_; - - /// - /// \brief Attribute2/3: Record the UD-chain of value and index. - /// NOTE: The members of the OpOperandImpl include four pointers, so this - /// class is 8-byte aligned, and the lower 3 bits of its address are 0, so the - /// index can be stored in these 3 bits, stipulate: - /// (1) index = 0~5: represent positions 0 to 5 inline - /// output(OpInlineResultImpl); (2) index = 6: represent the position >=6 - /// outline output(OpOutlineResultImpl); (3) index = 7 is reserved. - /// - OpOperandImpl *first_use_offseted_by_index_ = nullptr; -}; - -/// -/// \brief OpResultImpl is the implementation of an operation result. -/// -class alignas(8) OpResultImpl : public ValueImpl { - public: - using ValueImpl::ValueImpl; - - static bool classof(const ValueImpl &value) { return true; } - - /// - /// \brief Get the parent operation of this result.(op_ptr = value_ptr + - /// index) - /// - ir::Operation *owner() const; - - /// - /// \brief Get the result index of the operation result. - /// - uint32_t GetResultIndex() const; - - /// - /// \brief Get the maximum number of results that can be stored inline. - /// - static uint32_t GetMaxInlineResultIndex() { - return OUTLINE_OP_RESULT_INDEX - 1; - } - - ~OpResultImpl(); -}; - -/// -/// \brief OpInlineResultImpl is the implementation of an operation result whose -/// index <= 5. -/// -class OpInlineResultImpl : public OpResultImpl { - public: - OpInlineResultImpl(ir::Type type, uint32_t result_index) - : OpResultImpl(type, result_index) { - if (result_index > GetMaxInlineResultIndex()) { - throw("Inline result index should not exceed MaxInlineResultIndex(5)"); - } - } - - static bool classof(const OpResultImpl &value) { - return value.index() < OUTLINE_OP_RESULT_INDEX; - } - - uint32_t GetResultIndex() const { return index(); } -}; - -/// -/// \brief OpOutlineResultImpl is the implementation of an operation result -/// whose index > 5. 
-/// -class OpOutlineResultImpl : public OpResultImpl { - public: - OpOutlineResultImpl(ir::Type type, uint32_t outline_index) - : OpResultImpl(type, OUTLINE_OP_RESULT_INDEX), - outline_index_(outline_index) {} - - static bool classof(const OpResultImpl &value) { - return value.index() >= OUTLINE_OP_RESULT_INDEX; - } - - uint32_t GetResultIndex() const { return outline_index_; } - - uint32_t outline_index_; -}; - -} // namespace detail -} // namespace ir diff --git a/paddle/ir/dialect/control_flow/CMakeLists.txt b/paddle/ir/dialect/control_flow/CMakeLists.txt deleted file mode 100644 index 5a693ba156ccd..0000000000000 --- a/paddle/ir/dialect/control_flow/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -file(GLOB_RECURSE CONTROL_FLOW_SRCS "*.cc") -ir_library(ir_control_flow SRCS ${CONTROL_FLOW_SRCS} DEPS ir_core) diff --git a/paddle/ir/dialect/shape/CMakeLists.txt b/paddle/ir/dialect/shape/CMakeLists.txt deleted file mode 100644 index 62d7c0d42c85c..0000000000000 --- a/paddle/ir/dialect/shape/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -file(GLOB_RECURSE SHAPE_SRCS "*.cc") -ir_library(ir_shape SRCS ${SHAPE_SRCS} DEPS ir_core) diff --git a/paddle/ir/dialect/shape/ir/shape_op.cc b/paddle/ir/dialect/shape/ir/shape_op.cc deleted file mode 100644 index 776503ea269e3..0000000000000 --- a/paddle/ir/dialect/shape/ir/shape_op.cc +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
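shape_op.cc, deleted further below, gives `SymbolicDim::merge` three-way semantics: a hard value conflict (or a non-negative dim merged with a known -1) fails, otherwise the known-ness flags are OR-combined and a dynamic dim adopts the other side's static value. A usage sketch:

```cpp
void UnifyDims(ir::dialect::SymbolicDim lhs, ir::dialect::SymbolicDim rhs) {
  if (!lhs.merge(rhs)) {
    // e.g. lhs.getValue() == 4 while rhs.getValue() == 8:
    // the two dims can never be equal.
    VLOG(6) << "symbolic dims " << lhs.getSymName() << " and "
            << rhs.getSymName() << " are contradictory";
  }
  // On success both sides agree on the value and the known-ness flags.
}
```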
- -#include "paddle/ir/dialect/shape/ir/shape_op.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_type.h" - -namespace ir { -namespace dialect { - -const char *SymbolicDim::attributes_name[attributes_num] = {"knownNegativeOne", - "knownNonNegative", - "knownNonSizeOne", - "knownNonSizeZero", - "sym_name", - "value"}; // NOLINT - -void SymbolicDim::Build( - Builder &builder, - OperationArgument &argument, - const std::string &sym_name, - int64_t value, // TODO(zhangbo) value = ShapedType::kDynamic - bool knownNonNegative, - bool knownNegativeOne, - bool knownNonSizeOne, - bool knownNonSizeZero) { - ir::Attribute attr_sym_name = - ir::StrAttribute::get(ir::IrContext::Instance(), sym_name); - argument.AddAttribute("sym_name", attr_sym_name); - ir::Attribute attr_value = - ir::Int64Attribute::get(ir::IrContext::Instance(), value); - argument.AddAttribute("value", attr_value); - ir::Attribute attr_knownNonNegative = - ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonNegative); - argument.AddAttribute("knownNonNegative", attr_knownNonNegative); - ir::Attribute attr_knownNegativeOne = - ir::BoolAttribute::get(ir::IrContext::Instance(), knownNegativeOne); - argument.AddAttribute("knownNegativeOne", attr_knownNegativeOne); - ir::Attribute attr_knownNonSizeOne = - ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonSizeOne); - argument.AddAttribute("knownNonSizeOne", attr_knownNonSizeOne); - ir::Attribute attr_knownNonSizeZero = - ir::BoolAttribute::get(ir::IrContext::Instance(), knownNonSizeZero); - argument.AddAttribute("knownNonSizeZero", attr_knownNonSizeZero); -} - -const std::string SymbolicDim::getSymName() { - return attribute("sym_name").AsString(); -} -int64_t SymbolicDim::getValue() { - return attribute("value").data(); -} -bool SymbolicDim::getKnownNonNegative() { - return attribute("knownNonNegative").data(); -} -bool SymbolicDim::getKnownNegativeOne() { - return attribute("knownNegativeOne").data(); -} -bool SymbolicDim::getKnownNonSizeOne() { - return attribute("knownNonSizeOne").data(); -} -bool SymbolicDim::getKnownNonSizeZero() { - return attribute("knownNonSizeZero").data(); -} - -void SymbolicDim::updateSymName(std::string attrValue) { - operation()->set_attribute( - "sym_name", ir::StrAttribute::get(ir::IrContext::Instance(), attrValue)); -} -void SymbolicDim::updateValue(int64_t attrValue) { - operation()->set_attribute( - "value", ir::Int64Attribute::get(ir::IrContext::Instance(), attrValue)); -} - -void SymbolicDim::updateKnownNonNegative(bool attrValue) { - operation()->set_attribute( - "knownNonNegative", - ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); -} -void SymbolicDim::updateKnownNegativeOne(bool attrValue) { - operation()->set_attribute( - "knownNegativeOne", - ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); -} -void SymbolicDim::updateKnownNonSizeOne(bool attrValue) { - operation()->set_attribute( - "knownNonSizeOne", - ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); -} -void SymbolicDim::updateKnownNonSizeZero(bool attrValue) { - operation()->set_attribute( - "knownNonSizeZero", - ir::BoolAttribute::get(ir::IrContext::Instance(), attrValue)); -} - -bool SymbolicDim::isDynamic() { - return getValue() == -100000; -} // TODO(zhangbo): getValue() == ShapedType::kDynamic; - -bool SymbolicDim::merge(SymbolicDim other) { - if (!isDynamic() && !other.isDynamic() && getValue() != other.getValue()) - return false; - if (isDynamic() && !other.isDynamic()) 
updateValue(other.getValue()); - if (!isDynamic() && other.isDynamic()) other.updateValue(getValue()); - - bool knownNonNegativeFlag = - getKnownNonNegative() || other.getKnownNonNegative(); - bool knownNegativeOneFlag = - getKnownNegativeOne() || other.getKnownNegativeOne(); - bool knownNonSizeOneFlag = getKnownNonSizeOne() || - other.getKnownNonSizeOne() || knownNegativeOneFlag; - bool knownNonSizeZeroFlag = getKnownNonSizeZero() || - other.getKnownNonSizeZero() || - knownNegativeOneFlag; - - if (knownNonNegativeFlag && knownNegativeOneFlag) return false; - - updateKnownNonSizeZero(knownNonSizeZeroFlag); - updateKnownNonSizeOne(knownNonSizeOneFlag); - updateKnownNegativeOne(knownNegativeOneFlag); - updateKnownNonNegative(knownNonNegativeFlag); - - return true; -} - -const char *DimOp::attributes_name[attributes_num] = {"name"}; // NOLINT - -void DimOp::Build(Builder &builder, - OperationArgument &argument, - const std::string &name) { - ir::Attribute attr_name = - ir::StrAttribute::get(ir::IrContext::Instance(), name); - argument.AddAttribute("name", attr_name); - argument.output_types.emplace_back( - ir::IndexType::get(ir::IrContext::Instance())); -} - -const std::string DimOp::getName() { - return attribute("name").AsString(); -} - -void DimOp::setName(std::string attrName) { - operation()->set_attribute( - "name", ir::StrAttribute::get(ir::IrContext::Instance(), attrName)); -} - -const char *TieProductEqualOp::attributes_name[attributes_num] = { - "lhs_len", "rhs_len"}; // NOLINT - -void TieProductEqualOp::Build(Builder &builder, - OperationArgument &argument, - int64_t lhs_len, - int64_t rhs_len, - const std::vector &inputs) { - ir::Attribute attr_lhs_len = - ir::Int64Attribute::get(ir::IrContext::Instance(), lhs_len); - argument.AddAttribute("lhs_len", attr_lhs_len); - ir::Attribute attr_rhs_len = - ir::Int64Attribute::get(ir::IrContext::Instance(), rhs_len); - argument.AddAttribute("rhs_len", attr_rhs_len); - argument.inputs = inputs; -} - -std::vector TieProductEqualOp::getLhs() { - int64_t lhs_len = attribute("lhs_len").data(); - std::vector res; - for (uint32_t idx = 0; idx < lhs_len; idx++) { - res.push_back(operand_source(idx)); - } - return res; -} -std::vector TieProductEqualOp::getRhs() { - int64_t lhs_len = attribute("lhs_len").data(); - int64_t rhs_len = attribute("rhs_len").data(); - std::vector res; - for (uint32_t idx = 0; idx < rhs_len; idx++) { - res.push_back(operand_source(lhs_len + idx)); - } - return res; -} - -} // namespace dialect -} // namespace ir - -IR_DEFINE_EXPLICIT_TYPE_ID(ir::dialect::SymbolicDim) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::dialect::DimOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::dialect::TieProductEqualOp) diff --git a/paddle/ir/pass/CMakeLists.txt b/paddle/ir/pass/CMakeLists.txt deleted file mode 100644 index b4a1d99ab5fcd..0000000000000 --- a/paddle/ir/pass/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -file(GLOB NEW_PASS_SRCS "*.cc") - -ir_library(ir_pass SRCS ${NEW_PASS_SRCS} DEPS ir_core) diff --git a/paddle/ir/pattern_rewrite/CMakeLists.txt b/paddle/ir/pattern_rewrite/CMakeLists.txt deleted file mode 100644 index e99611a4ca050..0000000000000 --- a/paddle/ir/pattern_rewrite/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -file(GLOB PATTERN_SRCS "*.cc") - -ir_library(ir_pattern_rewrite SRCS ${PATTERN_SRCS} DEPS ir_core) diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index c7501494b1e71..887f6b2fb0d24 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -590,5 +590,28 @@ 
std::vector SetKernelDistOutput( return results; } +std::vector SetKernelDistInplaceOutput( + size_t out_size, std::vector* out) { + std::vector results(out->size(), nullptr); + for (size_t i = 0; i < out->size(); ++i) { + results[i] = + static_cast(out->at(i).impl().get()); + } + return results; +} + +std::vector SetKernelDistInplaceOptionalOutput( + size_t out_size, paddle::optional> out) { + std::vector results; + if (out) { + results = std::vector(out->size(), nullptr); + for (size_t i = 0; i < out->size(); ++i) { + results[i] = + static_cast(out->at(i).impl().get()); + } + } + return results; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index d0281dfc68184..b13688a2ffb49 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -150,5 +150,11 @@ std::vector SetKernelDistOutput( std::vector SetKernelDistOutput( size_t out_size, std::vector* out); +std::vector SetKernelDistInplaceOutput( + size_t out_size, std::vector* out); + +std::vector SetKernelDistInplaceOptionalOutput( + size_t out_size, paddle::optional> out); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 7515ff917f10e..e2bb35948d537 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -672,7 +672,7 @@ PrepareDataForDistTensor(const std::vector& input, const TransformFlag& transform_flag, bool is_stride_kernel) { std::vector> out; - for (auto x : input) { + for (auto& x : input) { const auto& tensor_in = x.impl(); if (tensor_in) { phi::distributed::DistTensor* dist_tensor = @@ -691,16 +691,16 @@ PrepareDataForDistTensor(const std::vector& input, dense_tensor.meta().is_contiguous()))) { out.push_back( std::static_pointer_cast(tensor_in)); - continue; + } else { + phi::DenseTensor trans_in_tensor = TransformData( + dense_tensor, target_args_def, transform_flag, is_stride_kernel); + // TODO(GhostScreaming): The global meta in DistTensor is not changed, + // but the local meta in DenseTensor maybe changed, such as layout + // change(NCHW->NHWC), so the new DistTensor's meta maybe not unified. + VLOG(6) << "PrepareDataForDistTensor return transformed dist tensor"; + out.push_back(std::make_shared( + trans_in_tensor, dist_tensor->dist_attr())); } - phi::DenseTensor trans_in_tensor = TransformData( - dense_tensor, target_args_def, transform_flag, is_stride_kernel); - // TODO(GhostScreaming): The global meta in DistTensor is not changed, - // but the local meta in DenseTensor maybe changed, such as layout - // change(NCHW->NHWC), so the new DistTensor's meta maybe not unified. 
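The `paddle::optional` overloads of PrepareDataForDistTensor added below all follow one lifting pattern: an absent input yields `paddle::none`, a present one delegates to the non-optional overload. A generic sketch of that pattern (`Lift` is a hypothetical helper, not part of this PR):

```cpp
#include <utility>
#include "paddle/utils/optional.h"  // assumed home of paddle::optional

// Lift a function over paddle::optional: absent in, absent out.
template <typename T, typename F>
auto Lift(const paddle::optional<T> &in, F &&f)
    -> paddle::optional<decltype(f(*in))> {
  if (in) {
    return {std::forward<F>(f)(*in)};  // delegate to the plain overload
  }
  return paddle::none;
}
```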
- VLOG(6) << "PrepareDataForDistTensor return transformed dist tensor"; - out.push_back(std::make_shared( - trans_in_tensor, dist_tensor->dist_attr())); } else { out.push_back(nullptr); } @@ -708,5 +708,29 @@ PrepareDataForDistTensor(const std::vector& input, return out; } +paddle::optional PrepareDataForDistTensor( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag, + bool is_stride_kernel) { + if (input) { + return {*PrepareDataForDistTensor( + *input, target_args_def, transform_flag, is_stride_kernel)}; + } + return paddle::none; +} + +paddle::optional>> +PrepareDataForDistTensor(const paddle::optional>& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag, + bool is_stride_kernel) { + if (input) { + return PrepareDataForDistTensor( + *input, target_args_def, transform_flag, is_stride_kernel); + } + return paddle::none; +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 3ac1b94f144ba..1e6cca8bcf5fd 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -198,5 +198,17 @@ PrepareDataForDistTensor(const std::vector& input, const TransformFlag& transform_flag, bool is_stride_kernel); +paddle::optional PrepareDataForDistTensor( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag, + bool is_stride_kernel); + +paddle::optional>> +PrepareDataForDistTensor(const paddle::optional>& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag, + bool is_stride_kernel); + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/yaml/generator/dist_api_gen.py b/paddle/phi/api/yaml/generator/dist_api_gen.py index ed671ecdfebd6..9c17d51a3b407 100644 --- a/paddle/phi/api/yaml/generator/dist_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_api_gen.py @@ -93,19 +93,23 @@ VECTOR_OUT_CREATION_TEMPLATE = """ auto dist_out = SetKernelDistOutput({}, &api_output); std::vector dense_out(dist_out.size()); - for (size_t i = 0; i < dist_out.size(); i++) {{ + for (size_t i = 0; i < dist_out.size(); ++i) {{ dense_out[i] = const_cast(&dist_out[i]->value()); }} """ MULTI_VECTOR_OUT_CREATION_TEMPLATE = """ auto dist_out_{out_name} = SetKernelDistOutput({size}, {in_name}); std::vector dense_out_{out_name}(dist_out_{out_name}.size()); - for (size_t i = 0; i < dist_out_{out_name}.size(); i++) {{ + for (size_t i = 0; i < dist_out_{out_name}.size(); ++i) {{ dense_out_{out_name}[i] = const_cast(&dist_out_{out_name}[i]->value()); }} """ -# TODO(GhostScreaming): support tuple output later -TUPLE_OUT_CREATION_TEMPLATE = """ +MULTI_VECTOR_INPLACE_AND_OPTIONAL_OUT_CREATION_TEMPLATE = """ + auto dist_out_{out_name} = {out_func}({size}, {in_name}); + std::vector dense_out_{out_name}(dist_out_{out_name}.size()); + for (size_t i = 0; i < dist_out_{out_name}.size(); ++i) {{ + dense_out_{out_name}[i] = dist_out_{out_name}[i] ? const_cast(&dist_out_{out_name}[i]->value()) : nullptr; + }} """ # 3. 
Infer Global Shape @@ -119,12 +123,28 @@ {name}_meta_vec.emplace_back(MakeMetaTensor(*tmp.impl())); }} std::vector {name}_meta_ptr_vec({name}_meta_vec.size()); - for (size_t i=0; i<{name}_meta_ptr_vec.size(); i++) {{ + for (size_t i=0; i < {name}_meta_ptr_vec.size(); ++i) {{ {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; }} """ -# TODO(GhostScreaming): support optional args later -OPTIONAL_GLOBAL_VECTOR_META_IN_TEMPLATE = """ +OPTIONAL_GLOBAL_SINGLE_META_IN_TEMPLATE = """meta_dist_{}, """ +OPTIONAL_GLOBAL_SINGLE_META_IN_DECL_TEMPLATE = """ + phi::MetaTensor meta_dist_{name} = {name} ? MakeMetaTensor(*(*{name}).impl()) : phi::MetaTensor(); +""" +OPTIONAL_GLOBAL_VECTOR_META_IN_TEMPLATE = """{}_meta_ptr_vec, """ +OPTIONAL_GLOBAL_VECTOR_META_IN_DECL_TEMPLATE = """ + std::vector {name}_meta_vec_tmp; + if ({name}) {{ + for (auto tmp : *{name}) {{ + {name}_meta_vec_tmp.emplace_back(MakeMetaTensor(*tmp.impl())); + }} + }} + std::vector {name}_meta_ptr_vec_tmp({name}_meta_vec_tmp.size()); + for (size_t i = 0; i < {name}_meta_ptr_vec_tmp.size(); ++i) {{ + {name}_meta_ptr_vec_tmp[i] = &{name}_meta_vec_tmp[i]; + }} + paddle::optional> {name}_meta_ptr_vec = + {name} ? paddle::make_optional>({name}_meta_ptr_vec_tmp) : paddle::none; """ SINGLE_GLOBAL_META_OUT_DECL_TEMPLATE = """ phi::MetaTensor meta_{}({});""" @@ -134,7 +154,7 @@ {name}_meta_vec.emplace_back(phi::MetaTensor(tmp)); }} std::vector {name}_meta_ptr_vec({name}.size()); - for (size_t i=0; i<{name}_meta_vec.size(); i++) {{ + for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{ {name}_meta_ptr_vec[i] = &{name}_meta_vec[i]; }} """ @@ -173,10 +193,31 @@ }} std::vector dense_input_{name}_meta_vec = MakeMetaTensor(dense_input_{name}_vec); std::vector dense_input_{name}_meta_ptr_vec(dense_input_{name}_meta_vec.size()); - for (size_t i=0; i input_{name} = dist_input_{name} ? paddle::make_optional(dist_input_{name}->value()) : paddle::none; +""" +OPTIONAL_VECTOR_PREPARE_DATA_TEMPLATE = """ + auto dist_input_{name}_vec = PrepareDataForDistTensor({name}, GetKernelInputArgDef(kernel.InputAt({index}), kernel_backend), {trans_flag}, kernel_result.is_stride_kernel); + std::vector dense_input_{name}_vec; + if ({name}) {{ + for (auto tmp : *dist_input_{name}_vec) {{ + dense_input_{name}_vec.emplace_back(&tmp->value()); + }} + }} + paddle::optional> input_{name}(dense_input_{name}_vec); + std::vector dense_input_{name}_meta_vec = MakeMetaTensor(dense_input_{name}_vec); + std::vector dense_input_{name}_meta_ptr_vec_tmp(dense_input_{name}_meta_vec.size()); + for (size_t i = 0; i < dense_input_{name}_meta_ptr_vec_tmp.size(); ++i) {{ + dense_input_{name}_meta_ptr_vec_tmp[i] = &dense_input_{name}_meta_vec[i]; + }} + paddle::optional> dense_input_{name}_meta_ptr_vec = + {name} ? paddle::make_optional>(dense_input_{name}_meta_ptr_vec_tmp) : paddle::none; +""" INFER_META_SINGLE_INPUT_TEMPLATE = """ auto dist_input_{} = {}.impl(); auto input_{} = &(static_cast(dist_input_{}.get())->value()); @@ -191,16 +232,15 @@ # 7. 
Infer Local DenseTensor Meta
 SINGLE_META_IN_TEMPLATE = """MakeMetaTensor(*input_{}), """
-# TODO(GhostScreaming): support optional args later
 VECTOR_META_IN_TEMPLATE = """dense_input_{}_meta_ptr_vec, """
-OPTIONAL_VECTOR_META_IN_TEMPLATE = """
-"""
+OPTIONAL_SINGLE_META_IN_TEMPLATE = """MakeMetaTensor(input_{}), """
+OPTIONAL_VECTOR_META_IN_TEMPLATE = """dense_input_{}_meta_ptr_vec, """
 SINGLE_META_OUT_DECL_TEMPLATE = """
       phi::MetaTensor meta_{}({});"""
 VECTOR_META_OUT_DECL_TEMPLATE = """
       std::vector<phi::MetaTensor> {name}_meta_vec = MakeMetaTensor({name});
       std::vector<phi::MetaTensor*> {name}_meta_ptr_vec({name}_meta_vec.size());
-      for (size_t i=0; i<{name}_meta_vec.size(); i++) {{
+      for (size_t i = 0; i < {name}_meta_vec.size(); ++i) {{
         {name}_meta_ptr_vec[i] = &{name}_meta_vec[i];
       }}
 """
@@ -221,6 +261,22 @@
     auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
     (*kernel_fn)({}, {});
 """
+# TODO(GhostScreaming): Some operators generate shape info at runtime,
+# e.g. bincount. As a result, dist_output's global shape is set incorrectly,
+# because it is generated in the InferMeta function. A temporary solution is
+# to use a black op list and set the DistTensor shape as an extra step.
+SINGLE_SET_DIST_OUT_DIMS = """
+    dist_out->unsafe_set_dims(dense_out->dims());
+"""
+MULTI_SINGLE_SET_DIST_OUT_DIMS = """
+    dist_out_{}->unsafe_set_dims(dense_out_{}->dims());
+"""
+VECTOR_SET_DIST_OUT_DIMS = """
+    for (size_t i = 0; i < dist_out.size(); ++i) {{
+        dist_out[i]->unsafe_set_dims(dense_out[i]->dims());
+    }}
+"""
+
 PREFIX_VECTOR_TENSOR_NAME = "dense_input_"
 SUFFIX_VECTOR_TENSOR_NAME = "_vec"

@@ -236,13 +292,15 @@
 #     types : [], list of output types
 #     out_size_expr : [], expression for getting size of vector

-# TODO(GhostScreaming): Support std::tuple<...> type of input and output later.
-skip_op_lists = [
-    "check_finite_and_unscale",  # std::vector<Tensor>&, const Tensor& -> std::tuple<std::vector<Tensor>&, Tensor>
-    "coalesce_tensor",  # const std::vector<Tensor>&, DataType, bool, bool, bool, float, bool, int, int, const std::vector<int64_t>&, const std::vector<int64_t>& -> std::tuple<std::vector<Tensor>, Tensor>
-    "update_loss_scaling",  # std::vector<Tensor>, const Tensor, ... -> std::tuple<std::vector<Tensor>, Tensor, Tensor, Tensor>
-    "einsum",
-    "einsum_grad",  # const std::vector<Tensor>&, const std::string& -> std::tuple<std::vector<Tensor>, std::vector<Tensor>>
+
+# TODO(GhostScreaming): Black list of operators which infer shape at runtime.
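+# For an op in this list, the generated kernel-call code appends one of the
+# *_SET_DIST_OUT_DIMS snippets above, so that after the local kernel has run
+# the DistTensor's global shape is refreshed from the local DenseTensor, e.g.
+#     dist_out->unsafe_set_dims(dense_out->dims());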
+ops_infer_shape_in_runtime = [ + "bincount", + "bicubic_interp", + "bilinear_interp", + "linear_interp", + "nearest_interp", + "trilinear_interp", ] @@ -256,12 +314,15 @@ def init_dist_api_members(self): "const Tensor&": { "dense": self.generate_single_dense_input, }, - "const paddle::optional&": { - "dense": self.generate_single_dense_input, - }, "const std::vector&": { "dense": self.generate_vector_dense_input, }, + "const paddle::optional&": { + "dense": self.generate_optional_single_dense_input, + }, + "const paddle::optional>&": { + "dense": self.generate_optional_vector_dense_input, + }, } self.inplace_flag = False @@ -423,25 +484,28 @@ def generate_output_creation_code(self) -> str: get_out_code = f"&std::get<{i}>(api_output)" if self.is_inplace_and_optional_output(i): get_out_code = f"std::get<{i}>(api_output).get_ptr()" - if out_type == 'std::vector': self.vector_output_size_assertion_check() # Special case for inplace vector and inplace optional - # TODO(chenweihang): support this branch later if self.is_inplace_output(i): - set_out_func = "SetInplaceVectorKernelOutput" + set_out_func = "SetKernelDistInplaceOutput" if self.is_inplace_and_optional_output(i): - set_out_func = ( - "SetInplaceOptionalVectorKernelOutput" - ) + set_out_func = "SetKernelDistInplaceOptionalOutput" get_out_code = f"std::get<{i}>(api_output)" - output_creation_code += ( - MULTI_VECTOR_OUT_CREATION_TEMPLATE.format( + output_creation_code += MULTI_VECTOR_INPLACE_AND_OPTIONAL_OUT_CREATION_TEMPLATE.format( + out_func=set_out_func, out_name=i, size=self.outputs['out_size_expr'][i], in_name=get_out_code, ) - ) + else: + output_creation_code += ( + MULTI_VECTOR_OUT_CREATION_TEMPLATE.format( + out_name=i, + size=self.outputs['out_size_expr'][i], + in_name=get_out_code, + ) + ) else: if self.infer_meta['spmd_rule'] is not None: output_creation_code += ( @@ -496,6 +560,31 @@ def generate_infer_global_shape_code(self) -> str: input_meta_code += ( VECTOR_GLOBAL_META_IN_DECL_TEMPLATE.format(name=param) ) + elif ( + self.inputs['input_info'][param] + == "const paddle::optional&" + ): + input_args_code += ( + OPTIONAL_GLOBAL_SINGLE_META_IN_TEMPLATE.format(param) + ) + input_meta_code += ( + OPTIONAL_GLOBAL_SINGLE_META_IN_DECL_TEMPLATE.format( + name=param + ) + ) + elif ( + self.inputs['input_info'][param] + == "const paddle::optional>&" + ): + input_args_code += ( + OPTIONAL_GLOBAL_VECTOR_META_IN_TEMPLATE.format(param) + ) + input_meta_code += ( + OPTIONAL_GLOBAL_VECTOR_META_IN_DECL_TEMPLATE.format( + name=param + ) + ) + else: raise ValueError( f"{self.api} : Param of infer_spmd error : {self.inputs['input_info'][param]} type is not supported." @@ -517,12 +606,7 @@ def generate_infer_global_shape_code(self) -> str: output_decl_code += VECTOR_GLOBAL_META_OUT_DECL_TEMPLATE.format( name=out_name ) - if len(self.dense_output_args) == 1: - output_args_code += f"{out_name}_meta_ptr_vec, " - else: - output_args_code += ( - f"{out_name} ? 
{out_name}_meta_ptr_vec : nullptr, " - ) + output_args_code += f"{out_name}_meta_ptr_vec, " else: output_decl_code += SINGLE_GLOBAL_META_OUT_DECL_TEMPLATE.format( out_name, out_name @@ -628,6 +712,46 @@ def generate_vector_dense_input( return input_tensor_code + def generate_optional_single_dense_input( + self, + input_name, + ): + input_tensor_code = "" + trans_flag = self.gene_trans_flag(input_name) + input_names = self.inputs['names'] + attr_names = self.attrs['names'] + kernel_param = self.kernel['param'] + if kernel_param is None: + kernel_param = input_names + attr_names + + input_tensor_code += OPTIONAL_SINGLE_PREPARE_DATA_TEMPLATE.format( + name=input_name, + index=kernel_param.index(input_name), + trans_flag=trans_flag, + ) + + return input_tensor_code + + def generate_optional_vector_dense_input( + self, + input_name, + ): + input_tensor_code = "" + trans_flag = self.gene_trans_flag(input_name) + input_names = self.inputs['names'] + attr_names = self.attrs['names'] + kernel_param = self.kernel['param'] + if kernel_param is None: + kernel_param = input_names + attr_names + + input_tensor_code += OPTIONAL_VECTOR_PREPARE_DATA_TEMPLATE.format( + name=input_name, + index=kernel_param.index(input_name), + trans_flag=trans_flag, + ) + + return input_tensor_code + def generate_prepare_data_code(self) -> str: input_names = self.inputs['names'] attr_names = self.attrs['names'] @@ -703,6 +827,20 @@ def generate_infer_meta_code(self) -> str: == "const std::vector&" ): input_args_code += VECTOR_META_IN_TEMPLATE.format(param) + elif ( + self.inputs['input_info'][param] + == "const paddle::optional&" + ): + input_args_code += OPTIONAL_SINGLE_META_IN_TEMPLATE.format( + param + ) + elif ( + self.inputs['input_info'][param] + == "const paddle::optional>&" + ): + input_args_code += OPTIONAL_VECTOR_META_IN_TEMPLATE.format( + param + ) else: raise ValueError( f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." @@ -724,12 +862,7 @@ def generate_infer_meta_code(self) -> str: output_decl_code += VECTOR_META_OUT_DECL_TEMPLATE.format( name=out_name ) - if len(self.dense_output_args) == 1: - output_args_code += f"{out_name}_meta_ptr_vec, " - else: - output_args_code += ( - f"{out_name} ? 
{out_name}_meta_ptr_vec : nullptr, " - ) + output_args_code += f"{out_name}_meta_ptr_vec, " else: output_decl_code += SINGLE_META_OUT_DECL_TEMPLATE.format( out_name, out_name @@ -818,11 +951,22 @@ def generate_kernel_call_code(self) -> str: kernel_args_type_list.append(dense_output_trans_map[out_type]) kernel_signature = "void(*)(" + ", ".join(kernel_args_type_list) + ")" - return KERNEL_CALL_TEMPLATE.format( + result = KERNEL_CALL_TEMPLATE.format( kernel_signature, ", ".join(input_args), ", ".join(self.dense_output_args), ) + global ops_infer_shape_in_runtime + if self.kernel['func'][0] in ops_infer_shape_in_runtime: + if len(self.outputs['types']) == 1: + if self.outputs['types'][0] == 'Tensor': + result += SINGLE_SET_DIST_OUT_DIMS + elif self.outputs['types'][0] == 'std::vector': + result += VECTOR_SET_DIST_OUT_DIMS + else: + for i in range(len(self.outputs['types'])): + result += MULTI_SINGLE_SET_DIST_OUT_DIMS.format(i, i) + return result def generate_return_code(self) -> str: return self.gene_return_code() @@ -845,19 +989,17 @@ def generate_auto_paralel_branch(self) -> str: ) def check_argument_whether_support_auto_parallel(self): - global skip_op_lists for name in self.inputs['names']: if self.inputs['input_info'][name] not in [ "const Tensor&", "const std::vector&", + "const paddle::optional&", + "const paddle::optional>&", ]: return False for out_type in self.outputs['types']: if out_type not in ["Tensor", "std::vector"]: return False - - if self.kernel['func'][0] in skip_op_lists: - return False return True # override BaseAPI's method diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 25944e3356966..8f39859882579 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -48,6 +48,13 @@ dense_out[i] = const_cast(&dist_out[i]->value()); }} """ +VECTOR_OUT_CREATION_TEMPLATE = """ + auto dist_out = SetKernelDistOutput({name}); + std::vector dense_out(dist_out.size()); + for (size_t i = 0; i < dist_out.size(); i++) {{ + dense_out[i] = const_cast(&dist_out[i]->value()); + }} +""" INPLACE_OUT_CREATION_TEMPLATE = """ *{} = {}; """ @@ -69,6 +76,13 @@ auto dist_input_{arg} = PrepareDataForDistTensor({arg}, GetKernelInputArgDef(kernel.InputAt({idx}), kernel_backend), {flag}, kernel_result.is_stride_kernel); auto input_{arg} = &dist_input_{}->value(); """ +MULTI_VECTOR_OUT_CREATION_TEMPLATE = """ + auto dist_out_{i} = SetKernelDistOutput({name}); + std::vector dense_out_{i}(dist_out_{i}.size()); + for (size_t i = 0; i < dist_out_{i}.size(); i++) {{ + dense_out_{i}[i] = const_cast(&dist_out_{i}[i]->value()); + }} +""" class DistBackwardAPI(DistForwardAPI, BackwardAPI): @@ -104,6 +118,12 @@ def generate_output_creation_code(self) -> str: i, self.outputs['names'][i], i, i ) ) + elif out_type == 'std::vector': + output_creation_code += ( + MULTI_VECTOR_OUT_CREATION_TEMPLATE.format( + i=i, name=self.outputs['names'][i] + ) + ) else: self.vector_output_size_assertion_check() else: diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 4c151374c6893..a647e02b35ef2 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -123,6 +123,25 @@ backward : batch_norm_grad optional : reserve_space +- op : c_allgather + args : (Tensor x, int ring_id, int nranks, bool use_calc_stream) + output : Tensor(out) + infer_meta : + func : AllGatherInferMeta + param: [x, nranks] + kernel : + func : c_allgather + +- op 
: c_allreduce_max + args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) + output : Tensor(out) + infer_meta : + func : AllReduceInferMeta + param : [x] + kernel : + func : c_allreduce_max + inplace : (x -> out) + - op : c_allreduce_sum args : (Tensor x, int ring_id, bool use_calc_stream, bool use_model_parallel) output : Tensor(out) @@ -173,6 +192,16 @@ func : c_identity inplace : (x -> out) +- op : c_reduce_sum + args : (Tensor x, int ring_id, int root_id, bool use_calc_stream) + output : Tensor(out) + infer_meta : + func : DistReduceInferMeta + param : [x] + kernel : + func : c_reduce_sum + inplace : (x -> out) + - op : c_sync_calc_stream args : (Tensor x) output : Tensor(out) @@ -651,19 +680,11 @@ output : Tensor infer_meta : func : MatmulInferMeta - spmd_rule : MatmulSpmdInferForward + spmd_rule : MatmulInferSpmd kernel : func : matmul backward : matmul_grad -- op : matmul_int8 - args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false) - output : Tensor - infer_meta : - func : MatmulInt8InferMeta - kernel : - func : matmul_int8 - - op : matrix_rank args : (Tensor x, float tol, bool use_default_tol=true, bool hermitian=false) output : Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 495ba53cd7613..9d499c68bef74 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -810,6 +810,7 @@ backward : dropout_grad inputs : x : X + seed_tensor : Seed outputs : out : Out mask : Mask @@ -2428,6 +2429,8 @@ out : Out - op : seed + outputs : + out : Out extra : attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] @@ -2638,6 +2641,7 @@ out : Out - op : split + backward : split_grad inputs: x : X outputs: @@ -3047,6 +3051,18 @@ yolo_loss : GetYoloLossExpectedKernelType yolo_loss_grad : GetYoloLossExpectedKernelType +- op: c_allgather + inputs : + x : X + outputs : + out: Out + +- op: c_allreduce_max + inputs : + x : X + outputs : + out: Out + - op: c_allreduce_sum inputs : x : X @@ -3065,6 +3081,12 @@ outputs : out: Out +- op: c_reduce_sum + inputs : + x : X + outputs : + out: Out + - op: c_sync_calc_stream inputs : x : X diff --git a/paddle/phi/backends/device_manager.cc b/paddle/phi/backends/device_manager.cc index d95cb12646f41..24ad5087769de 100644 --- a/paddle/phi/backends/device_manager.cc +++ b/paddle/phi/backends/device_manager.cc @@ -30,10 +30,17 @@ namespace phi { void Device::CheckInitialized() { - std::call_once(initialized_, [&]() { this->impl_->InitDevice(dev_id_); }); + std::call_once(initialized_once_flag_, [&]() { + this->impl_->InitDevice(dev_id_); + this->initialized_ = true; + }); } -Device::~Device() { impl_->DeInitDevice(dev_id_); } +Device::~Device() { + if (initialized_) { + impl_->DeInitDevice(dev_id_); + } +} void Device::CreateStream(stream::Stream* stream, const stream::Stream::Priority& priority, diff --git a/paddle/phi/backends/device_manager.h b/paddle/phi/backends/device_manager.h index 62c85aeb52674..58a9e6ebe7ab8 100644 --- a/paddle/phi/backends/device_manager.h +++ b/paddle/phi/backends/device_manager.h @@ -127,7 +127,8 @@ class Device final { private: size_t dev_id_; DeviceInterface* impl_; - std::once_flag initialized_; + std::once_flag initialized_once_flag_; + bool initialized_{false}; }; class DeviceManager { diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 32a5d10d6291b..555cc2357b2ab 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h 
+++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -88,6 +88,12 @@ cudaDataType_t ToCudaDataType() { #if CUDA_VERSION >= 11000 } else if (std::is_same::value) { return CUDA_R_16BF; +#endif +#if CUDA_VERSION >= 11060 + } else if (std::is_same::value) { + return CUDA_R_8I; + } else if (std::is_same::value) { + return CUDA_R_32I; #endif } else { PADDLE_THROW(phi::errors::InvalidArgument( diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index e0ff7f11ac542..ceb46874238f3 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -456,6 +456,26 @@ HOSTDEVICE inline complex tan(const complex& a) { #endif } +template +HOSTDEVICE inline complex sinh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::sinh(thrust::complex(a))); +#else + return complex(std::sinh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex cosh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::cosh(thrust::complex(a))); +#else + return complex(std::cosh(std::complex(a))); +#endif +} + template HOSTDEVICE inline complex tanh(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ @@ -466,6 +486,66 @@ HOSTDEVICE inline complex tanh(const complex& a) { #endif } +template +HOSTDEVICE inline complex asin(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::asin(thrust::complex(a))); +#else + return complex(std::asin(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex acos(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::acos(thrust::complex(a))); +#else + return complex(std::acos(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex atan(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::atan(thrust::complex(a))); +#else + return complex(std::atan(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex asinh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::asinh(thrust::complex(a))); +#else + return complex(std::asinh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex acosh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::acosh(thrust::complex(a))); +#else + return complex(std::acosh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex atanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::atanh(thrust::complex(a))); +#else + return complex(std::atanh(std::complex(a))); +#endif +} + template HOSTDEVICE inline complex conj(const complex& a) { #if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 20b1da1efe39d..89f18920ef1fd 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -232,6 +232,9 @@ void DenseTensor::set_meta(const DenseTensorMeta& meta) { } else { meta_.strides = meta.strides; } +#ifdef PADDLE_WITH_XPU + 
meta_.scale_value = meta.scale_value;
+#endif
 }

 /* @jim19930609: This interface will be further modified until we finalized the
diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc
index ed1944ade402b..4595d11a594f7 100644
--- a/paddle/phi/core/dense_tensor_impl.cc
+++ b/paddle/phi/core/dense_tensor_impl.cc
@@ -392,6 +392,9 @@ DenseTensor& DenseTensor::ShareDataWith(const DenseTensor& src) {
   meta_.offset = src.meta_.offset;
   meta_.use_gpudnn = src.meta_.use_gpudnn;
   meta_.strides = src.meta_.strides;
+#ifdef PADDLE_WITH_XPU
+  meta_.scale_value = src.meta_.scale_value;
+#endif
   storage_properties_ =
       std::move(CopyStorageProperties(src.storage_properties_));
 #ifdef PADDLE_WITH_DNNL
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
index fc105915738bb..46e58cc9b373e 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
+++ b/paddle/phi/core/distributed/auto_parallel/dist_attr.cc
@@ -102,6 +102,10 @@ void TensorDistAttr::set_partial_status(const std::vector<int64_t>& dims,
           "Trying to Set dim %d as Partial which is already a Partial dim.",
           dim));
     }
+    if (std::count(dims_mapping_.begin(), dims_mapping_.end(), dim)) {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Trying to set dim %d as Partial when it is already used as a "
+          "sharding dim.",
+          dim));
+    }
     partial_status_.emplace(dim, type);
   }
 }
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
index 830665670e8ca..b9103a00c9d02 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
@@ -14,6 +14,7 @@

 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

+#include "glog/logging.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -54,14 +55,11 @@ DistTensor::DistTensor(const phi::DenseTensor& global_value,
 DistTensor::DistTensor(const DDim& dims, const TensorDistAttr& dist_attr)
     : dims_(dims), dist_attr_(dist_attr) {}

-void DistTensor::set_dims(const DDim& dims) {
-  PADDLE_ENFORCE_EQ(
-      this->initialized(),
-      false,
-      phi::errors::Unimplemented(
-          "DistTensor's set_dims method can only be used when the `value` "
-          "is not initialized (generally used in the InferMeta and "
-          "InferSPMD stages)."));
+void DistTensor::unsafe_set_dims(const DDim& dims) {
+  if (this->initialized()) {
+    VLOG(3) << "You are trying to set the global dims of an initialized "
+               "DistTensor; make sure you are aware of where its dims are "
+               "being changed.";
+  }
   dims_ = dims;
 }

diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
index bc8b98d81a3ff..1289a23b1be8c 100644
--- a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
@@ -56,7 +56,7 @@ class DistTensor final

   /// \brief Set the global dims of the dist tensor.
   /// \return void
-  void set_dims(const DDim& dims);
+  void unsafe_set_dims(const DDim& dims);

   /// \brief Returns the dist attr of current dist tensor.
/// \return The TensorDistAttr's const reference
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
index 531727b3ee8d1..a1895b6dfbd79 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -18,7 +18,18 @@ namespace phi {
 namespace distributed {

 void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) {
+  int index = static_cast<int>(inputs_.size());
   inputs_.emplace_back(std::move(input));
+  input_range_.emplace_back(std::pair<int, int>(index, index + 1));
+}
+
+void InferSpmdContext::EmplaceBackInputs(
+    paddle::small_vector<DistMetaTensor, phi::kInputSmallVectorSize> inputs) {
+  int index = static_cast<int>(inputs_.size());
+  input_range_.emplace_back(std::pair<int, int>(index, index + inputs.size()));
+  inputs_.insert(inputs_.end(),
+                 std::make_move_iterator(inputs.begin()),
+                 std::make_move_iterator(inputs.end()));
 }

 void InferSpmdContext::EmplaceBackAttr(Attribute attr) {
@@ -63,6 +74,23 @@ const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
   return attrs_.at(idx);
 }

+const std::pair<int, int>& InferSpmdContext::InputRangeAt(size_t idx) const {
+  return input_range_.at(idx);
+}
+
+const std::vector<const DistMetaTensor*> InferSpmdContext::InputsBetween(
+    size_t start, size_t end) const {
+  std::vector<const DistMetaTensor*> result;
+  result.reserve(end - start);
+  for (size_t i = start; i < end; ++i) {
+    auto& in = inputs_.at(i);
+    result.emplace_back(&in);
+    // result.emplace_back(in.initialized() ? &in : nullptr);
+  }
+
+  return result;
+}
+
 SpmdRuleFactory& SpmdRuleFactory::Instance() {
   static SpmdRuleFactory g_spmd_rule_map;
   return g_spmd_rule_map;
diff --git a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
index bccee2bf5981a..3896bfcd6a2fe 100644
--- a/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h
@@ -45,9 +45,15 @@ class InferSpmdContext {
   void EmplaceBackInput(DistMetaTensor input);
   void EmplaceBackAttr(Attribute attr);
+  void EmplaceBackInputs(
+      paddle::small_vector<DistMetaTensor, phi::kInputSmallVectorSize> inputs);

   const DistMetaTensor& InputAt(size_t idx) const;
+  const std::pair<int, int>& InputRangeAt(size_t idx) const;
+  const std::vector<const DistMetaTensor*> InputsBetween(size_t start,
+                                                         size_t end) const;
+
   template <typename AttrType>
   AttrType AttrAt(size_t idx) const;
@@ -59,6 +65,9 @@ class InferSpmdContext {
   // Because the attribute arguments of dygraph do not have `attr name`,
   // so we use vector instead of map
   paddle::small_vector attrs_;
+  // for vector arguments
+  paddle::small_vector<std::pair<int, int>, phi::kInputSmallVectorSize>
+      input_range_;
 };

 using InferSpmdFn = SpmdInfo (*)(const InferSpmdContext&);
@@ -98,6 +107,24 @@ struct InferSpmdFnImpl {
   }
 };

+  // for vector slot
+  template <typename... Tail>
+  struct InferSpmdFnCallHelper<const std::vector<const DistMetaTensor*>&,
+                               Tail...> {
+    template <int in_idx, int attr_idx, typename... PreviousArgs>
+    static SpmdInfo Call(const InferSpmdContext& ctx, PreviousArgs&...
pargs) { + static_assert(attr_idx == 0, + "InferSpmd's Input should appear before Attributes."); + + const std::pair range = ctx.InputRangeAt(in_idx); + std::vector arg = + ctx.InputsBetween(range.first, range.second); + return InferSpmdFnCallHelper::template Call( + ctx, pargs..., arg); + } + }; + #define PD_SPECIALIZE_InferSpmdFnCallHelper_FOR_ATTRIBUTE(attr_type) \ template \ struct InferSpmdFnCallHelper { \ diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index a7df7f3203734..e02868d5e2c1b 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -749,9 +749,9 @@ PHI_DEFINE_EXPORTED_int32( * [false]: not set 0D Tensor to 1D Numpy, close the hack * * Now, just set true by default in 2.5 transition time - * which will be removed in future (2.6 or 2.7) . + * which will be removed in future (2.6) . */ -PHI_DEFINE_EXPORTED_bool(set_to_1d, true, "set 0D Tensor to 1D numpy"); +PHI_DEFINE_EXPORTED_bool(set_to_1d, false, "set 0D Tensor to 1D numpy"); /** * Debug related FLAG @@ -1312,7 +1312,7 @@ PHI_DEFINE_EXPORTED_bool(enable_new_ir_in_executor_trace_run, PHI_DEFINE_EXPORTED_bool(new_ir_apply_inplace_pass, true, "Whether to apply inplace pass on lowering " - "::ir::Program to Kernel Dialect"); + "::pir::Program to Kernel Dialect"); PHI_DEFINE_EXPORTED_bool(enable_record_memory, false, "Enable memory recorder"); @@ -1329,6 +1329,12 @@ PHI_DEFINE_EXPORTED_int64(host_trace_level, "RecordEvent will works " "if host_trace_level >= level."); +PHI_DEFINE_EXPORTED_int32( + multiple_of_cupti_buffer_size, + 1, + "Multiple of the CUPTI device buffer size. If the timestamps have " + "been dropped when you are profiling, try increasing this value."); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) /** * Communication library related FLAG diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 2e85d521c516f..d58decadfadca 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -14,10 +14,6 @@ #include "paddle/phi/core/kernel_factory.h" -#include -#include -#include - #include "glog/logging.h" #include "paddle/phi/core/enforce.h" #include "paddle/utils/flags.h" @@ -37,10 +33,6 @@ PHI_DEFINE_EXPORTED_bool(use_stride_kernel, true, "Whether to use strdie kernel if op support stride."); -PHI_DEFINE_EXPORTED_string(stride_kernel_blacklist, - "", - "It controls the strided kernel subset do not use."); - PD_DECLARE_int32(low_precision_op_list); PD_DECLARE_bool(enable_api_kernel_fallback); PD_DECLARE_bool(run_kp_kernel); @@ -234,26 +226,14 @@ KernelResult KernelFactory::SelectKernelOrThrowError( phi::errors::NotFound("The kernel `%s` is not registered.", kernel_name)); if (FLAGS_use_stride_kernel && use_strided_kernel) { - std::regex reg(","); - std::unordered_set elems{ - std::sregex_token_iterator(FLAGS_stride_kernel_blacklist.begin(), - FLAGS_stride_kernel_blacklist.end(), - reg, - -1), - std::sregex_token_iterator()}; - elems.erase(""); - - if (!elems.count(kernel_name)) { - auto stride_kernel_iter = iter->second.find( - {const_kernel_key.backend() == paddle::experimental::Backend::GPUDNN - ? paddle::experimental::Backend::GPU - : const_kernel_key.backend(), - phi::DataLayout::STRIDED, - const_kernel_key.dtype()}); - if (stride_kernel_iter != iter->second.end()) { - VLOG(1) << "use strided kernel, kernel_name = " << kernel_name; - return {stride_kernel_iter->second, false, true}; - } + auto stride_kernel_iter = iter->second.find( + {const_kernel_key.backend() == paddle::experimental::Backend::GPUDNN + ? 
paddle::experimental::Backend::GPU + : const_kernel_key.backend(), + phi::DataLayout::STRIDED, + const_kernel_key.dtype()}); + if (stride_kernel_iter != iter->second.end()) { + return {stride_kernel_iter->second, false, true}; } } diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 610009fdb70fa..9e3c67fa9ad35 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -16,16 +16,13 @@ #include #include -#include #include #include -#include #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/get_kerneltype_forvar_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/type_defs.h" #include "paddle/phi/core/utils/data_type.h" #include "paddle/utils/flat_hash_map.h" diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 82d750b692e87..a9356dcfc202a 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -14,20 +14,11 @@ #pragma once -#include -#include -#include #include #include -#include #include "paddle/phi/core/custom_kernel.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/extended_tensor.h" -#include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" -#include "paddle/phi/core/macros.h" -#include "paddle/phi/core/type_defs.h" namespace phi { diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 33af6abc83aa4..715b4f76392d8 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -27,7 +27,6 @@ #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/tensor_array.h" -#include "paddle/phi/core/type_defs.h" namespace phi { diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 9b9df5c1ff4aa..53cba02ab0765 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "glog/logging.h" -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h" +#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" #include "paddle/phi/core/enforce.h" @@ -87,7 +87,7 @@ void MetaTensor::set_dims(const DDim& dims) { DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dims = dims; } else if (phi::distributed::DistTensor::classof(tensor_)) { - static_cast(tensor_)->set_dims(dims); + static_cast(tensor_)->unsafe_set_dims(dims); } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported setting dims for `%s`.", tensor_->type_info().name())); diff --git a/paddle/phi/core/tensor_meta.cc b/paddle/phi/core/tensor_meta.cc index 59926ed0b8c25..54c5e409aeb5b 100644 --- a/paddle/phi/core/tensor_meta.cc +++ b/paddle/phi/core/tensor_meta.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/core/tensor_meta.h" -#include "paddle/ir/core/enforce.h" +#include "paddle/pir/core/enforce.h" namespace phi { @@ -118,12 +118,20 @@ DDim DenseTensorMeta::calc_strides(const DDim& dims) { } } -DenseTensorMeta::DenseTensorMeta() { use_gpudnn = true; } +DenseTensorMeta::DenseTensorMeta() { + use_gpudnn = true; +#ifdef PADDLE_WITH_XPU + scale_value = -1.0f; +#endif +} DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& dims) : dims(dims), dtype(dtype) { strides = calc_strides(dims); use_gpudnn = true; +#ifdef PADDLE_WITH_XPU + scale_value = -1.0f; +#endif } DenseTensorMeta::DenseTensorMeta(DataType dtype, @@ -131,6 +139,9 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype, const DDim& strides) : dims(dims), dtype(dtype), strides(strides) { use_gpudnn = true; +#ifdef PADDLE_WITH_XPU + scale_value = -1.0f; +#endif } DenseTensorMeta::DenseTensorMeta(DataType dtype, @@ -140,6 +151,9 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype, : dims(dims), dtype(dtype), layout(layout), offset(offset) { strides = calc_strides(dims); use_gpudnn = true; +#ifdef PADDLE_WITH_XPU + scale_value = -1.0f; +#endif } DenseTensorMeta::DenseTensorMeta(DataType dtype, @@ -150,6 +164,9 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype, : dims(dims), dtype(dtype), layout(layout), lod(lod), offset(offset) { strides = calc_strides(dims); use_gpudnn = true; +#ifdef PADDLE_WITH_XPU + scale_value = -1.0f; +#endif } DenseTensorMeta::DenseTensorMeta(const DenseTensorMeta& other) { @@ -165,6 +182,9 @@ DenseTensorMeta::DenseTensorMeta(const DenseTensorMeta& other) { } else { strides = other.strides; } +#ifdef PADDLE_WITH_XPU + scale_value = other.scale_value; +#endif } DenseTensorMeta& DenseTensorMeta::operator=(const DenseTensorMeta& other) { @@ -180,6 +200,9 @@ DenseTensorMeta& DenseTensorMeta::operator=(const DenseTensorMeta& other) { } else { strides = other.strides; } +#ifdef PADDLE_WITH_XPU + scale_value = other.scale_value; +#endif return *this; } @@ -197,7 +220,9 @@ DenseTensorMeta& DenseTensorMeta::operator=( // NOLINT } else { strides = std::move(other.strides); } - +#ifdef PADDLE_WITH_XPU + scale_value = other.scale_value; +#endif return *this; } diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index ecd746e10037f..2575b51e49fe8 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -82,13 +82,23 @@ struct DenseTensorMeta { LoD lod; size_t offset{0}; DDim strides; + +#ifdef PADDLE_WITH_XPU + // for per tensor scale + float scale_value{-1.0f}; +#endif }; inline bool operator==(const DenseTensorMeta& lhs, const DenseTensorMeta& rhs) { return (lhs.is_scalar == rhs.is_scalar) && lhs.use_gpudnn == rhs.use_gpudnn && (lhs.dims == rhs.dims) && (lhs.dtype == rhs.dtype) && (lhs.layout == rhs.layout) && (lhs.lod == rhs.lod) && +#ifdef PADDLE_WITH_XPU + (lhs.offset == rhs.offset) && (lhs.strides == rhs.strides) && + (lhs.scale_value == rhs.scale_value); +#else (lhs.offset == rhs.offset) && (lhs.strides == rhs.strides); +#endif } struct StringTensorMeta { diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 99b134b6e7960..2cb903fde7310 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/ir/dialect/paddle_dialect/ir/pd_meta_tensor.h" +#include "paddle/fluid/pir/dialect/operator/ir/meta_tensor.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index a66790d0ce6cd..2fd87760378fc 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -2169,77 +2169,11 @@ void MatmulInferMeta(const MetaTensor& x, auto ddim_out = phi::make_ddim(new_dims); out->set_dims(ddim_out); - out->set_dtype(x.dtype()); - out->set_layout(x.layout()); -} - -void MatmulInt8InferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out) { - std::vector dims_x = phi::vectorize(x.dims()); - std::vector dims_y = phi::vectorize(y.dims()); - auto ndims_x = dims_x.size(); - auto ndims_y = dims_y.size(); - PADDLE_ENFORCE_GT(ndims_x, - 0UL, - phi::errors::InvalidArgument( - "The Input(x) dims size must be greater than 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_GT(ndims_y, - 0UL, - phi::errors::InvalidArgument( - "The Input(y) dims size must be greater than 0," - " but reviced dims size is 0. ")); - - bool x_broadcasted = false, y_broadcasted = false; - if (ndims_x == 1) { - dims_x.insert(dims_x.begin(), 1); - ndims_x = 2; - x_broadcasted = true; - } - - if (ndims_y == 1) { - dims_y.push_back(1); - ndims_y = 2; - y_broadcasted = true; - } - - size_t M, N; - if (trans_x) { - M = dims_x[ndims_x - 1]; - } else { - M = dims_x[ndims_x - 2]; - } - if (trans_y) { - N = dims_y[ndims_y - 2]; - } else { - N = dims_y[ndims_y - 1]; - } - - std::vector new_dims; - if (ndims_x > ndims_y) { - new_dims.assign(dims_x.begin(), dims_x.end() - 2); - } else if (ndims_x < ndims_y) { - new_dims.assign(dims_y.begin(), dims_y.end() - 2); + if (x.dtype() == phi::DataType::INT8) { + out->set_dtype(phi::DataType::INT32); } else { - new_dims.reserve(ndims_x); - for (size_t i = 0; i < ndims_x - 2; ++i) { - new_dims.push_back(std::max(dims_x[i], dims_y[i])); - } - } - if (!x_broadcasted) { - new_dims.push_back(M); // NOLINT - } - if (!y_broadcasted) { - new_dims.push_back(N); // NOLINT + out->set_dtype(x.dtype()); } - - auto ddim_out = phi::make_ddim(new_dims); - - out->set_dims(ddim_out); - out->set_dtype(phi::DataType::INT32); out->set_layout(x.layout()); } @@ -2314,7 +2248,11 @@ void MatmulWithFlattenInferMeta(const MetaTensor& x, } out->set_dims(phi::make_ddim(output_dims)); - out->set_dtype(x.dtype()); + if (x.dtype() == phi::DataType::INT8) { + out->set_dtype(phi::DataType::INT32); + } else { + out->set_dtype(x.dtype()); + } out->share_lod(x); } diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 887da467e07b1..94d8bb606ea5d 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -347,12 +347,6 @@ void MatmulInferMeta(const MetaTensor& x, bool trans_y, MetaTensor* out); -void MatmulInt8InferMeta(const MetaTensor& x, - const MetaTensor& y, - bool trans_x, - bool trans_y, - MetaTensor* out); - void MatmulWithFlattenInferMeta(const MetaTensor& x, const MetaTensor& y, int x_num_col_dims, diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index d5da3a2f8bc87..1c57e2fae92ac 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -223,6 +223,11 @@ void RecvV2InferMeta(const int ring_id, out->set_dtype(dtype); } +void SeedInferMeta(int seed, 
MetaTensor* out) { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(DataType::INT32); +} + void TruncatedGaussianRandomInferMeta(const std::vector& shape, float mean, float std, diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index bc73942c8ec1c..2f9c9a69a13f1 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -83,6 +83,8 @@ void RecvV2InferMeta(const int ring_id, DataType dtype, MetaTensor* out); +void SeedInferMeta(int seed, MetaTensor* out); + void TruncatedGaussianRandomInferMeta(const std::vector& shape, float mean, float std, diff --git a/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc b/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc new file mode 100644 index 0000000000000..4359534dea939 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/default_data_parallel.cc @@ -0,0 +1,164 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace phi { +namespace distributed { + +using phi::distributed::auto_parallel::str_join; + +////////////////// Utils Functions ////////////////// +std::vector GetDefaultDataParallelDimsmapping( + const int64_t batch_axis_dim, const int ndim) { + std::vector dims_mapping(ndim, -1); + dims_mapping[0] = batch_axis_dim; + return dims_mapping; +} + +////////////////// InferMeta(Contains SPMD) Functions ////////////////// + +SpmdInfo DefaultDataParallelSpmdInferForward( + const std::vector& ins, + const std::vector& outs) { + // step1: Build Einsum Notation for input tensor's batch axis + int64_t ninputs = ins.size(); + int64_t noutputs = outs.size(); + std::vector>> axes_sharding_info; + std::string batch_axis = "b"; + + for (int64_t i = 0; i < ninputs; ++i) { + axes_sharding_info.push_back( + {batch_axis, {ins[i]->dist_attr().dims_mapping()[0]}}); + } + + // Step2: Sharding Merge + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors(axes_sharding_info); + int64_t batch_axis_dim = axis_to_dim_map[batch_axis]; + + // Step3: Infer Output's Batch Axis Dims Mapping. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + int ndim = outs[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(ins[0]->dist_attr()); + std::vector dst_dims_maping = + GetDefaultDataParallelDimsmapping(batch_axis_dim, ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + output_dist_attrs.emplace_back(dist_attr_dst); + } + + // Step4: Merge and get Inputs' Batch Axis New Dims Mapping. 
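+  // e.g. if the merged batch_axis_dim is 0 and an input tensor is 3-D, its
+  // dst dims_mapping becomes [0, -1, -1]: the batch axis is sharded on mesh
+  // dim 0 and every other axis is replicated.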
+ std::vector dst_input_dist_attrs; + for (int64_t i = 0; i < ninputs; i++) { + int ndim = ins[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(ins[i]->dist_attr()); + std::vector dst_dims_maping = + GetDefaultDataParallelDimsmapping(batch_axis_dim, ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + dst_input_dist_attrs.emplace_back(dist_attr_dst); + } + + VLOG(4) << "DefaultDataParallelSpmd InferForward:"; + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(phi::vectorize(ins[i]->dims())) << "] " + << "src_dims_mapping: [" + << str_join(ins[i]->dist_attr().dims_mapping()) << "] " + << "dst_dims_mapping: [" + << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]"; + } + + for (int64_t i = 0; i < noutputs; i++) { + VLOG(4) << "Output" << std::to_string(i) << " shape: [" + << str_join(phi::vectorize(outs[i]->dims())) << "] " + << "dst_dims_mapping: [" + << str_join(output_dist_attrs[i].dims_mapping()) << "]"; + } + + return {dst_input_dist_attrs, output_dist_attrs}; +} +SpmdInfo DefaultDataParallelSpmdInferBackward( + const std::vector& ins, + const std::vector& outs) { + // step1: Build Einsum Notation for input tensor's batch axis + int64_t ninputs = ins.size(); + int64_t noutputs = outs.size(); + std::vector>> axes_sharding_info; + std::string batch_axis = "b"; + + for (int64_t i = 0; i < noutputs; ++i) { + axes_sharding_info.push_back( + {batch_axis, {outs[i]->dist_attr().dims_mapping()[0]}}); + } + + // Step2: Sharding Merge + std::unordered_map axis_to_dim_map = + ShardingMergeForTensors(axes_sharding_info); + int64_t batch_axis_dim = axis_to_dim_map[batch_axis]; + + // Step3: Infer Output's Batch Axis Dims Mapping. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + int ndim = outs[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(outs[i]->dist_attr()); + std::vector dst_dims_maping = + GetDefaultDataParallelDimsmapping(batch_axis_dim, ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + output_dist_attrs.emplace_back(dist_attr_dst); + } + + // Step4: Merge and get Inputs' Batch Axis New Dims Mapping. 
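+  // Note: batch_axis_dim here was merged from the outputs' dims_mapping[0]
+  // in Step1 above, so the inputs are re-sharded to match the sharding that
+  // the outputs already carry.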
+  std::vector<TensorDistAttr> dst_input_dist_attrs;
+  for (int64_t i = 0; i < ninputs; i++) {
+    int ndim = ins[i]->dims().size();
+    TensorDistAttr dist_attr_dst =
+        CopyTensorDistAttrForOutput(ins[i]->dist_attr());
+    std::vector<int64_t> dst_dims_maping =
+        GetDefaultDataParallelDimsmapping(batch_axis_dim, ndim);
+    dist_attr_dst.set_dims_mapping(dst_dims_maping);
+    dst_input_dist_attrs.emplace_back(dist_attr_dst);
+  }
+
+  VLOG(4) << "DefaultDataParallelSpmd InferBackward:";
+  for (int64_t i = 0; i < noutputs; i++) {
+    VLOG(4) << "Output" << std::to_string(i) << " shape: ["
+            << str_join(phi::vectorize(outs[i]->dims())) << "] "
+            << "src_dims_mapping: ["
+            << str_join(outs[i]->dist_attr().dims_mapping()) << "] "
+            << "dst_dims_mapping: ["
+            << str_join(output_dist_attrs[i].dims_mapping()) << "]";
+  }
+
+  for (int64_t i = 0; i < ninputs; i++) {
+    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
+            << str_join(phi::vectorize(ins[i]->dims())) << "] "
+            << "dst_dims_mapping: ["
+            << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]";
+  }
+
+  return {dst_input_dist_attrs, output_dist_attrs};
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/default_data_parallel.h b/paddle/phi/infermeta/spmd_rules/default_data_parallel.h
new file mode 100644
index 0000000000000..25fa3b65e50a0
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/default_data_parallel.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+/**
+ * A **hack** rule with a strong assumption that the first dimension of
+ * all the input and output tensors is the batch dimension (broadcast
+ * dimension); therefore, if any tensor's first dimension is sharded, the
+ * sharding is propagated to all the other tensors (on their first
+ * dimension). All the other axes of the tensors are set as unsharded (-1).
+ *
+ * This rule is used to support emerging ops for hybrid parallelism quickly;
+ * once there is a specific rule for an op, we should remove that op from
+ * this rule.
+ *
+ * Vectors of input tensors and output tensors are used as arguments (for
+ * both infer-forward & infer-backward) to support any kind of op.
+ */
+SpmdInfo DefaultDataParallelSpmdInferForward(
+    const std::vector<const DistMetaTensor*>& ins,
+    const std::vector<const DistMetaTensor*>& outs);
+
+SpmdInfo DefaultDataParallelSpmdInferBackward(
+    const std::vector<const DistMetaTensor*>& ins,
+    const std::vector<const DistMetaTensor*>& outs);
+
+// For phi api
+template <typename... Args>
+SpmdInfo PhiDefaultDataParallelSpmdInferForward(const Args&... args) {
+  return detail::PhiSpmdVariadicArgumentParser<
+             DefaultDataParallelSpmdInferForward>()
+      .apply(args...)
+      .InferForward();
+}
+
+template <typename... Args>
+SpmdInfo PhiDefaultDataParallelSpmdInferBackward(const Args&...
args) { + return detail::PhiSpmdVariadicArgumentParser< + DefaultDataParallelSpmdInferBackward>() + .apply(args...) + .InferBackward(); +} + +} // namespace distributed +} // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/matmul.cc b/paddle/phi/infermeta/spmd_rules/matmul.cc index 088f9ab16363a..a29f23b88038c 100644 --- a/paddle/phi/infermeta/spmd_rules/matmul.cc +++ b/paddle/phi/infermeta/spmd_rules/matmul.cc @@ -114,10 +114,10 @@ void FillMatmulOperandNotation(const int x_ndim, ////////////////// InferMeta(Contains SPMD) Functions ////////////////// -SpmdInfo MatmulSpmdInferForward(const DistMetaTensor& x, - const DistMetaTensor& y, - bool trans_x, - bool trans_y) { +SpmdInfo MatmulInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + bool trans_x, + bool trans_y) { // Step0: verify input args based on matmul logic auto x_shape = phi::vectorize(x.dims()); auto y_shape = phi::vectorize(y.dims()); @@ -221,11 +221,11 @@ SpmdInfo MatmulSpmdInferForward(const DistMetaTensor& x, return {{x_dist_attr_dst, y_dist_attr_dst}, {output_dist_attr_dst}}; } -SpmdInfo MatmulSpmdInferBackward(const DistMetaTensor& x, - const DistMetaTensor& y, - const DistMetaTensor& out, - bool trans_x, - bool trans_y) { +SpmdInfo MatmulInferSpmdReverse(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out, + bool trans_x, + bool trans_y) { auto out_shape = phi::vectorize(out.dims()); int out_ndim = out_shape.size(); diff --git a/paddle/phi/infermeta/spmd_rules/matmul.h b/paddle/phi/infermeta/spmd_rules/matmul.h index 64cfba26a7445..6bb36f4bd3d34 100644 --- a/paddle/phi/infermeta/spmd_rules/matmul.h +++ b/paddle/phi/infermeta/spmd_rules/matmul.h @@ -22,16 +22,16 @@ limitations under the License. */ namespace phi { namespace distributed { -SpmdInfo MatmulSpmdInferForward(const DistMetaTensor& x, +SpmdInfo MatmulInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + bool trans_x, + bool trans_y); + +SpmdInfo MatmulInferSpmdReverse(const DistMetaTensor& x, const DistMetaTensor& y, + const DistMetaTensor& out, bool trans_x, bool trans_y); -SpmdInfo MatmulSpmdInferBackward(const DistMetaTensor& x, - const DistMetaTensor& y, - const DistMetaTensor& out, - bool trans_x, - bool trans_y); - } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/replicated.cc b/paddle/phi/infermeta/spmd_rules/replicated.cc new file mode 100644 index 0000000000000..55aa9bf61e0e4 --- /dev/null +++ b/paddle/phi/infermeta/spmd_rules/replicated.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/spmd_rules/replicated.h" + +#include "glog/logging.h" + +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" +#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" + +namespace phi { +namespace distributed { + +using phi::distributed::auto_parallel::str_join; + +////////////////// Utils Functions ////////////////// +std::vector GetReplicatedDimsmapping(const int ndim) { + std::vector dims_mapping(ndim, -1); + return dims_mapping; +} + +////////////////// InferMeta(Contains SPMD) Functions ////////////////// +SpmdInfo ReplicatedSpmdInferForward( + const std::vector& ins, + const std::vector& outs) { + // step1: Build Einsum Notation for input tensor's batch axis + int64_t ninputs = ins.size(); + int64_t noutputs = outs.size(); + + // Step2: Unshard Output's Dims Mapping. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + VLOG(4) << outs[i]->dist_attr().to_string(); + VLOG(4) << outs[i]->dims().to_str(); + int ndim = outs[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(ins[0]->dist_attr()); + std::vector dst_dims_maping = GetReplicatedDimsmapping(ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + output_dist_attrs.emplace_back(dist_attr_dst); + } + + // Step3: Merge and get Inputs' Batch Axis New Dims Mapping. + std::vector dst_input_dist_attrs; + for (int64_t i = 0; i < ninputs; i++) { + int ndim = ins[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(ins[i]->dist_attr()); + std::vector dst_dims_maping = GetReplicatedDimsmapping(ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + dst_input_dist_attrs.emplace_back(dist_attr_dst); + } + + VLOG(4) << "ReplicatedSpmd InferForward:"; + for (int64_t i = 0; i < ninputs; i++) { + VLOG(4) << "Input" << std::to_string(i) << " shape: [" + << str_join(phi::vectorize(ins[i]->dims())) << "] " + << "src_dims_mapping: [" + << str_join(ins[i]->dist_attr().dims_mapping()) << "] " + << "dst_dims_mapping: [" + << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]"; + } + + for (int64_t i = 0; i < noutputs; i++) { + VLOG(4) << "Output" << std::to_string(i) << " shape: [" + << str_join(phi::vectorize(outs[i]->dims())) << "] " + << "dst_dims_mapping: [" + << str_join(output_dist_attrs[i].dims_mapping()) << "]"; + } + + return {dst_input_dist_attrs, output_dist_attrs}; +} + +SpmdInfo ReplicatedSpmdInferBackward( + const std::vector& ins, + const std::vector& outs) { + // step1: Build Einsum Notation for input tensor's batch axis + int64_t ninputs = ins.size(); + int64_t noutputs = outs.size(); + + // Step2: Unshard Output's Dims Mapping. + std::vector output_dist_attrs; + for (int64_t i = 0; i < noutputs; i++) { + int ndim = outs[i]->dims().size(); + TensorDistAttr dist_attr_dst = + CopyTensorDistAttrForOutput(outs[i]->dist_attr()); + std::vector dst_dims_maping = GetReplicatedDimsmapping(ndim); + dist_attr_dst.set_dims_mapping(dst_dims_maping); + output_dist_attrs.emplace_back(dist_attr_dst); + } + + // Step3: Merge and get Inputs' Batch Axis New Dims Mapping. 
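+  // For the replicated rule this is trivial: GetReplicatedDimsmapping yields
+  // an all -1 dims_mapping, e.g. [-1, -1] for a 2-D tensor, i.e. the input
+  // is fully replicated across the mesh.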
+  std::vector<TensorDistAttr> dst_input_dist_attrs;
+  for (int64_t i = 0; i < ninputs; i++) {
+    int ndim = ins[i]->dims().size();
+    TensorDistAttr dist_attr_dst =
+        CopyTensorDistAttrForOutput(ins[i]->dist_attr());
+    std::vector<int64_t> dst_dims_maping = GetReplicatedDimsmapping(ndim);
+    dist_attr_dst.set_dims_mapping(dst_dims_maping);
+    dst_input_dist_attrs.emplace_back(dist_attr_dst);
+  }
+
+  VLOG(4) << "ReplicatedSpmd InferBackward:";
+  for (int64_t i = 0; i < noutputs; i++) {
+    VLOG(4) << "Output" << std::to_string(i) << " shape: ["
+            << str_join(phi::vectorize(outs[i]->dims())) << "] "
+            << "src_dims_mapping: ["
+            << str_join(outs[i]->dist_attr().dims_mapping()) << "] "
+            << "dst_dims_mapping: ["
+            << str_join(output_dist_attrs[i].dims_mapping()) << "]";
+  }
+
+  for (int64_t i = 0; i < ninputs; i++) {
+    VLOG(4) << "Input" << std::to_string(i) << " shape: ["
+            << str_join(phi::vectorize(ins[i]->dims())) << "] "
+            << "dst_dims_mapping: ["
+            << str_join(dst_input_dist_attrs[i].dims_mapping()) << "]";
+  }
+
+  return {dst_input_dist_attrs, output_dist_attrs};
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/replicated.h b/paddle/phi/infermeta/spmd_rules/replicated.h
new file mode 100644
index 0000000000000..7b2ea330be3eb
--- /dev/null
+++ b/paddle/phi/infermeta/spmd_rules/replicated.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+#include "paddle/phi/infermeta/spmd_rules/utils.h"
+
+namespace phi {
+namespace distributed {
+/**
+ * A Bottom Line Rule that enforces the input(s) and output(s) of the Op to
+ * be replicated among the given mesh.
+ *
+ * This rule is used to support any op that has not been assigned a specific
+ * rule in auto parallel; once there is a specific rule for that op, the
+ * replicated rule no longer affects it.
+ *
+ * Vectors of input tensors and output tensors are used as arguments (for
+ * both infer-forward & infer-backward) to support any kind of op.
+ */
+SpmdInfo ReplicatedSpmdInferForward(
+    const std::vector<const DistMetaTensor*>& ins,
+    const std::vector<const DistMetaTensor*>& outs);
+
+SpmdInfo ReplicatedSpmdInferBackward(
+    const std::vector<const DistMetaTensor*>& ins,
+    const std::vector<const DistMetaTensor*>& outs);
+
+// For phi api
+template <typename... Args>
+SpmdInfo PhiReplicatedSpmdInferForward(const Args&... args) {
+  return detail::PhiSpmdVariadicArgumentParser<ReplicatedSpmdInferForward>()
+      .apply(args...)
+      .InferForward();
+}
+
+template <typename... Args>
+SpmdInfo PhiReplicatedSpmdInferBackward(const Args&... args) {
+  return detail::PhiSpmdVariadicArgumentParser<ReplicatedSpmdInferBackward>()
+      .apply(args...)
+      .InferBackward();
+}
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/rules.h b/paddle/phi/infermeta/spmd_rules/rules.h
index 5ec2f212ec65b..84eb9bd552f17 100644
--- a/paddle/phi/infermeta/spmd_rules/rules.h
+++ b/paddle/phi/infermeta/spmd_rules/rules.h
@@ -16,7 +16,9 @@ limitations under the License.
*/

 #include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
+#include "paddle/phi/infermeta/spmd_rules/default_data_parallel.h"
 #include "paddle/phi/infermeta/spmd_rules/matmul.h"
+#include "paddle/phi/infermeta/spmd_rules/replicated.h"

 /**
  * Design Notes:
@@ -40,8 +42,20 @@ namespace distributed {

 // matmul rule
 PD_REGISTER_SPMD_RULE(matmul,
-                      PD_INFER_SPMD(phi::distributed::MatmulSpmdInferForward),
-                      PD_INFER_SPMD(phi::distributed::MatmulSpmdInferBackward));
+                      PD_INFER_SPMD(phi::distributed::MatmulInferSpmd),
+                      PD_INFER_SPMD(phi::distributed::MatmulInferSpmdReverse));
+
+// default data parallel rule
+PD_REGISTER_SPMD_RULE(
+    unsqueeze,
+    PD_INFER_SPMD(phi::distributed::DefaultDataParallelSpmdInferForward),
+    PD_INFER_SPMD(phi::distributed::DefaultDataParallelSpmdInferBackward));
+
+// replicated rule /* for unittest */
+PD_REGISTER_SPMD_RULE(
+    replicated,
+    PD_INFER_SPMD(phi::distributed::ReplicatedSpmdInferForward),
+    PD_INFER_SPMD(phi::distributed::ReplicatedSpmdInferBackward));

 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/infermeta/spmd_rules/utils.cc b/paddle/phi/infermeta/spmd_rules/utils.cc
index 2252de98a78b3..e7a3dac52ac1d 100644
--- a/paddle/phi/infermeta/spmd_rules/utils.cc
+++ b/paddle/phi/infermeta/spmd_rules/utils.cc
@@ -137,6 +137,8 @@ TensorDistAttr CopyTensorDistAttrForOutput(
   new_dist_attr.set_batch_dim(src_dist_attr.batch_dim());
   new_dist_attr.set_dynamic_dims(src_dist_attr.dynamic_dims());
   // new_dist_attr.set_annotated(false); TODO unset field is false by default.
+  new_dist_attr.clean_partial_status();  // in partial stage I, partial is
+                                         // allowed to propagate
   return new_dist_attr;
 }
diff --git a/paddle/phi/infermeta/spmd_rules/utils.h b/paddle/phi/infermeta/spmd_rules/utils.h
index 5e3c3a3d0961c..e35b9cc792583 100644
--- a/paddle/phi/infermeta/spmd_rules/utils.h
+++ b/paddle/phi/infermeta/spmd_rules/utils.h
@@ -19,6 +19,10 @@ limitations under the License. */
 #include
 #include

+#include "paddle/phi/core/attribute.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
+#include "paddle/phi/core/distributed/type_defs.h"
+
 namespace phi {
 namespace distributed {
 class TensorDistAttr;
@@ -61,5 +65,74 @@ std::vector<int64_t> ResoluteOutputPartialDimension(
     const std::unordered_map<std::string, int64_t>& axis_to_dim_map,
     const std::string& tensor_axes);

+// Adaptor for variadic arguments
+template <typename Functor>
+struct ArgsIterator {
+  template <typename... Args>
+  inline Functor& apply() {
+    return self();
+  }
+
+  template <typename T, typename... Args>
+  inline Functor& apply(T&& arg, Args&&...
args) { + self()(std::forward(arg)); + if (self().short_circuit()) { + return self(); + } else { + return apply(std::forward(args)...); + } + } + + constexpr bool short_circuit() const { return false; } + + private: + inline Functor& self() { return *static_cast(this); } +}; + +using SpmdFn = SpmdInfo (*)(const std::vector& ins, + const std::vector& outs); + +namespace detail { +template +struct PhiSpmdVariadicArgumentParser + : public ArgsIterator> { + std::vector inputs; + std::vector outputs; + std::vector attrs; + + // deal with inputs + void operator()(const DistMetaTensor& x) { inputs.emplace_back(&x); } + + void operator()(const std::vector& x) { + for (auto t : x) { + inputs.emplace_back(t); + } + } + + template + void operator()(AttrType x) { + attrs.emplace_back(x); + } + + // deal with outputs + void operator()(DistMetaTensor* out) { outputs.emplace_back(out); } + + void operator()(std::vector out) { + for (auto t : out) { + outputs.emplace_back(t); + } + } + + SpmdInfo InferForward() { + return Fn(inputs, outputs); + // return Fn(inputs, outputs, attrs); + } + + SpmdInfo InferBackward() { + return Fn(inputs, outputs); + // return Fn(inputs, outputs, attrs); + } +}; +} // namespace detail } // namespace distributed } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index cc8df692a1267..cc2594c9a720a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -83,6 +83,16 @@ if(WITH_CUTLASS) ) endif() + execute_process( + COMMAND + ${CMAKE_COMMAND} -E make_directory + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen" + COMMAND ${PYTHON_EXECUTABLE} generic_mixed_gemm_kernelLauncher.py + --cuda_arch "${NVCC_ARCH_BIN}" + WORKING_DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}/fusion/cutlass/cutlass_kernels/fpA_intB_gemm" + ) + file( GLOB cutlass_cu RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" @@ -90,7 +100,9 @@ if(WITH_CUTLASS) "fusion/cutlass/conv2d/*.cu" "fusion/cutlass/*.cu" "fusion/cutlass/memory_efficient_attention/autogen/impl/*.cu" - "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu") + "fusion/cutlass/memory_efficient_attention/autogen_variable/impl/*.cu" + "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/*.cu" + "fusion/cutlass/cutlass_kernels/fpA_intB_gemm/*.cu") list(APPEND kernel_cu ${cutlass_cu}) endif() diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index a6a37272840af..438991ef6fd62 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -106,7 +106,7 @@ class AutoTuneBase { float min_time = std::numeric_limits::max(); // Time cost test estabulished in default stream. 
- for (int i = 0; i < kernels_.size(); ++i) { + for (size_t i = 0; i < kernels_.size(); ++i) { auto time = RunAndMeasureKernel(ctx, i, args...); if (time < min_time) { min_time = time; diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc index b3203332ec7d1..d3cf1cbcb34c1 100644 --- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -283,14 +283,14 @@ PD_REGISTER_KERNEL( PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cos_grad, CosGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(tan_grad, TanGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cosh_grad, CoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(atanh_grad, AtanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(tanh_grad, TanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hardtanh_grad, HardTanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) @@ -340,7 +340,9 @@ PD_REGISTER_KERNEL(exp_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(expm1_grad, CPU, @@ -348,7 +350,9 @@ PD_REGISTER_KERNEL(expm1_grad, phi::Expm1GradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL( logit_grad, CPU, ALL_LAYOUT, phi::LogitGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc index 8a554470dea39..66480018a5273 100644 --- a/paddle/phi/kernels/cpu/activation_kernel.cc +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -179,14 +179,14 @@ PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sin, SinKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cos, CosKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(tan, TanKernel) -PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) -PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) -PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) -PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) -PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) -PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) -PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) -PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(asin, AsinKernel) 
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(atanh, AtanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(hardtanh, HardTanhKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) @@ -211,7 +211,9 @@ PD_REGISTER_KERNEL(exp, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(expm1, CPU, @@ -221,7 +223,9 @@ PD_REGISTER_KERNEL(expm1, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {} PD_REGISTER_KERNEL( diff --git a/paddle/phi/kernels/cpu/compare_kernel.cc b/paddle/phi/kernels/cpu/compare_kernel.cc index ef7987975a12e..24b4615daa58c 100644 --- a/paddle/phi/kernels/cpu/compare_kernel.cc +++ b/paddle/phi/kernels/cpu/compare_kernel.cc @@ -30,22 +30,34 @@ inline void CompareKernelImpl(const Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - if (!out->IsSharedWith(x)) { - ctx.template Alloc(out); - if (x.dims().size() >= y.dims().size()) { - funcs::ElementwiseCompute( - ctx, x, y, Functor(), out, axis); - } else { - funcs::ElementwiseCompute( - ctx, x, y, InverseFunctor(), out, axis); - } + ctx.template Alloc(out); + if (x.dims().size() >= y.dims().size()) { + funcs::ElementwiseCompute( + ctx, x, y, Functor(), out, axis); } else { - if (x.dims().size() >= y.dims().size()) { - funcs::ElementwiseCompute(ctx, x, y, Functor(), out, axis); - } else { - funcs::ElementwiseCompute( - ctx, x, y, InverseFunctor(), out, axis); - } + funcs::ElementwiseCompute( + ctx, x, y, InverseFunctor(), out, axis); + } +} + +template +inline void InplaceCompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + auto x_origin = x; + out->set_type(phi::DataType::BOOL); + ctx.template Alloc(out); + if (x_origin.dims().size() >= y.dims().size()) { + funcs::ElementwiseCompute( + ctx, x_origin, y, Functor(), out, axis); + } else { + funcs::ElementwiseCompute( + ctx, x_origin, y, InverseFunctor(), out, axis); } } @@ -92,19 +104,21 @@ PD_REGISTER_KERNEL(equal_all, kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } -#define PD_REGISTER_COMPARE_KERNEL(name, func) \ - PD_REGISTER_KERNEL(name, \ - CPU, \ - ALL_LAYOUT, \ - phi::func##Kernel, \ - bool, \ - int16_t, \ - int, \ - int64_t, \ - float, \ - double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16) {} +#define PD_REGISTER_COMPARE_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + CPU, \ + ALL_LAYOUT, \ + phi::func##Kernel, \ + bool, \ + int16_t, \ + int, \ + int64_t, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) { \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ + } PD_REGISTER_COMPARE_KERNEL(less_than, LessThan) PD_REGISTER_COMPARE_KERNEL(less_equal, LessEqual) PD_REGISTER_COMPARE_KERNEL(greater_than, GreaterThan) diff --git a/paddle/phi/kernels/cpu/fold_grad_kernel.cc b/paddle/phi/kernels/cpu/fold_grad_kernel.cc index 0c3f1dda03e5e..a56b0aa054571 100644 --- a/paddle/phi/kernels/cpu/fold_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/fold_grad_kernel.cc @@ 
-18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fold_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - fold_grad, CPU, ALL_LAYOUT, phi::FoldGradKernel, float, double) {} +PD_REGISTER_KERNEL(fold_grad, + CPU, + ALL_LAYOUT, + phi::FoldGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/fold_kernel.cc b/paddle/phi/kernels/cpu/fold_kernel.cc index e22ac4c771ed9..df6cf5652c992 100644 --- a/paddle/phi/kernels/cpu/fold_kernel.cc +++ b/paddle/phi/kernels/cpu/fold_kernel.cc @@ -18,4 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fold_kernel_impl.h" -PD_REGISTER_KERNEL(fold, CPU, ALL_LAYOUT, phi::FoldKernel, float, double) {} +PD_REGISTER_KERNEL(fold, + CPU, + ALL_LAYOUT, + phi::FoldKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 06dff8428533f..ef657a161c4e5 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -24,20 +24,40 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Logical##type##Functor binary_func; \ - if (out->IsSharedWith(x)) { \ - funcs::ElementwiseCompute, T, T>( \ - dev_ctx, x, y, binary_func, out); \ - } else { \ - funcs::ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, binary_func, out); \ - } \ +template +void LogicalKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + Functor binary_func; + funcs::ElementwiseCompute(dev_ctx, x, y, binary_func, out); +} + +template +void InplaceLogicalKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + Functor binary_func; + auto x_origin = x; + out->set_type(phi::DataType::BOOL); + funcs::ElementwiseCompute( + dev_ctx, x_origin, y, binary_func, out); +} + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + if (out->IsSharedWith(x)) { \ + InplaceLogicalKernelImpl>( \ + dev_ctx, x, y, out); \ + } else { \ + LogicalKernelImpl>( \ + dev_ctx, x, y, out); \ + } \ } DEFINE_LOGICAL_BINARY_KERNEL(And) @@ -52,15 +72,18 @@ void LogicalNotKernel(const Context& dev_ctx, funcs::LogicalNotFunctor unary_func; phi::Transform trans; - if (!out->IsSharedWith(x)) { + if (out->IsSharedWith(x)) { + auto x_origin = x; + out->set_type(phi::DataType::BOOL); auto* out_ptr = dev_ctx.template Alloc(out); - trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); - } else { trans(dev_ctx, - x.data(), - x.data() + x.numel(), - reinterpret_cast(out->data()), + x_origin.data(), + x_origin.data() + x_origin.numel(), + out_ptr, unary_func); + } else { + auto* out_ptr = dev_ctx.template Alloc(out); + trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); } } @@ -79,7 +102,9 @@ void LogicalNotKernel(const Context& dev_ctx, int8_t, \ phi::dtype::complex, \ phi::dtype::complex, \ - int16_t) {} + int16_t) { \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ + } REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) diff --git a/paddle/phi/kernels/funcs/activation_functor.h 
b/paddle/phi/kernels/funcs/activation_functor.h index 6295ca14aa3ad..6b77c31d38d4a 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -892,6 +892,22 @@ struct SinhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct SinhGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = + dout * x.unaryExpr(Cosh>()).unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // cosh'(x) = sinh(x) template struct CoshGradFunctor : public BaseActivationFunctor { @@ -907,6 +923,22 @@ struct CoshGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CoshGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = + dout * x.unaryExpr(Sinh>()).unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct Acos { HOSTDEVICE T operator()(const T& val) const { return acos(val); } @@ -944,6 +976,24 @@ struct AcosGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AcosGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = + -dout * (static_cast>(1) / + (static_cast>(1) - x.square()).sqrt()) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct Asin { HOSTDEVICE T operator()(const T& val) const { return asin(val); } @@ -981,6 +1031,23 @@ struct AsinGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AsinGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast>(1) / + (static_cast>(1) - x.square()).sqrt()) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct Atan { HOSTDEVICE T operator()(const T& val) const { return atan(val); } @@ -1017,6 +1084,23 @@ struct AtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AtanGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast>(1) / + (static_cast>(1) + x.square())) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct LogitGradFunctor { template @@ -1066,6 +1150,23 @@ struct AcoshGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AcoshGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = + dout * (static_cast>(1) / + (-static_cast>(1) + x.square()).sqrt()) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; template struct Asinh { HOSTDEVICE T operator()(const T& val) const { return asinh(val); } @@ -1103,6 +1204,23 @@ struct AsinhGradFunctor : public 
BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AsinhGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast>(1) / + (x.square() + static_cast>(1)).sqrt()) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct Atanh { HOSTDEVICE T operator()(const T& val) const { return atanh(val); } @@ -1139,6 +1257,22 @@ struct AtanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct AtanhGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast>(1) / + (static_cast>(1) - x.square())) + .unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; // exp functor // exp(x) = e^x template @@ -1167,6 +1301,33 @@ struct ExpGradFunctor : public BaseActivationFunctor { } }; +template +struct ExpGradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out.unaryExpr(Conj()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct Expm1 {}; + +template +struct Expm1> { + HOSTDEVICE ComplexType operator()(const ComplexType& val) const { + return exp(val) - static_cast>(1); + } +}; + // expm1(x) = e^x - 1 template struct Expm1Functor : public BaseActivationFunctor { @@ -1178,6 +1339,15 @@ struct Expm1Functor : public BaseActivationFunctor { } }; +template +struct Expm1Functor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Expm1>()).eval(); + } +}; + template struct Expm1GradFunctor : public BaseActivationFunctor { template { } }; +template +struct Expm1GradFunctor> + : public BaseActivationFunctor> { + template + void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out.unaryExpr(Conj()) + dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // relu(x) = max(x, 0) template struct ReluCPUFunctor : public BaseActivationFunctor { @@ -2831,6 +3016,16 @@ struct CudaExpFunctor : public BaseActivationFunctor { } }; +template +struct CudaExpFunctor> + : public BaseActivationFunctor> { + // exp(x) = exp(x) + __device__ __forceinline__ ComplexType operator()( + const ComplexType x) const { + return static_cast>(exp(x)); + } +}; + template struct CudaSeluFunctor : public BaseActivationFunctor { typename BaseActivationFunctor::AttrPair GetAttrs() { @@ -2907,6 +3102,20 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { } }; +template +struct CudaExpGradFunctor> + : public BaseActivationFunctor> { + // dx = dout * exp(x) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType out) const { + return static_cast>(dout * conj(out)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaReciprocalFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -2947,6 +3156,15 @@ struct CudaExpm1Functor : public BaseActivationFunctor { } }; +template +struct CudaExpm1Functor> + : public BaseActivationFunctor> { + __device__ 
__forceinline__ ComplexType operator()( + const ComplexType x) const { + return static_cast>(Expm1>()(x)); + } +}; + template struct CudaExpm1GradFunctor : public BaseActivationFunctor { // dx = dout * out @@ -2959,6 +3177,20 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { } }; +template +struct CudaExpm1GradFunctor> + : public BaseActivationFunctor> { + // dx = dout * exp(x) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType out) const { + return static_cast>(dout * conj(out) + dout); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + template struct CudaSinFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3062,6 +3294,20 @@ struct CudaAsinGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAsinGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + + // dx = dout / sqrt(1 - x^2) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout / conj(sqrt(one - x * x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAcosFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3089,6 +3335,20 @@ struct CudaAcosGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAcosGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + + // dx = -dout / sqrt(1 - x^2) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(-dout / conj(sqrt(one - x * x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaCoshFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3115,6 +3375,18 @@ struct CudaCoshGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaCoshGradFunctor> + : public BaseActivationFunctor> { + // dx = dout * sinh(x) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout * conj(sinh(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaSinhFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3141,6 +3413,18 @@ struct CudaSinhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaSinhGradFunctor> + : public BaseActivationFunctor> { + // dx = dout * cosh(x) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout * conj(cosh(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAcoshFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3167,6 +3451,19 @@ struct CudaAcoshGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return 
ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAcoshGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + // dx = dout * 1 / sqrt(x^2 - 1) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout * conj(one / sqrt(x * x - one))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAsinhFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3194,6 +3491,20 @@ struct CudaAsinhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAsinhGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + + // dx = dout * 1/sqrt(x^2 + 1) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout * conj(one / sqrt(x * x + one))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaAtanhFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3310,6 +3621,19 @@ struct CudaAtanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAtanhGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + // dx = dout * 1/(1- x^2) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return static_cast>(dout * conj(one / (one - x * x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaSqrtFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; @@ -3387,6 +3711,20 @@ struct CudaAtanGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; +template +struct CudaAtanGradFunctor> + : public BaseActivationFunctor> { + ComplexType one = static_cast>(1.0f); + + // dx = dout / (1 + x^2) + __device__ __forceinline__ ComplexType operator()( + const ComplexType dout, const ComplexType x) const { + return dout / conj(one + x * x); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + template struct CudaTanhFunctor : public BaseActivationFunctor { using MPType = typename phi::dtype::MPTypeTrait::Type; diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h index c931b90a92a70..558e7dc999cf8 100644 --- a/paddle/phi/kernels/funcs/aligned_vector.h +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -13,15 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" + #if defined(__xpu__) #define CHAR_BIT 8 #endif namespace phi { +template +struct NeedVectorized { + static constexpr bool value = sizeof(T) <= sizeof(float); +}; + // Aligned vector generates vectorized load/store on CUDA. 
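+// A short illustration of the sizing logic here (assumed values): with the
+// 128-bit max load width used in GetVectorizedSize below, valid_vec_size for
+// float is 4, and the pointer alignment then picks the widest usable vector:
+//   GetVectorizedSize<float>(p);   // 4 if p is 16-byte aligned
+//   GetVectorizedSize<float>(q);   // 2 if q is only 8-byte aligned
+//   GetVectorizedSize<double>(r);  // 1: NeedVectorized<double>::value is
+//                                  // false, since sizeof(double) > sizeof(float)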
template struct alignas(sizeof(T) * Size) AlignedVector { @@ -53,6 +61,9 @@ HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { */ template int GetVectorizedSize(const T* pointer) { + if (!NeedVectorized::value) { + return 1; + } constexpr int max_load_bits = 128; constexpr int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); uint64_t address = reinterpret_cast(pointer); @@ -76,4 +87,28 @@ int GetVectorizedSize(const T* pointer) { } } +static int GetVectorizedSize(const DenseTensor* tensor) { + int element_size = phi::SizeOf(tensor->dtype()); + if (element_size > sizeof(float)) { + return 1; + } + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / element_size; + uint64_t address = reinterpret_cast(tensor->data()); + + // Currently, decide to deal with no more than 4 data once while adopting + // vectorization load/store, if performance test shows that dealing with + // 8 data once in vectorization load/store does get optimized, code below + // can begin with : + // if (address % (element_size * 8) == 0) { + // return std::min(8, valid_vec_size); + if (address % (element_size * 4) == 0) { + return std::min(4, valid_vec_size); + } else if (address % (element_size * 2) == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + } // namespace phi diff --git a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h index b46608e91b74a..0fca9de54b2ba 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h @@ -182,6 +182,8 @@ template cublasComputeType_t GetCudaComputeType() { if (std::is_same::value) { return CUBLAS_COMPUTE_64F; + } else if (std::is_same::value) { + return CUBLAS_COMPUTE_32I; } else { return CUBLAS_COMPUTE_32F; } @@ -206,6 +208,17 @@ struct MatmulDescriptor { is_cached = obj.is_cached; } + MatmulDescriptor& operator=(const MatmulDescriptor& obj) { + algo = obj.algo; + x_desc = obj.x_desc; + y_desc = obj.y_desc; + op_desc = obj.op_desc; + out_desc = obj.out_desc; + is_cached = obj.is_cached; + + return *this; + } + ~MatmulDescriptor() PADDLE_MAY_THROW { if (!is_cached) { PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); @@ -237,9 +250,15 @@ struct MatmulDescriptor { bool grad_for_dx = true) { using MT = typename phi::dtype::MPTypeTrait::Type; cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); + cudaDataType_t out_mat_type = phi::backends::gpu::ToCudaDataType(); cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); cublasComputeType_t compute_type = GetCudaComputeType(); + if (std::is_same::value) { + out_mat_type = phi::backends::gpu::ToCudaDataType(); + scale_type = phi::backends::gpu::ToCudaDataType(); + } + // Create operation descriptor; see cublasLtMatmulDescAttributes_t for // details about defaults; just need to set the transforms for A and B PADDLE_ENFORCE_GPU_SUCCESS( @@ -249,7 +268,7 @@ struct MatmulDescriptor { // Create matrix descriptors CreateMatrixLayout(&x_desc, mat_type, M, K, trans_x); CreateMatrixLayout(&y_desc, mat_type, K, N, trans_y); - CreateMatrixLayout(&out_desc, mat_type, M, N, false); + CreateMatrixLayout(&out_desc, out_mat_type, M, N, false); // Config batch size and stride. 
 if (batch_size > 1) {
@@ -625,6 +644,197 @@ struct CublasLtBase {
   }
 };
 
+template <>
+struct CublasLtBase<int8_t, int32_t> {
+ public:
+  static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx,
+                                                    size_t workspace_size) {
+    return phi::memory_utils::Alloc(
+        ctx.GetPlace(),
+        workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
+  }
+
+  static void RunImpl(const phi::GPUContext& ctx,
+                      MatmulDescriptor* desc,
+                      const size_t sub_key,
+                      const int8_t* x_ptr,
+                      const int8_t* y_ptr,
+                      int32_t* out_ptr,
+                      phi::funcs::MatmulPlanner* planner) {
+    int32_t alpha = 1;
+    int32_t beta =
+        planner->UseAddTo() ? static_cast<int32_t>(1) : static_cast<int32_t>(0);
+    cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle();
+
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
+    phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size);
+
+    if (planner != nullptr) {
+      if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() &&
+          (!desc->is_cached)) {
+        SearchBestAlgo(ctx,
+                       cublaslt_handle,
+                       desc,
+                       static_cast<const void*>(&alpha),
+                       static_cast<const void*>(&beta),
+                       y_ptr,
+                       x_ptr,
+                       out_ptr,
+                       workspace->ptr(),
+                       workspace_size);
+        MatmulDescriptor* best_desc = new MatmulDescriptor(*desc);
+        VLOG(6) << best_desc->GetDescResultString(
+            "[Searched CublasltDescriptor] ");
+
+        auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul();
+        cache.SetSubKey(sub_key, reinterpret_cast(best_desc));
+      }
+    }
+
+    VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] ");
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        dynload::cublasLtMatmul(cublaslt_handle,
+                                desc->op_desc,
+                                static_cast<const void*>(&alpha),
+                                y_ptr,
+                                desc->y_desc,
+                                x_ptr,
+                                desc->x_desc,
+                                static_cast<const void*>(&beta),
+                                out_ptr,
+                                desc->out_desc,
+                                out_ptr,
+                                desc->out_desc,
+                                desc->algo,
+                                workspace->ptr(),
+                                workspace_size,
+                                ctx.stream()));
+  }
+
+  static void SearchBestAlgo(const phi::GPUContext& ctx,
+                             const cublasLtHandle_t& lt_handle,
+                             MatmulDescriptor* desc,
+                             const void* alpha,
+                             const void* beta,
+                             const void* y_data,
+                             const void* x_data,
+                             void* out_data,
+                             void* workspace_ptr,
+                             size_t workspace_size) {
+    cublasLtMatmulPreference_t preference;
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        dynload::cublasLtMatmulPreferenceCreate(&preference));
+    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute(
+        preference,
+        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
+        &workspace_size,
+        sizeof(workspace_size)));
+
+    int returned_results = 0;
+    constexpr int requested_algo_count = 10;
+    std::vector<cublasLtMatmulHeuristicResult_t> heuristic_results(
+        requested_algo_count);
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle,
+                                                desc->op_desc,
+                                                desc->y_desc,
+                                                desc->x_desc,
+                                                desc->out_desc,
+                                                desc->out_desc,
+                                                preference,
+                                                requested_algo_count,
+                                                heuristic_results.data(),
+                                                &returned_results));
+    PADDLE_ENFORCE_GT(returned_results,
+                      0,
+                      phi::errors::Unavailable("No GEMM algorithm available."));
+    int best_algo_idx = -1;
+    if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) {
+      best_algo_idx = 0;
+    } else {
+      float min_time_cost = std::numeric_limits<float>::max();
+      for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) {
+        float cur_time_cost =
+            RunAndMeasureAlgo(ctx,
+                              lt_handle,
+                              desc,
+                              alpha,
+                              beta,
+                              y_data,
+                              x_data,
+                              out_data,
+                              workspace_ptr,
+                              workspace_size,
+                              &(heuristic_results[algo_idx].algo));
+        VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx
+                << "] time: " << cur_time_cost << " s";
+
+        if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) ||
+            (cur_time_cost < min_time_cost)) {
+          best_algo_idx = algo_idx;
+          min_time_cost = cur_time_cost;
+        }
+      }
+    }
+    VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx;
+
+    cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo();
+    *best_algo = heuristic_results[best_algo_idx].algo;
+    PADDLE_ENFORCE_GPU_SUCCESS(
+        dynload::cublasLtMatmulPreferenceDestroy(preference));
+  }
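+  // Timing note (illustrative): RunAndMeasureAlgo below runs each candidate
+  // FLAGS_cublaslt_exhaustive_search_times times, excludes the first
+  // (warm-up) run, and reports the average of the remaining runs:
+  //   time_cost = (t_2 + ... + t_repeats) / (repeats - 1)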
+
+  static float RunAndMeasureAlgo(const phi::GPUContext& ctx,
+                                 const cublasLtHandle_t& lt_handle,
+                                 MatmulDescriptor* desc,
+                                 const void* alpha,
+                                 const void* beta,
+                                 const void* y_data,
+                                 const void* x_data,
+                                 void* out_data,
+                                 void* workspace_ptr,
+                                 size_t workspace_size,
+                                 cublasLtMatmulAlgo_t* algo) {
+    int repeats = FLAGS_cublaslt_exhaustive_search_times;
+    if (repeats <= 0) {
+      return std::numeric_limits<float>::max();
+    }
+
+    phi::GpuTimer timer;
+    float time_cost = 0.f;
+    const auto& stream = ctx.stream();
+
+    for (int i = 0; i < repeats; ++i) {
+      timer.Start(stream);
+      PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle,
+                                                         desc->op_desc,
+                                                         alpha,
+                                                         y_data,
+                                                         desc->y_desc,
+                                                         x_data,
+                                                         desc->x_desc,
+                                                         beta,
+                                                         out_data,
+                                                         desc->out_desc,
+                                                         out_data,
+                                                         desc->out_desc,
+                                                         algo,
+                                                         workspace_ptr,
+                                                         workspace_size,
+                                                         stream));
+      timer.Stop(stream);
+      ctx.Wait();
+      auto time = timer.ElapsedTime();
+      if (i > 0) {
+        // Exclude the warmup runtime.
+        time_cost += time;
+      }
+    }
+    return (time_cost / (repeats - 1));
+  }
+};
+
 // To judge if desc is cached or not.
 template
+    if (std::is_same<T, int8_t>::value) {
+      if (!trans_x && !trans_y) {
+        PADDLE_ENFORCE_EQ(
+            (N % 4 == 0 || N == 1),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size N used in int8 matmul must be 1 or a "
+                "multiple of 4, but it is currently %d.",
+                N));
+        PADDLE_ENFORCE_EQ(
+            (K % 4 == 0),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size K used in int8 matmul must be a "
+                "multiple of 4, but it is currently %d.",
+                K));
+      } else if (!trans_x && trans_y) {
+        PADDLE_ENFORCE_EQ(
+            (K % 4 == 0),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size K used in int8 matmul must be a "
+                "multiple of 4, but it is currently %d.",
+                K));
+      } else if (trans_x && !trans_y) {
+        PADDLE_ENFORCE_EQ(
+            (M % 4 == 0 || M == 1),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size M used in int8 matmul must be 1 or a "
+                "multiple of 4, but it is currently %d.",
+                M));
+        PADDLE_ENFORCE_EQ(
+            (N % 4 == 0 || N == 1),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size N used in int8 matmul must be 1 or a "
+                "multiple of 4, but it is currently %d.",
+                N));
+      } else {
+        PADDLE_ENFORCE_EQ(
+            (M % 4 == 0 || M == 1),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size M used in int8 matmul must be 1 or a "
+                "multiple of 4, but it is currently %d.",
+                M));
+        PADDLE_ENFORCE_EQ(
+            (K % 4 == 0),
+            true,
+            phi::errors::InvalidArgument(
+                "The dimension size K used in int8 matmul must be a "
+                "multiple of 4, but it is currently %d.",
+                K));
+      }
+    }
+
   if (planner != nullptr) {
     sub_key = planner->GenSubKey();
   }
@@ -680,13 +954,13 @@
 };
 
 // For matmul with kernels autotune
-template <typename T>
-struct MatmulWithCublasLt : public CublasLtBase<T> {
+template <typename T, typename OutT = T>
+struct MatmulWithCublasLt : public CublasLtBase<T, OutT> {
  public:
   static void Run(const phi::GPUContext& ctx,
                   const T*
x_data, const T* y_data, - T* out_data, + OutT* out_data, const int64_t M, const int64_t N, const int64_t K, @@ -695,14 +969,14 @@ struct MatmulWithCublasLt : public CublasLtBase { phi::funcs::MatmulPlanner* planner = nullptr) { auto setter = DescriptorSetter( planner, M, N, K, trans_x, trans_y); - CublasLtBase::RunImpl( + CublasLtBase::RunImpl( ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); } static void RunWithBatch(const phi::GPUContext& ctx, const T* x_data, const T* y_data, - T* out_data, + OutT* out_data, const int64_t M, const int64_t N, const int64_t K, @@ -723,14 +997,14 @@ struct MatmulWithCublasLt : public CublasLtBase { stride_x, stride_y, stride_out); - CublasLtBase::RunImpl( + CublasLtBase::RunImpl( ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); } static void RunWithBatch(const phi::GPUContext& ctx, const T** x_data, const T** y_data, - T** out_data, + OutT** out_data, const int64_t M, const int64_t N, const int64_t K, diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e754ce3bf49e4..2ba3271d2c7df 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -29,74 +29,86 @@ namespace funcs { #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) -enum BroadcastLoadType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; - -template -struct UseBroadcast { - template - static HOSTDEVICE void Apply( - const std::vector &ins_tensor, - const ArgsT &args, - int64_t numel, - Array1 *ins_data, - Array2 *use_broadcast, - int *broadcast_num, - bool *all_elementwise) { - (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); - bool is_same_dim = ins_tensor[Index]->numel() == numel; - if (is_same_dim) { - (*use_broadcast)[Index] = false; - } else { - (*use_broadcast)[Index] = true; - (*broadcast_num)++; - } - *all_elementwise &= is_same_dim; - } -}; +enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; -template -struct LoaderTypeClassifier { - public: +template +struct BroadcastTypeClassifier { int64_t numel{0}; - int vec_size{4}; - int broadcast_num{0}; - bool all_elementwise{true}; - phi::Array use_broadcast; + int broadcast_num{0}; // Not used for XPU + bool all_elementwise{true}; // Not used for XPU + phi::Array use_broadcast; // Not used for XPU + phi::Array configs; phi::Array ins_data; + phi::Array<_ptr_ OutT *, NumOuts> outs_data; + + BroadcastTypeClassifier() {} + BroadcastTypeClassifier(const std::vector &ins, + std::vector *outs, + int axis) { + numel = (*outs)[0]->numel(); + +#ifndef PADDLE_WITH_XPU_KP + for (size_t i = 0; i < ins.size(); ++i) { + bool is_same_dim = ins[i]->numel() == numel; + if (is_same_dim) { + use_broadcast[i] = false; + } else { + use_broadcast[i] = true; + broadcast_num++; + } + all_elementwise &= is_same_dim; + } +#endif + + InitBroadcastConfigs(ins, outs, axis); - LoaderTypeClassifier() {} - LoaderTypeClassifier(const std::vector &ins, - std::vector *outs) { using Traits = phi::funcs::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; ArgsT arg; - uint64_t out_addr = reinterpret_cast((*outs)[0]->data()); - - UnrollerWithoutVecSize::step(ins, arg, &vec_size); - - for (auto i = 1; i < outs->size(); ++i) { - PADDLE_ENFORCE_EQ( - (*outs)[i]->dims(), - (*outs)[0]->dims(), - phi::errors::InvalidArgument( - "The shape of each output tensor shall be identical yet, but " - "%d-th output tensor`s shape is not.", - i)); - out_addr = - (out_addr | 
reinterpret_cast((*outs)[i]->data())); + UnrollerWithoutVecSize::step(ins, arg, &ins_data); + for (int i = 0; i < NumOuts; ++i) { + outs_data[i] = (*outs)[i]->data(); } + } - vec_size = std::min( - vec_size, - phi::GetVectorizedSize(reinterpret_cast(out_addr))); - numel = (*outs)[0]->numel(); - UnrollerWithoutVecSize::step(ins, - arg, - numel, - &ins_data, - &use_broadcast, - &broadcast_num, - &all_elementwise); + void InitBroadcastConfigs(const std::vector &ins, + std::vector *outs, + int axis) { +#ifdef PADDLE_WITH_XPU_KP + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(6)) { + DimsSimplifiedLogger::Log( + ins, outs, dims_simplifier, "BroadcastKernel"); + } + configs[0] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[0], + dims_simplifier.in_dims[1], + dims_simplifier.rank); + configs[1] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[1], + dims_simplifier.in_dims[0], + dims_simplifier.rank); +#else + if (!all_elementwise) { + const auto dims_simplifier = + BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis); + if (VLOG_IS_ON(6)) { + DimsSimplifiedLogger::Log( + ins, outs, dims_simplifier, "BroadcastKernel"); + } + for (int i = 0; i < Arity; ++i) { + // if data shape is[m, n], then you should set data_dim = {n, m} + // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3} + // if (ins[i]->numel() != (*outs)[0]->numel()) { + if (ins[i]->numel()) { + configs[i] = kps::details::BroadcastConfig(dims_simplifier.out_dims, + dims_simplifier.in_dims[i], + dims_simplifier.rank); + } + } + } +#endif } }; @@ -425,18 +437,10 @@ __global__ void VectorizedBroadcastKernel( template void LaunchBroadcastKernel( const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func, - const phi::Array &configs, - const LoaderTypeClassifier &loader_classifier) { - phi::Array<_ptr_ OutT *, NumOuts> outs_data; - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); - } - + const BroadcastTypeClassifier &classifier, + Functor func) { #ifdef PADDLE_WITH_XPU_KP - int numel = (*outs)[0]->numel(); + int numel = classifier.numel; const int threads = 64; const int blocks = 8; int read_lens = configs[0].buf_len; @@ -445,17 +449,17 @@ void LaunchBroadcastKernel( int tail_tid = numel % (read_lens * threads); VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, read_lens, func); #else - const auto &numel = loader_classifier.numel; + const auto &numel = classifier.numel; auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); auto stream = ctx.stream(); @@ -464,41 +468,41 @@ void LaunchBroadcastKernel( int main_offset = (numel / (VecSize * threads)) * VecSize * threads; int tail_tid = numel % (VecSize * threads); - if (loader_classifier.all_elementwise) { + if (classifier.all_elementwise) { VectorizedBroadcastKernel - <<>>(loader_classifier.ins_data, - outs_data, - loader_classifier.use_broadcast, + <<>>(classifier.ins_data, + classifier.outs_data, + classifier.use_broadcast, numel, - configs, + classifier.configs, main_offset, tail_tid, VecSize, func); - } else if (loader_classifier.broadcast_num > (Arity >> 1)) { - constexpr BroadcastLoadType type_ = (Arity > 1) ? 
kBroadcast : kMixed;
+  } else if (classifier.broadcast_num > (Arity >> 1)) {
+    constexpr BroadcastType type_ = (Arity > 1) ? kBroadcast : kMixed;
     VectorizedBroadcastKernel
-        <<>>(loader_classifier.ins_data,
-                 outs_data,
-                 loader_classifier.use_broadcast,
+        <<>>(classifier.ins_data,
+                 classifier.outs_data,
+                 classifier.use_broadcast,
                  numel,
-                 configs,
+                 classifier.configs,
                  main_offset,
                  tail_tid,
                  VecSize,
                  func);
   } else {
     VectorizedBroadcastKernel
-        <<>>(loader_classifier.ins_data,
-                 outs_data,
-                 loader_classifier.use_broadcast,
+        <<>>(classifier.ins_data,
+                 classifier.outs_data,
+                 classifier.use_broadcast,
                  numel,
-                 configs,
+                 classifier.configs,
                  main_offset,
                  tail_tid,
                  VecSize,
@@ -632,9 +636,13 @@ struct LaunchBroadcastKernelWithInt64IndexHelper
                                  *outs,
                                  int axis,
                                  Functor functor) {
+    using Traits = phi::funcs::FunctionTraits<Functor>;
+    using ArgsT = typename Traits::ArgsTuple;
+    ArgsT arg;
     phi::Array::kValue> ins_ptrs;
-    UnrollerWithoutVecSize<InputSetter>::step(ins, &ins_ptrs);
+    UnrollerWithoutVecSize<InputSetter>::step(ins, arg, &ins_ptrs);
+
     auto *out_tensor = (*outs)[0];
     auto *out_ptr = ctx.Alloc<OutT>(out_tensor);
@@ -815,26 +823,65 @@ struct LaunchBroadcastKernelWithInt64IndexHelper
-void BroadcastKernelForDifferentVecSize(
-    const KPDevice &ctx,
-    const std::vector<const DenseTensor *> &ins,
-    std::vector<DenseTensor *> *outs,
-    int axis,
-    Functor func) {
+template <typename OutT, typename Functor, int Arity, int NumOuts>
+typename std::enable_if<!NeedVectorized<OutT>::value, void>::type
+BroadcastKernelForDifferentVecSize(const KPDevice &ctx,
+                                   const std::vector<const DenseTensor *> &ins,
+                                   std::vector<DenseTensor *> *outs,
+                                   int axis,
+                                   Functor func) {
 #ifndef PADDLE_WITH_XPU_KP
-  constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity <= 3);
+  constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3);
   bool use_int64_index_kernel =
       kEnabledInt64IndexKernel &&
       (*outs)[0]->numel() >= std::numeric_limits<int32_t>::max();
   if (use_int64_index_kernel) {
-    auto loader_classifier =
-        LoaderTypeClassifier(ins, outs);
-    switch (loader_classifier.vec_size) {
+    LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx,
+                                                   ins,
+                                                   outs,
+                                                   axis,
+                                                   func);
+    return;
+  }
+#endif
+
+  auto classifier =
+      BroadcastTypeClassifier(ins, outs, axis);
+  LaunchBroadcastKernel(
+      ctx, classifier, func);
+}
+
+template <typename OutT, typename Functor, int Arity, int NumOuts>
+typename std::enable_if<NeedVectorized<OutT>::value, void>::type
+BroadcastKernelForDifferentVecSize(const KPDevice &ctx,
+                                   const std::vector<const DenseTensor *> &ins,
+                                   std::vector<DenseTensor *> *outs,
+                                   int axis,
+                                   Functor func) {
+  // The classifier must be constructed before the branch below: the XPU
+  // branch reads classifier.configs when deciding vec_size.
+  auto classifier =
+      BroadcastTypeClassifier(ins, outs, axis);
+
+#ifdef PADDLE_WITH_XPU_KP
+  auto type = kps::details::OptType::CanNotOptimize;
+  bool is_optimize = classifier.configs[0].cmp_type != type;
+  int vec_size = is_optimize ? VecSizeL : VecSizeM;
+#else
+  // Calculate the max vec_size for all ins and outs.
+  int vec_size = GetVectorizedSizeForTensors(ins, *outs);
+#endif
+
+#ifndef PADDLE_WITH_XPU_KP
+  constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && Arity <= 3);
+  bool use_int64_index_kernel =
+      kEnabledInt64IndexKernel &&
+      (*outs)[0]->numel() >= std::numeric_limits<int32_t>::max();
+  if (use_int64_index_kernel) {
+    switch (vec_size) {
       case VecSizeL: {
         LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx,
                                                        ins,
@@ -846,7 +893,7 @@
       case VecSizeM: {
         LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx,
                                                        ins,
@@ -858,7 +905,7 @@
       case VecSizeS: {
         LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx,
                                                        ins,
@@ -869,7 +916,7 @@
       }
       default: {
         PADDLE_THROW(phi::errors::Unimplemented(
-            "Unsupported vectorized size: %d!", loader_classifier.vec_size));
+            "Unsupported vectorized size: %d!", vec_size));
         break;
       }
     }
@@ -877,74 +924,27 @@
   }
 #endif
 
-  phi::Array configs;
-#ifdef PADDLE_WITH_XPU_KP
-  PADDLE_ENFORCE_EQ(
-      ins.size(),
-      2,
-      phi::errors::InvalidArgument(
-          "XPU only support inputs is 2, but received %d", ins.size()));
-
-  auto loader_classifier = LoaderTypeClassifier();
-  const auto dims_simplifier =
-      BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis);
-  if (VLOG_IS_ON(6)) {
-    DimsSimplifiedLogger::Log(
-        ins, outs, dims_simplifier, "XPU Broadcast");
-  }
-  configs[0] = kps::details::BroadcastConfig(dims_simplifier.out_dims,
-                                             dims_simplifier.in_dims[0],
-                                             dims_simplifier.in_dims[1],
-                                             dims_simplifier.rank);
-  configs[1] = kps::details::BroadcastConfig(dims_simplifier.out_dims,
-                                             dims_simplifier.in_dims[1],
-                                             dims_simplifier.in_dims[0],
-                                             dims_simplifier.rank);
-  auto type = kps::details::OptType::CanNotOptimize;
-  bool is_optimize = configs[0].cmp_type != type;
-  int vec_size = is_optimize ? VecSizeL : VecSizeM;
-#else
-  auto loader_classifier =
-      LoaderTypeClassifier(ins, outs);
-  if (!loader_classifier.all_elementwise) {
-    const auto dims_simplifier =
-        BroadcastDimsSimplifier(ins, (*outs)[0]->dims(), axis);
-
-    if (VLOG_IS_ON(6)) {
-      DimsSimplifiedLogger::Log(
-          ins, outs, dims_simplifier, "GPU Broadcast");
-    }
-    for (int i = 0; i < kArity; ++i) {
-      // if data shape is[m, n], then you should set data_dim = {n, m}
-      // eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
-      // if (ins[i]->numel() != (*outs)[0]->numel()) {
-      if (ins[i]->numel()) {
-        configs[i] = kps::details::BroadcastConfig(dims_simplifier.out_dims,
-                                                   dims_simplifier.in_dims[i],
-                                                   dims_simplifier.rank);
-      }
-    }
-  }
-#endif
-  switch (loader_classifier.vec_size) {
+  switch (vec_size) {
     case VecSizeL: {
-      LaunchBroadcastKernel(
-          ctx, ins, outs, func, configs, loader_classifier);
+      LaunchBroadcastKernel(
+          ctx, classifier, func);
       break;
     }
     case VecSizeM: {
-      LaunchBroadcastKernel(
-          ctx, ins, outs, func, configs, loader_classifier);
+      LaunchBroadcastKernel(
+          ctx, classifier, func);
       break;
     }
     case VecSizeS: {
-      LaunchBroadcastKernel(
-          ctx, ins, outs, func, configs, loader_classifier);
+      LaunchBroadcastKernel(
+          ctx, classifier, func);
       break;
     }
     default: {
       PADDLE_THROW(phi::errors::Unimplemented(
-          "Unsupported vectorized size: %d!", loader_classifier.vec_size));
+          "Unsupported vectorized size: %d!", vec_size));
       break;
     }
   }
@@ -960,6 +960,15 @@ void BroadcastKernel(const KPDevice &ctx,
   // maximum rank of all inputs.
using Traits = phi::funcs::FunctionTraits; const int kArity = Traits::arity; + +#ifdef PADDLE_WITH_XPU_KP + PADDLE_ENFORCE_EQ( + ins.size(), + 2, + phi::errors::InvalidArgument( + "XPU only support inputs is 2, but received %d", ins.size())); +#endif + PADDLE_ENFORCE_EQ( ins.size(), kArity, @@ -980,6 +989,19 @@ void BroadcastKernel(const KPDevice &ctx, outs->size(), NumOuts)); + for (auto i = 0; i < outs->size(); ++i) { + if (i > 0) { + PADDLE_ENFORCE_EQ( + (*outs)[i]->dims(), + (*outs)[0]->dims(), + phi::errors::InvalidArgument( + "The shape of each output tensor shall be identical yet, but " + "%d-th output tensor`s shape is not.", + i)); + } + ctx.template Alloc((*outs)[i]); + } + int max_rank = 0; int min_rank = phi::DDim::kMaxRank; for (auto *in : ins) { diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 274ac1cc32c05..8ddb3f406ddfe 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -553,43 +553,28 @@ struct Loader { template struct InputSetter { - template - static HOSTDEVICE void Apply( - const std::vector &ins_tensor, Array *ins_data) { - (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); - } -}; - -template -struct VecSizeGetter { - template - static HOSTDEVICE void Apply(const std::vector &ins, - const ArgsT &args, - int *vec_size) { + template + static void Apply(const std::vector &ins_tensor, + const ArgsT &args, + Array *ins_data) { using Type = std::tuple_element_t; - *vec_size = std::min(*vec_size, - phi::GetVectorizedSize(ins[Index]->data())); + (*ins_data)[Index] = (const _ptr_ char *)(ins_tensor[Index]->data()); } }; -template -int GetVectorizedSizeForTensors(const std::vector &ins, - const std::vector &outs) { +static int GetVectorizedSizeForTensors( + const std::vector &ins, + const std::vector &outs) { #ifdef PADDLE_WITH_XPU_KP int vec_size = 256; #else - using Traits = phi::funcs::FunctionTraits; - using ArgsT = typename Traits::ArgsTuple; - const int Arity = Traits::arity; int vec_size = 4; - uint64_t addr = static_cast(0); - ArgsT arg; - UnrollerWithoutVecSize::step(ins, arg, &vec_size); - for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - addr = (addr | reinterpret_cast((*iter)->data())); + for (size_t i = 0; i < ins.size(); ++i) { + vec_size = std::min(vec_size, phi::GetVectorizedSize(ins[i])); + } + for (size_t i = 0; i < outs.size(); ++i) { + vec_size = std::min(vec_size, phi::GetVectorizedSize(outs[i])); } - vec_size = std::min( - vec_size, phi::GetVectorizedSize(reinterpret_cast(addr))); #endif return vec_size; } @@ -738,10 +723,10 @@ __global__ void VectorizedElementwiseKernel( } template -void LaunchElementwiseCudaKernel(const KPDevice &ctx, - const std::vector &ins, - std::vector *outs, - Functor func) { +void LaunchElementwiseKernel(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { // There are at least 1 output, but maybe 0 input (ins.size() == 0). // For large tensor numel * sizeof(T) > 2^31, we must use int64_t as index // type. 
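// (Sketch) the int64 index path matters once numel exceeds INT32_MAX: e.g. a
// tensor of shape [2, 1 << 30] has numel == 2^31, so 32-bit offsets would
// overflow while computing per-thread indices and int64_t must be used.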
@@ -749,10 +734,14 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, phi::Array ins_data; phi::Array<_ptr_ OutT *, NumOuts> outs_data; - UnrollerWithoutVecSize::step(ins, &ins_data); - for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); + using Traits = phi::funcs::FunctionTraits; + using ArgsT = typename Traits::ArgsTuple; + ArgsT arg; + UnrollerWithoutVecSize::step(ins, arg, &ins_data); + for (int i = 0; i < outs->size(); ++i) { + outs_data[i] = (*outs)[i]->data(); } + #ifdef PADDLE_WITH_XPU_KP int block_size = 64; int grid_size = 8; @@ -775,6 +764,47 @@ void LaunchElementwiseCudaKernel(const KPDevice &ctx, #endif } +template +typename std::enable_if::value, void>::type +ElementwiseKernelForDifferentVecSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + LaunchElementwiseKernel( + ctx, ins, outs, func); +} + +template +typename std::enable_if::value, void>::type +ElementwiseKernelForDifferentVecSize( + const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + Functor func) { + // calculate the max vec_size for all ins and outs + int vec_size = GetVectorizedSizeForTensors(ins, *outs); + switch (vec_size) { + case VecSizeL: + LaunchElementwiseKernel( + ctx, ins, outs, func); + break; + case VecSizeM: + LaunchElementwiseKernel( + ctx, ins, outs, func); + break; + case VecSizeS: + LaunchElementwiseKernel( + ctx, ins, outs, func); + break; + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + template void ElementwiseKernel(const KPDevice &ctx, const std::vector &ins, @@ -798,8 +828,8 @@ void ElementwiseKernel(const KPDevice &ctx, outs->size(), NumOuts)); - if (NumOuts > 1) { - for (int i = 1; i < NumOuts; ++i) { + for (int i = 0; i < outs->size(); ++i) { + if (i > 0) { PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), (*outs)[0]->dims(), @@ -808,29 +838,11 @@ void ElementwiseKernel(const KPDevice &ctx, "but %dth output tensor`s shape is not.", i)); } + ctx.template Alloc((*outs)[i]); } - // calculate the max vec_size for all ins and outs - int vec_size = GetVectorizedSizeForTensors(ins, *outs); - switch (vec_size) { - case VecSizeL: - LaunchElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case VecSizeM: - LaunchElementwiseCudaKernel( - ctx, ins, outs, func); - break; - case VecSizeS: - LaunchElementwiseCudaKernel( - ctx, ins, outs, func); - break; - default: { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported vectorized size: %d !", vec_size)); - break; - } - } + ElementwiseKernelForDifferentVecSize( + ctx, ins, outs, func); } #endif diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index b7994d9cefa51..3f221d98b0a25 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -40,22 +40,11 @@ struct AddFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { return a + b; } }; template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return b + a; } -}; - -// Float32Bfloat16Add -template -struct Float32Bfloat16AddFunctor { - inline HOSTDEVICE T operator()(const T x, const phi::bfloat16 y) { - return x + static_cast(y); - } -}; +using InverseAddFunctor = AddFunctor; -// Float32Float16Add -template -struct Float32Float16AddFunctor { - inline HOSTDEVICE T operator()(const T x, const phi::float16 y) { +template +struct MultiPrecisionAddFunctor { + 
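+  // Folds the former Float32Bfloat16AddFunctor / Float32Float16AddFunctor
+  // into one template: y is widened to T (e.g. float) before the add, so
+  // the accumulation happens in the higher-precision type.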
inline HOSTDEVICE T operator()(const T x, const Ty y) const { return x + static_cast(y); } }; @@ -82,15 +71,7 @@ struct MultiplyFunctor { } }; template -struct InverseMultiplyFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return b * a; } -}; -template <> -struct InverseMultiplyFunctor { - inline HOSTDEVICE bool operator()(const bool a, const bool b) const { - return b && a; - } -}; +using InverseMultiplyFunctor = MultiplyFunctor; template struct IsZeroFunctor { diff --git a/paddle/phi/kernels/funcs/im2col.cc b/paddle/phi/kernels/funcs/im2col.cc index 0b5901367488a..e4c470e1a7064 100644 --- a/paddle/phi/kernels/funcs/im2col.cc +++ b/paddle/phi/kernels/funcs/im2col.cc @@ -160,12 +160,24 @@ template class Im2ColFunctor; +template class Im2ColFunctor>; +template class Im2ColFunctor>; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor>; +template class Col2ImFunctor>; /* * im = [input_channels, input_height, input_width] @@ -331,11 +343,23 @@ template class Im2ColFunctor; +template class Im2ColFunctor>; +template class Im2ColFunctor>; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor>; +template class Col2ImFunctor>; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu index 87c82adbb7fbe..b633241810f9b 100644 --- a/paddle/phi/kernels/funcs/im2col.cu +++ b/paddle/phi/kernels/funcs/im2col.cu @@ -310,6 +310,12 @@ template class Im2ColFunctor; +template class Im2ColFunctor>; +template class Im2ColFunctor>; template class Im2ColFunctor; @@ -322,6 +328,12 @@ template class Col2ImFunctor; +template class Col2ImFunctor>; +template class Col2ImFunctor>; template class Col2ImFunctor; @@ -573,6 +585,12 @@ template class Im2ColFunctor; +template class Im2ColFunctor>; +template class Im2ColFunctor>; template class Im2ColFunctor; @@ -585,6 +603,12 @@ template class Col2ImFunctor; +template class Col2ImFunctor>; +template class Col2ImFunctor>; template class Col2ImFunctor; diff --git a/paddle/phi/kernels/funcs/weight_only_gemv.cu b/paddle/phi/kernels/funcs/weight_only_gemv.cu index a1c746bd49ce1..76716ecf30dc5 100644 --- a/paddle/phi/kernels/funcs/weight_only_gemv.cu +++ b/paddle/phi/kernels/funcs/weight_only_gemv.cu @@ -189,7 +189,12 @@ struct ConvertDstFunc<__nv_bfloat16> { template struct HalfMul { static __device__ __forceinline__ T apply(const T& x, const T& y) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hmul(x, y); +#else + float res = static_cast(float16(x)) * static_cast(float16(y)); + return float16(res).to_half(); +#endif } }; diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu new file mode 100644 index 0000000000000..ff98cb01c9866 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.cu @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic pop + +namespace phi { + +template +void dispatch_gemm_config(const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + switch (gemm_config.stages) { + case 2: + using DispatcherStages2 = dispatch_stages; + DispatcherStages2::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case 3: + using DispatcherStages3 = dispatch_stages; + DispatcherStages3::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case 4: + using DispatcherStages4 = dispatch_stages; + DispatcherStages4::dispatch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + default: + std::string err_msg = "dispatch_gemm_config does not support stages " + + std::to_string(gemm_config.stages); + throw std::runtime_error("[dispatch_gemm_config] " + err_msg); + break; + } +} + +template +void dispatch_gemm_to_cutlass(const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace, + size_t workspace_bytes, + CutlassGemmConfig gemm_config, + cudaStream_t stream, + int* occupancy) { + // VLOG(3)<<__PRETTY_FUNCTION__; + // Note that SIMT configs are omitted here since they are not supported for + // fpA_intB. We also only instantiate configs here where threadblockShapeM == + // warpShapeM since those usually perform the best for mixed type gemms. 
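A note on the dispatch pattern used in the switch that follows: the runtime tile_config enum is mapped onto compile-time cutlass::gemm::GemmShape<> template arguments, so each case instantiates a distinct kernel. A minimal, self-contained sketch of the same pattern, with illustrative names rather than the real CUTLASS types:

#include <stdexcept>

enum class TileConfig { Cta32x128x64, Cta64x128x64 };

template <int M, int N, int K>
void run_with_tile() { /* kernel body compiled for one fixed CTA tile */ }

void dispatch(TileConfig cfg) {
  switch (cfg) {
    case TileConfig::Cta32x128x64: run_with_tile<32, 128, 64>(); break;
    case TileConfig::Cta64x128x64: run_with_tile<64, 128, 64>(); break;
    default: throw std::runtime_error("unsupported tile config");
  }
}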
+ switch (gemm_config.tile_config) { + case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<32, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<128, 32, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + // config for M_16000_N_12288_K_6144 in encoder + case CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64: + dispatch_gemm_config, + cutlass::gemm::GemmShape<64, 64, 64>>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); + break; + case CutlassTileConfig::Undefined: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); + break; + case CutlassTileConfig::ChooseWithHeuristic: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " + "already been set by heuristic."); + break; + default: + throw std::runtime_error( + "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " + "type GEMM."); + break; + } +} + +template +CutlassFpAIntBGemmRunner::CutlassFpAIntBGemmRunner() { + // VLOG(3)<<__PRETTY_FUNCTION__; + int device{-1}; + check_cuda_error(cudaGetDevice(&device)); + sm_ = getSMVersion(); + check_cuda_error(cudaDeviceGetAttribute( + &multi_processor_count_, cudaDevAttrMultiProcessorCount, device)); +} + +template +CutlassFpAIntBGemmRunner::~CutlassFpAIntBGemmRunner() { + // VLOG(3)<<__PRETTY_FUNCTION__; +} + +template +template +void CutlassFpAIntBGemmRunner::dispatch_to_arch( + const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + // VLOG(3)<<__PRETTY_FUNCTION__; + if (sm_ >= 70 && sm_ < 75) { +#if defined(USE_FPAINTB_GEMM_WITH_SM70) + dispatch_gemm_to_cutlass( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else if (sm_ >= 75 && sm_ < 80) { +#if defined(USE_FPAINTB_GEMM_WITH_SM75) + dispatch_gemm_to_cutlass( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else if (sm_ >= 80 && sm_ < 90) { +#if defined(USE_FPAINTB_GEMM_WITH_SM80) + dispatch_gemm_to_cutlass( + A, + B, + weight_scales, + biases, + 
C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + gemm_config, + stream, + occupancy); +#else + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); +#endif + } else { + throw std::runtime_error( + "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " + "CUTLASS mixed type GEMM"); + } +} + +template +template +void CutlassFpAIntBGemmRunner::run_gemm( + const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + // VLOG(3)<<__PRETTY_FUNCTION__; + static constexpr bool is_weight_only = !std::is_same::value; + const bool is_weight_only_encoder = m >= 512 ? true : false; + std::vector candidate_configs = + get_candidate_configs(sm_, is_weight_only, is_weight_only_encoder, false); + std::vector occupancies(candidate_configs.size()); + + for (size_t ii = 0; ii < candidate_configs.size(); ++ii) { + dispatch_to_arch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + candidate_configs[ii], + workspace_ptr, + workspace_bytes, + stream, + &occupancies[ii]); + } + // Standard GEMM, so 1 "expert". We use the same function for MoE and regular + // FFN. + static constexpr int num_experts = 1; + CutlassGemmConfig chosen_config = + estimate_best_config_from_occupancies(candidate_configs, + occupancies, + m, + n, + k, + num_experts, + split_k_limit, + workspace_bytes, + multi_processor_count_, + is_weight_only); + + dispatch_to_arch(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + chosen_config, + workspace_ptr, + workspace_bytes, + stream); +} + +template +void CutlassFpAIntBGemmRunner::gemm_bias_act( + const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + // VLOG(3)<<__PRETTY_FUNCTION__; + if (activation_type == "gelu") { + run_gemm(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); + } else if (activation_type == "relu") { + run_gemm(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); + } else if (activation_type == "none") { + run_gemm(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); + } else { + throw std::runtime_error(("Invalid activation type.")); + } +} + +template +void CutlassFpAIntBGemmRunner::gemm(const T* A, + const WeightType* B, + const float* weight_scales, + T* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + // VLOG(3)<<__PRETTY_FUNCTION__; + run_gemm(A, + B, + weight_scales, + nullptr, + C, + m, + n, + k, + workspace_ptr, + workspace_bytes, + stream); +} + +template +int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, + const int n, + const int k) { + // VLOG(3)<<__PRETTY_FUNCTION__; // These are the min tile sizes for each + // config, which would launch the maximum number of blocks + const int max_grid_m = (m + 31) / 32; + const int max_grid_n = (n + 127) / 128; + // We need 4 bytes per block in the worst case. We launch split_k_limit in z + // dim. 
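  // Worked example: for the encoder shape mentioned above (m = 16000,
  // n = 12288), max_grid_m = (16000 + 31) / 32 = 500 and
  // max_grid_n = (12288 + 127) / 128 = 96, so the workspace is
  // 500 * 96 * split_k_limit * 4 bytes. The divisors 32 and 128 correspond
  // to the smallest CTA tile instantiated (32x128).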
+ return max_grid_m * max_grid_n * split_k_limit * 4; +} + +// =============================== Specialization T == WeightType +// ======================================= +template +void CutlassFpAIntBGemmRunner::gemm_bias_act( + const float* A, + const WeightType* B, + const float* weight_scales, + const float* biases, + float* C, + int m, + int n, + int k, + std::string activation_type, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + throw std::runtime_error( + ("Attempting to run mixed gemm bias act when the types are the same is " + "an error.")); +} + +template +void CutlassFpAIntBGemmRunner::gemm( + const float* A, + const WeightType* B, + const float* weight_scales, + float* C, + int m, + int n, + int k, + char* workspace_ptr, + const size_t workspace_bytes, + cudaStream_t stream) { + throw std::runtime_error(( + "Attempting to run mixed gemm when the types are the same is an error.")); +} + +template +int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, + const int n, + const int k) { + return 0; +} + +template class CutlassFpAIntBGemmRunner; +template class CutlassFpAIntBGemmRunner; +#ifdef PADDLE_CUDA_BF16 +template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t>; +#endif +template class CutlassFpAIntBGemmRunner; +template class CutlassFpAIntBGemmRunner; +#ifdef PADDLE_CUDA_BF16 +template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t>; +#endif +} // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index e0ce642cef243..f5862a8c58959 100644 --- a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -45,6 +45,7 @@ limitations under the License. 
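The header change below follows a declare-in-header, define-in-generated-source pattern: generic_mixed_gemm_kernelLauncher_template is only declared here, and the Python generator added later in this diff emits one explicit specialization per (dtype, arch, tile, stages, epilogue) combination into autogen/*.cu, so no single translation unit has to instantiate every CUTLASS kernel. A minimal sketch of the pattern, with illustrative names:

// header: declaration only
template <typename T, int Stages>
void launch_gemm(const T* a, const T* b, T* c, int m, int n, int k);

// one generated .cu per configuration: explicit specialization
template <>
void launch_gemm<float, 2>(const float* a, const float* b, float* c,
                           int m, int n, int k) {
  // the real generated code instantiates the CUTLASS kernel for this config
}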
*/ #pragma GCC diagnostic pop #include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/cutlass_heuristic.h" +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/autogen/arch_define.h" #include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include "paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h" namespace phi { @@ -221,6 +222,27 @@ void generic_mixed_gemm_kernelLauncher(const T* A, } } +template +void generic_mixed_gemm_kernelLauncher_template(const T* A, + const WeightType* B, + const float* weight_scales, + const T* biases, + T* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy); + template (A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); + generic_mixed_gemm_kernelLauncher_template(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); } }; +#if defined(USE_FPAINTB_GEMM_WITH_SM80) template (A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); + generic_mixed_gemm_kernelLauncher_template(A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); } }; +#endif template ; - DispatcherStages2::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case 3: - using DispatcherStages3 = dispatch_stages; - DispatcherStages3::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case 4: - using DispatcherStages4 = dispatch_stages; - DispatcherStages4::dispatch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - default: - std::string err_msg = "dispatch_gemm_config does not support stages " + - std::to_string(gemm_config.stages); - throw std::runtime_error("[dispatch_gemm_config] " + err_msg); - break; - } -} + int* occupancy); template void dispatch_gemm_to_cutlass(const T* A, @@ -456,430 +406,6 @@ void dispatch_gemm_to_cutlass(const T* A, size_t workspace_bytes, CutlassGemmConfig gemm_config, cudaStream_t stream, - int* occupancy) { - // VLOG(3)<<__PRETTY_FUNCTION__; - // Note that SIMT configs are omitted here since they are not supported for - // fpA_intB. We also only instantiate configs here where threadblockShapeM == - // warpShapeM since those usually perform the best for mixed type gemms. 
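Worth noting in the hunk above: the Sm80 dispatch_stages specialization is now wrapped in #if defined(USE_FPAINTB_GEMM_WITH_SM80), a macro emitted into autogen/arch_define.h by the generator for each requested SM version, and the runtime dispatch throws for paths that were compiled out. A runnable sketch of the guard pattern (the macro is defined inline here purely for illustration):

#include <stdexcept>

// In the real build this comes from the generated autogen/arch_define.h.
#define USE_FPAINTB_GEMM_WITH_SM80

void run_sm80_path() { /* CUTLASS SM80 kernels */ }

void dispatch_sm(int sm) {
  if (sm >= 80 && sm < 90) {
#if defined(USE_FPAINTB_GEMM_WITH_SM80)
    run_sm80_path();
#else
    throw std::runtime_error("SM80 path was compiled out of this build");
#endif
  } else {
    throw std::runtime_error("arch unsupported for mixed-type GEMM");
  }
}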
- switch (gemm_config.tile_config) { - case CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<32, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::CtaShape128x128x64_WarpShape128x32x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<128, 32, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - // config for M_16000_N_12288_K_6144 in encoder - case CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64: - dispatch_gemm_config, - cutlass::gemm::GemmShape<64, 64, 64>>( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - gemm_config, - workspace, - workspace_bytes, - stream, - occupancy); - break; - case CutlassTileConfig::Undefined: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config undefined."); - break; - case CutlassTileConfig::ChooseWithHeuristic: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] gemm config should have " - "already been set by heuristic."); - break; - default: - throw std::runtime_error( - "[fpA_intB][dispatch_gemm_to_cutlass] Config is invalid for mixed " - "type GEMM."); - break; - } -} - -template -CutlassFpAIntBGemmRunner::CutlassFpAIntBGemmRunner() { - // VLOG(3)<<__PRETTY_FUNCTION__; - int device{-1}; - check_cuda_error(cudaGetDevice(&device)); - sm_ = getSMVersion(); - check_cuda_error(cudaDeviceGetAttribute( - &multi_processor_count_, cudaDevAttrMultiProcessorCount, device)); -} - -template -CutlassFpAIntBGemmRunner::~CutlassFpAIntBGemmRunner() { - // VLOG(3)<<__PRETTY_FUNCTION__; -} - -template -template -void CutlassFpAIntBGemmRunner::dispatch_to_arch( - const T* A, - const WeightType* B, - const float* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - CutlassGemmConfig gemm_config, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream, - int* occupancy) { - // VLOG(3)<<__PRETTY_FUNCTION__; - if (sm_ >= 70 && sm_ < 75) { - dispatch_gemm_to_cutlass( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); - } else if (sm_ >= 75 && sm_ < 80) { - dispatch_gemm_to_cutlass( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); - } else if (sm_ >= 80 && sm_ < 90) { - dispatch_gemm_to_cutlass( - A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - gemm_config, - stream, - occupancy); - } else { - throw std::runtime_error( - "[CutlassFpAIntBGemmRunner][GEMM Dispatch] Arch unsupported for " - "CUTLASS mixed type GEMM"); - } -} + int* occupancy); -template -template -void CutlassFpAIntBGemmRunner::run_gemm( - const T* A, - const WeightType* B, - const float* weight_scales, - const T* biases, 
- T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - // VLOG(3)<<__PRETTY_FUNCTION__; - static constexpr bool is_weight_only = !std::is_same::value; - const bool is_weight_only_encoder = m >= 512 ? true : false; - std::vector candidate_configs = - get_candidate_configs(sm_, is_weight_only, is_weight_only_encoder, false); - std::vector occupancies(candidate_configs.size()); - - for (size_t ii = 0; ii < candidate_configs.size(); ++ii) { - dispatch_to_arch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - candidate_configs[ii], - workspace_ptr, - workspace_bytes, - stream, - &occupancies[ii]); - } - // Standard GEMM, so 1 "expert". We use the same function for MoE and regular - // FFN. - static constexpr int num_experts = 1; - CutlassGemmConfig chosen_config = - estimate_best_config_from_occupancies(candidate_configs, - occupancies, - m, - n, - k, - num_experts, - split_k_limit, - workspace_bytes, - multi_processor_count_, - is_weight_only); - - dispatch_to_arch(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - chosen_config, - workspace_ptr, - workspace_bytes, - stream); -} - -template -void CutlassFpAIntBGemmRunner::gemm_bias_act( - const T* A, - const WeightType* B, - const float* weight_scales, - const T* biases, - T* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - // VLOG(3)<<__PRETTY_FUNCTION__; - if (activation_type == "gelu") { - run_gemm(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); - } else if (activation_type == "relu") { - run_gemm(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); - } else if (activation_type == "none") { - run_gemm(A, - B, - weight_scales, - biases, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); - } else { - throw std::runtime_error(("Invalid activation type.")); - } -} - -template -void CutlassFpAIntBGemmRunner::gemm(const T* A, - const WeightType* B, - const float* weight_scales, - T* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - // VLOG(3)<<__PRETTY_FUNCTION__; - run_gemm(A, - B, - weight_scales, - nullptr, - C, - m, - n, - k, - workspace_ptr, - workspace_bytes, - stream); -} - -template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, - const int n, - const int k) { - // VLOG(3)<<__PRETTY_FUNCTION__; // These are the min tile sizes for each - // config, which would launch the maximum number of blocks - const int max_grid_m = (m + 31) / 32; - const int max_grid_n = (n + 127) / 128; - // We need 4 bytes per block in the worst case. We launch split_k_limit in z - // dim. 
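The run_gemm body removed here (it now lives in the new .cu) auto-tunes by launching every candidate config once in occupancy-query mode, then letting estimate_best_config_from_occupancies pick a winner. A deliberately simplified sketch of that selection step; the real heuristic also weighs m/n/k, split-k and the SM count:

#include <cstddef>
#include <vector>

struct Config { int tile_config; int stages; };

// Simplified: pick the candidate with the highest measured occupancy.
Config pick_best(const std::vector<Config>& candidates,
                 const std::vector<int>& occupancies) {
  std::size_t best = 0;
  for (std::size_t i = 1; i < occupancies.size(); ++i) {
    if (occupancies[i] > occupancies[best]) best = i;
  }
  return candidates[best];
}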
- return max_grid_m * max_grid_n * split_k_limit * 4; -} - -// =============================== Specialization T == WeightType -// ======================================= -template -void CutlassFpAIntBGemmRunner::gemm_bias_act( - const float* A, - const WeightType* B, - const float* weight_scales, - const float* biases, - float* C, - int m, - int n, - int k, - std::string activation_type, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - throw std::runtime_error( - ("Attempting to run mixed gemm bias act when the types are the same is " - "an error.")); -} - -template -void CutlassFpAIntBGemmRunner::gemm( - const float* A, - const WeightType* B, - const float* weight_scales, - float* C, - int m, - int n, - int k, - char* workspace_ptr, - const size_t workspace_bytes, - cudaStream_t stream) { - throw std::runtime_error(( - "Attempting to run mixed gemm when the types are the same is an error.")); -} - -template -int CutlassFpAIntBGemmRunner::getWorkspaceSize(const int m, - const int n, - const int k) { - return 0; -} - -template class CutlassFpAIntBGemmRunner; -template class CutlassFpAIntBGemmRunner; -#ifdef PADDLE_CUDA_BF16 -template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t>; -#endif -template class CutlassFpAIntBGemmRunner; -template class CutlassFpAIntBGemmRunner; -#ifdef PADDLE_CUDA_BF16 -template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t>; -#endif } // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py new file mode 100644 index 0000000000000..4295057679d57 --- /dev/null +++ b/paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/generic_mixed_gemm_kernelLauncher.py @@ -0,0 +1,214 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import re + +# this is a file's header part +CommonHead = ''' +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. 
+ +#include "paddle/phi/kernels/fusion/cutlass/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" + +namespace phi { +''' + +CommonTail = ''' +} // namespace phi + +''' +DispatchGemmConfigInstanceDeclare = """ +template<> +void generic_mixed_gemm_kernelLauncher_template<{T}, + {WeightType}, + {arch}, + {EpilogueTag}, + {ThreadblockShape}, + {WarpShape}, + {Stages}>( + const {T}* A, + const {WeightType}* B, + const float* weight_scales, + const {T}* biases, + {T}* C, + int m, + int n, + int k, + CutlassGemmConfig gemm_config, + char* workspace, + size_t workspace_bytes, + cudaStream_t stream, + int* occupancy) { + generic_mixed_gemm_kernelLauncher<{T}, + {WeightType}, + {arch}, + {EpilogueTag}, + {ThreadblockShape}, + {WarpShape}, + {Stages}>( + A, + B, + weight_scales, + biases, + C, + m, + n, + k, + gemm_config, + workspace, + workspace_bytes, + stream, + occupancy); +} +""" + +DefineHeader = """ +// Generated by generic_mixed_gemm_kernelLauncher.py - Do not edit. + +""" + +DefaultArch = [70, 75, 80] +epilogue_tags = ["bias", "biasFtGelu", "biasReLU", "noBias"] + +WeightTypes = ["uint8_t", "cutlass::uint4b_t"] +ThreadblockShapes = [ + "cutlass::gemm::GemmShape<32, 128, 64>", + "cutlass::gemm::GemmShape<64, 128, 64>", + "cutlass::gemm::GemmShape<128, 128, 64>", + "cutlass::gemm::GemmShape<256, 128, 64>", + "cutlass::gemm::GemmShape<128, 256, 64>", +] +WarpShapes = [ + "cutlass::gemm::GemmShape<32, 32, 64>", + "cutlass::gemm::GemmShape<64, 32, 64>", + "cutlass::gemm::GemmShape<128, 32, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", + "cutlass::gemm::GemmShape<64, 64, 64>", +] +StagesList = {70: [2], 75: [2], 80: [2, 3, 4]} + +ElementTypes = {"fp16": "half", "bf16": "__nv_bfloat16"} +Archs = { + 70: "cutlass::arch::Sm70", + 75: "cutlass::arch::Sm75", + 80: "cutlass::arch::Sm80", +} +EpilogueTags = { + "bias": "EpilogueOpBias", + "biasFtGelu": "EpilogueOpBiasFtGelu", + "biasReLU": "EpilogueOpBiasReLU", + "noBias": "EpilogueOpNoBias", +} + + +def SubstituteTemplate(template, values): + text = template + changed = True + while changed: + changed = False + for key, value in values.items(): + regex = "\\{%s\\}" % key + newtext = re.sub(regex, value, text) + if newtext != text: + changed = True + text = newtext + return text + + +def find_arch_range(archs): + compile_archs = [] + for arch in archs: + if arch >= 70 and arch < 75: + compile_archs.append(70) + elif arch >= 75 and arch < 80: + compile_archs.append(75) + elif arch >= 80 and arch < 90: + compile_archs.append(80) + compile_archs = list(set(compile_archs)) + compile_archs.sort() + return compile_archs + + +def convert_to_arch_list(archs): + archs = archs.lower().strip() + if archs == "all": + return DefaultArch + + archs = [int(s.strip()) for s in archs.split(';') if s.strip()] + archs = list(set(archs)) + return find_arch_range(archs) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." 
+ ) + parser.add_argument( + "--cuda_arch", + type=convert_to_arch_list, + default=convert_to_arch_list("All"), + help="The CUDA architecture to be generated.", + ) + args = parser.parse_args() + return args + + +# generate source cu +def generate_source_cu( + element_type: str, arch: int, epilogue_tag: str, stages: int +): + all_code = CommonHead + for WeightType in WeightTypes: + for i in range(len(ThreadblockShapes)): + value_dict = { + "T": ElementTypes[element_type], + "WeightType": WeightType, + "arch": Archs[arch], + "EpilogueTag": EpilogueTags[epilogue_tag], + "ThreadblockShape": ThreadblockShapes[i], + "WarpShape": WarpShapes[i], + "Stages": str(stages), + } + all_code += SubstituteTemplate( + DispatchGemmConfigInstanceDeclare, value_dict + ) + all_code += CommonTail + return all_code + + +if __name__ == "__main__": + args = parse_args() + archs = args.cuda_arch + header_all = DefineHeader + header_name = "autogen/arch_define.h" + if archs: + for arch in archs: + define_line = "#define USE_FPAINTB_GEMM_WITH_SM%s\n" % str(arch) + header_all += define_line + with open(header_name, "w") as f: + f.write(header_all) + f.close() + if archs: + for element_type in ElementTypes.keys(): + for arch in archs: + for epilogue_tag in EpilogueTags.keys(): + for stages in StagesList[arch]: + file_name = "autogen/generic_mixed_gemm_kernelLauncher_{}_sm{}_stages{}_{}.cu".format( + element_type, arch, stages, epilogue_tag + ) + all_code = generate_source_cu( + element_type, arch, epilogue_tag, stages + ) + with open(file_name, "w") as f: + f.write(all_code) + f.close() diff --git a/paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h b/paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h index 69e737fa21157..e2e3652258406 100644 --- a/paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h +++ b/paddle/phi/kernels/fusion/cutlass/utils/cuda_utils.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,9 @@ #include #endif +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + namespace phi { #define MAX_CONFIG_NUM 20 diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index a0695935de1bc..d592dfad0a52d 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -358,14 +358,14 @@ PD_REGISTER_KERNEL(relu_double_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cos_grad, CosGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(tan_grad, TanGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(acos_grad, AcosGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(asin_grad, AsinGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(atan_grad, AtanGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(sinh_grad, SinhGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(cosh_grad, CoshGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(asinh_grad, AsinhGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(acosh_grad, AcoshGradKernel) -PD_REGISTER_ACTIVATION_GRAD_KERNEL(atanh_grad, AtanhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(acos_grad, AcosGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(asin_grad, AsinGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(atan_grad, AtanGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sinh_grad, SinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cosh_grad, CoshGradKernel) 
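These registrations extend the inverse-trigonometric and hyperbolic activations (and exp/expm1 below) to complex dtypes. For reference, complex exp follows Euler's formula; a host-side sketch of the math, independent of the actual phi functors:

#include <cmath>
#include <complex>

// exp(a + bi) = e^a * (cos b + i sin b)
std::complex<double> ComplexExp(std::complex<double> z) {
  const double r = std::exp(z.real());
  return {r * std::cos(z.imag()), r * std::sin(z.imag())};
}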
+PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(asinh_grad, AsinhGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(acosh_grad, AcoshGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(atanh_grad, AtanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(tanh_grad, TanhGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(tanh_double_grad, TanhDoubleGradKernel) @@ -398,7 +398,9 @@ PD_REGISTER_KERNEL(exp_grad, int, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(softshrink_grad, SoftShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) @@ -415,7 +417,9 @@ PD_REGISTER_KERNEL(expm1_grad, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(square_grad, GPU, diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 061a02f531538..000428268bbb1 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -232,14 +232,14 @@ PD_REGISTER_KERNEL(relu, PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sin, SinKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cos, CosKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(tan, TanKernel) -PD_REGISTER_ACTIVATION_KERNEL(acos, AcosKernel) -PD_REGISTER_ACTIVATION_KERNEL(asin, AsinKernel) -PD_REGISTER_ACTIVATION_KERNEL(atan, AtanKernel) -PD_REGISTER_ACTIVATION_KERNEL(sinh, SinhKernel) -PD_REGISTER_ACTIVATION_KERNEL(cosh, CoshKernel) -PD_REGISTER_ACTIVATION_KERNEL(asinh, AsinhKernel) -PD_REGISTER_ACTIVATION_KERNEL(acosh, AcoshKernel) -PD_REGISTER_ACTIVATION_KERNEL(atanh, AtanhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(acos, AcosKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(asin, AsinKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(atan, AtanKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sinh, SinhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cosh, CoshKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(asinh, AsinhKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(acosh, AcoshKernel) +PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(atanh, AtanhKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(tanh, TanhKernel) PD_REGISTER_ACTIVATION_KERNEL(hardtanh, HardTanhKernel) PD_REGISTER_ACTIVATION_KERNEL(thresholded_relu, ThresholdedReluKernel) @@ -261,7 +261,9 @@ PD_REGISTER_KERNEL(exp, int, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(expm1, GPU, ALL_LAYOUT, @@ -271,7 +273,9 @@ PD_REGISTER_KERNEL(expm1, int, int64_t, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(square, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu index 8ce5de402cc22..04f80970a40d3 100644 --- a/paddle/phi/kernels/gpu/bincount_kernel.cu +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -34,13 +34,12 @@ __global__ void KernelBincount(const InputT* input, const bool has_weights, const T* weights, OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - phi::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - 
phi::CudaAtomicAdd(&output[input[i]], static_cast(weights[i])); + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total_elements) { + if (!has_weights) { + phi::CudaAtomicAdd(&output[input[tid]], 1L); + } else { + phi::CudaAtomicAdd(&output[input[tid]], static_cast(weights[tid])); } } } diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 812d68df92d93..f271eba26e0ab 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -21,21 +21,26 @@ namespace phi { -template -__global__ void flip_cuda_kernel(const int64_t N, - const T* in_data, - T* out_data, - phi::Array shape, - phi::Array stride, - phi::Array flip_dims, - int flip_dims_size) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { +template +__global__ void FlipCudaKernel(const T* in_data, + T* out_data, + phi::Array shape, + phi::Array stride, + phi::Array flip_dims, + const int rank, + const int64_t numel, + const int flip_dims_size) { + int64_t idx = + static_cast(blockIdx.x) * static_cast(blockDim.x) + + static_cast(threadIdx.x); + if (idx >= numel) { return; } - int cur_indices = idx, rem = 0, dst_offset = 0; - for (int i = 0; i < Rank; ++i) { + int64_t cur_indices = idx; + int64_t rem = 0; + int64_t dst_offset = 0; + for (int i = 0; i < rank; ++i) { int64_t temp = cur_indices; cur_indices = cur_indices / stride[i]; rem = temp - cur_indices * stride[i]; @@ -51,91 +56,48 @@ __global__ void flip_cuda_kernel(const int64_t N, out_data[idx] = in_data[dst_offset]; } -template -void LaunchFlipCudaKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DenseTensor* out) { +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { auto* in_data = x.data(); auto* out_data = dev_ctx.template Alloc(out); auto x_dims = x.dims(); - const int total_dims = x_dims.size(); + const int rank = x_dims.size(); const int64_t numel = x.numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); - auto x_stride = phi::stride(x_dims); - phi::Array stride_a; - phi::Array shape_a; - phi::Array flip_dims_a; size_t flip_dims_size = axis.size(); + auto x_stride = phi::stride(x_dims); - for (size_t idx = 0; idx < N; ++idx) { - stride_a[idx] = x_stride[idx]; - shape_a[idx] = x_dims[idx]; - flip_dims_a[idx] = idx < flip_dims_size ? axis[idx] : 0; - } + phi::Array stride_array; + phi::Array shape_array; + phi::Array flip_dims_array; - for (size_t i = 0; i < flip_dims_a.size(); ++i) { - if (flip_dims_a[i] < 0) { - flip_dims_a[i] += total_dims; + for (int i = 0; i < rank; ++i) { + stride_array[i] = x_stride[i]; + shape_array[i] = x_dims[i]; + if (i < flip_dims_size) { + flip_dims_array[i] = axis[i] < 0 ? 
axis[i] + rank : axis[i]; + } else { + flip_dims_array[i] = 0; } } - flip_cuda_kernel + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + FlipCudaKernel <<>>( - numel, in_data, out_data, - shape_a, - stride_a, - flip_dims_a, + shape_array, + stride_array, + flip_dims_array, + rank, + numel, flip_dims_size); } -template -void FlipKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& axis, - DenseTensor* out) { - const size_t total_dims = x.dims().size(); - switch (total_dims) { - case 0: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 1: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 2: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 3: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 4: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 5: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 6: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 7: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 8: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - case 9: - LaunchFlipCudaKernel(dev_ctx, x, axis, out); - break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "dims of input tensor should be less than 10, But received" - "%d", - x.dims().size())); - } -} } // namespace phi PD_REGISTER_KERNEL(flip, diff --git a/paddle/phi/kernels/gpu/fold_grad_kernel.cu b/paddle/phi/kernels/gpu/fold_grad_kernel.cu index ad469dd7981de..1e3cceb04dd0d 100644 --- a/paddle/phi/kernels/gpu/fold_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fold_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fold_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - fold_grad, GPU, ALL_LAYOUT, phi::FoldGradKernel, float, double) {} +PD_REGISTER_KERNEL(fold_grad, + GPU, + ALL_LAYOUT, + phi::FoldGradKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/fold_kernel.cu b/paddle/phi/kernels/gpu/fold_kernel.cu index b53ef402150c2..2e21a121a0cc6 100644 --- a/paddle/phi/kernels/gpu/fold_kernel.cu +++ b/paddle/phi/kernels/gpu/fold_kernel.cu @@ -18,4 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fold_kernel_impl.h" -PD_REGISTER_KERNEL(fold, GPU, ALL_LAYOUT, phi::FoldKernel, float, double) {} +PD_REGISTER_KERNEL(fold, + GPU, + ALL_LAYOUT, + phi::FoldKernel, + float, + double, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu index 8f2eba7185293..7e584e5c10318 100644 --- a/paddle/phi/kernels/gpu/index_put_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_grad_kernel.cu @@ -24,20 +24,23 @@ namespace phi { -template -__global__ void set_zero_cuda_kernel(const int64_t N, - int64_t** indices, - phi::Array stride, - phi::Array shape, - T* out) { - int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; - int64_t cur_ix = 0; - - if (idx >= N) { +template +__global__ void SetZeroCudaKernel(int64_t** indices, + phi::Array stride, + phi::Array shape, + const int rank, + const int64_t numel, + T* out) { + int64_t idx = + static_cast(threadIdx.x) + + static_cast(blockDim.x) * static_cast(blockIdx.x); + if (idx >= numel) { return; } + + int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < rank; ++i) { cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; 
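A recurring refactor in this diff (flip, index_put, roll): the Rank template parameter and its per-rank switch are replaced by a fixed-capacity array (sized to phi's maximum rank, 9, matching the old "less than 10" limit) plus a runtime rank loop, so one kernel instantiation serves every rank at the cost of loop unrolling. A self-contained CUDA sketch of the flip-style variant, with illustrative raw-pointer parameters:

__global__ void FlipSketch(const float* in, float* out,
                           const int64_t* shape, const int64_t* stride,
                           const bool* flip, int rank, int64_t numel) {
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx >= numel) return;
  int64_t rem = idx, src = 0;
  for (int i = 0; i < rank; ++i) {  // runtime bound: one instantiation, any rank
    int64_t coord = rem / stride[i];
    rem -= coord * stride[i];
    if (flip[i]) coord = shape[i] - 1 - coord;  // mirror along flipped dims
    src += coord * stride[i];
  }
  out[idx] = in[src];
}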
@@ -48,21 +51,25 @@ __global__ void set_zero_cuda_kernel(const int64_t N, *(out + offset) = 0; } -template -__global__ void index_put_grad_cuda_kernel(const int64_t N, - const T* out_grad, - int64_t** indices, - phi::Array stride, - phi::Array shape, - T* value_grad) { - int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; - int64_t cur_ix = 0; - - if (idx >= N) { +template +__global__ void IndexPutGradCudaKernel( + const T* out_grad, + int64_t** indices, + phi::Array stride, + phi::Array shape, + const int rank, + const int64_t numel, + T* value_grad) { + int64_t idx = + static_cast(threadIdx.x) + + static_cast(blockDim.x) * static_cast(blockIdx.x); + if (idx >= numel) { return; } + + int64_t cur_ix = 0; int64_t offset = 0; - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < rank; ++i) { cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; @@ -73,12 +80,13 @@ __global__ void index_put_grad_cuda_kernel(const int64_t N, *(value_grad + idx) = *(out_grad + offset); } -template +template void LaunchIndexPutGradCudaKernel( const Context& dev_ctx, const std::vector& indices, const DenseTensor& out_grad, - bool accumulate, + const int rank, + const bool accumulate, DenseTensor* value_grad, DenseTensor* x_grad) { if (x_grad) { @@ -87,43 +95,41 @@ void LaunchIndexPutGradCudaKernel( T* x_grad_data = x_grad->data(); auto x_grad_dims = x_grad->dims(); - const int64_t numel = indices[0]->numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); auto x_grad_stride = phi::stride(x_grad_dims); - phi::Array stride_a; - phi::Array shape_a; - - for (size_t idx = 0; idx < Rank; ++idx) { - stride_a[idx] = x_grad_stride[idx]; - shape_a[idx] = x_grad_dims[idx]; + phi::Array stride_array; + phi::Array shape_array; + for (int i = 0; i < rank; ++i) { + stride_array[i] = x_grad_stride[i]; + shape_array[i] = x_grad_dims[i]; } + const int64_t numel = indices[0]->numel(); auto pd_indices = funcs::GetDevicePointerArray(dev_ctx, indices); - set_zero_cuda_kernel<<>>( - numel, pd_indices, stride_a, shape_a, x_grad_data); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + SetZeroCudaKernel<<>>( + pd_indices, stride_array, shape_array, rank, numel, x_grad_data); } } auto out_grad_dims = out_grad.dims(); - const int64_t numel = indices[0]->numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); auto out_grad_stride = phi::stride(out_grad_dims); - phi::Array stride_a; - phi::Array shape_a; - - for (size_t idx = 0; idx < Rank; ++idx) { - stride_a[idx] = out_grad_stride[idx]; - shape_a[idx] = out_grad_dims[idx]; + phi::Array stride_array; + phi::Array shape_array; + for (int i = 0; i < rank; ++i) { + stride_array[i] = out_grad_stride[i]; + shape_array[i] = out_grad_dims[i]; } + const int64_t numel = indices[0]->numel(); auto pd_indices = funcs::GetDevicePointerArray(dev_ctx, indices); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); if (value_grad) { if (value_grad->numel() == 1) { @@ -133,16 +139,16 @@ void LaunchIndexPutGradCudaKernel( T* tmp_value_grad_data = dev_ctx.template Alloc(&tmp_value_grad); auto out_grad_data = out_grad.data(); - index_put_grad_cuda_kernel - <<>>(numel, - out_grad_data, - pd_indices, - stride_a, - shape_a, - tmp_value_grad_data); + IndexPutGradCudaKernel<<>>(out_grad_data, + pd_indices, + stride_array, + shape_array, + rank, + numel, + tmp_value_grad_data); std::vector v_dims(tmp_value_grad.dims().size()); std::iota(v_dims.begin(), v_dims.end(), 0); @@ -157,11 
+163,16 @@ void LaunchIndexPutGradCudaKernel( T* value_grad_data = dev_ctx.template Alloc(value_grad); auto out_grad_data = out_grad.data(); - index_put_grad_cuda_kernel<<>>( - numel, out_grad_data, pd_indices, stride_a, shape_a, value_grad_data); + IndexPutGradCudaKernel<<>>(out_grad_data, + pd_indices, + stride_array, + shape_array, + rank, + numel, + value_grad_data); } else { DenseTensor tmp_value_grad(value_grad->dtype()); tmp_value_grad.Resize(indices[0]->dims()); @@ -169,16 +180,16 @@ void LaunchIndexPutGradCudaKernel( T* tmp_value_grad_data = dev_ctx.template Alloc(&tmp_value_grad); auto out_grad_data = out_grad.data(); - index_put_grad_cuda_kernel - <<>>(numel, - out_grad_data, - pd_indices, - stride_a, - shape_a, - tmp_value_grad_data); + IndexPutGradCudaKernel<<>>(out_grad_data, + pd_indices, + stride_array, + shape_array, + rank, + numel, + tmp_value_grad_data); std::vector after_dims = phi::vectorize(tmp_value_grad.dims()); std::vector before_dims = phi::vectorize(value_grad->dims()); @@ -234,7 +245,6 @@ void IndexPutGradKernel(const Context& dev_ctx, return; } - const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); std::vector res_dim_v(phi::vectorize(bd_dim)); @@ -256,37 +266,9 @@ void IndexPutGradKernel(const Context& dev_ctx, bd_dim, &res_dim_v); - switch (total_dims) { - case 1: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - case 2: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - case 3: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - case 4: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - case 5: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - case 6: - LaunchIndexPutGradCudaKernel( - dev_ctx, res_indices_v, out_grad, accumulate, value_grad, x_grad); - break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "dims of input tensor should be less than 7, But received" - "%d", - x.dims().size())); - } + const int rank = x.dims().size(); + LaunchIndexPutGradCudaKernel( + dev_ctx, res_indices_v, out_grad, rank, accumulate, value_grad, x_grad); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/index_put_kernel.cu b/paddle/phi/kernels/gpu/index_put_kernel.cu index 4244e755b6597..ccbd19aaba681 100644 --- a/paddle/phi/kernels/gpu/index_put_kernel.cu +++ b/paddle/phi/kernels/gpu/index_put_kernel.cu @@ -21,24 +21,27 @@ namespace phi { -template -__global__ void index_put_cuda_kernel(const int64_t N, - const T* x, - const T* vals, - int64_t** indices, - phi::Array stride, - phi::Array shape, - int64_t is_single_val_tensor, - bool accumulate, - T* out) { - int64_t idx = threadIdx.x + blockDim.x * blockIdx.x; +template +__global__ void IndexPutCudaKernel(const T* x, + const T* vals, + int64_t** indices, + phi::Array stride, + phi::Array shape, + const int rank, + const int64_t numel, + const int64_t is_single_val_tensor, + const bool accumulate, + T* out) { + int64_t idx = + static_cast(threadIdx.x) + + static_cast(blockDim.x) * static_cast(blockIdx.x); int64_t cur_ix = 0; - if (idx >= N) { + if (idx >= numel) { return; } int64_t offset = 0; - for (int i = 0; i < Rank; ++i) { + for (int i = 0; i < rank; ++i) { cur_ix = (static_cast(*(indices[i] + idx))); if (cur_ix < 0) { cur_ix += shape[i]; @@ -53,7 +56,7 @@ 
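The index_put kernels in the following hunks share one addressing step: each coordinate gathered from the index tensors may be negative and is wrapped once by the dimension size before the flat offset is accumulated. A sketch of that step as a standalone device helper (illustrative signature, not the actual phi API):

__device__ int64_t FlatOffset(int64_t* const* indices,
                              const int64_t* shape,
                              const int64_t* stride,
                              int rank, int64_t idx) {
  int64_t offset = 0;
  for (int i = 0; i < rank; ++i) {
    int64_t cur = indices[i][idx];
    if (cur < 0) cur += shape[i];  // Python-style negative indexing
    offset += stride[i] * cur;
  }
  return offset;
}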
__global__ void index_put_cuda_kernel(const int64_t N, } } -template +template void LaunchIndexPutCudaKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& indices, @@ -62,38 +65,39 @@ void LaunchIndexPutCudaKernel(const Context& dev_ctx, DenseTensor* out) { auto* x_data = x.data(); auto* val_data = value.data(); + bool is_initialized = out->initialized(); T* out_data = dev_ctx.template Alloc(out); - if (!is_initialized) { phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); } auto x_dims = x.dims(); - const int64_t numel = indices[0]->numel(); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + const int rank = x_dims.size(); auto x_stride = phi::stride(x_dims); - phi::Array stride_a; - phi::Array shape_a; - - for (size_t idx = 0; idx < Rank; ++idx) { - stride_a[idx] = x_stride[idx]; - shape_a[idx] = x_dims[idx]; + phi::Array stride_array; + phi::Array shape_array; + for (int i = 0; i < rank; ++i) { + stride_array[i] = x_stride[i]; + shape_array[i] = x_dims[i]; } int64_t is_single_val_tensor = (value.numel() == 1) ? 0 : INT64_MAX; - + const int64_t numel = indices[0]->numel(); auto pd_indices = funcs::GetDevicePointerArray(dev_ctx, indices); - index_put_cuda_kernel + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel); + IndexPutCudaKernel <<>>( - numel, x_data, val_data, pd_indices, - stride_a, - shape_a, + stride_array, + shape_array, + rank, + numel, is_single_val_tensor, accumulate, out_data); @@ -124,7 +128,6 @@ void IndexPutKernel(const Context& dev_ctx, } return; } - const size_t total_dims = x.dims().size(); auto bd_dim = funcs::BroadCastTensorsDims(int_indices_v); std::vector res_dim_v(phi::vectorize(bd_dim)); @@ -158,37 +161,8 @@ void IndexPutKernel(const Context& dev_ctx, ptr_value = &value; } - switch (total_dims) { - case 1: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - case 2: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - case 3: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - case 4: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - case 5: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - case 6: - LaunchIndexPutCudaKernel( - dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); - break; - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "dims of input tensor should be less than 7, But received" - "%d", - x.dims().size())); - } + LaunchIndexPutCudaKernel( + dev_ctx, x, res_indices_v, *ptr_value, accumulate, out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index 71095bf783b0b..5882cab4f4ee5 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -19,6 +19,7 @@ limitations under the License. 
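In the matmul registrations below, int8 inputs produce an int32 output via kernel->OutputAt(0).SetDataType(phi::DataType::INT32). The reason: a single int8 product already reaches 127 * 127 = 16129, far outside int8's [-128, 127] range, and a length-k dot product scales with k. A host-side illustration:

#include <cstdint>

// Accumulate int8 products in 32 bits to avoid overflow.
int32_t DotInt8(const int8_t* a, const int8_t* b, int k) {
  int32_t acc = 0;
  for (int i = 0; i < k; ++i) {
    acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
  }
  return acc;
}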
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" +#ifdef PADDLE_WITH_CUDA PD_REGISTER_KERNEL(matmul, GPU, ALL_LAYOUT, @@ -30,11 +31,46 @@ PD_REGISTER_KERNEL(matmul, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} - -PD_REGISTER_KERNEL( - matmul_int8, GPU, ALL_LAYOUT, phi::MatmulInt8Kernel, int8_t) {} + phi::dtype::complex, + int8_t) { + if (kernel_key.dtype() == phi::DataType::INT8) { + kernel->OutputAt(0).SetDataType(phi::DataType::INT32); + } +} +#else +PD_REGISTER_KERNEL(matmul, + GPU, + ALL_LAYOUT, + phi::MatmulKernel, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) { + if (kernel_key.dtype() == phi::DataType::INT8) { + kernel->OutputAt(0).SetDataType(phi::DataType::INT32); + } +} +#endif +#ifdef PADDLE_WITH_CUDA +PD_REGISTER_KERNEL(matmul_with_flatten, + GPU, + ALL_LAYOUT, + phi::MatmulWithFlattenKernel, + int8_t, + float, + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::INT8) { + kernel->OutputAt(0).SetDataType(phi::DataType::INT32); + } +} +#else PD_REGISTER_KERNEL(matmul_with_flatten, GPU, ALL_LAYOUT, @@ -42,4 +78,9 @@ PD_REGISTER_KERNEL(matmul_with_flatten, float, double, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::INT8) { + kernel->OutputAt(0).SetDataType(phi::DataType::INT32); + } +} +#endif diff --git a/paddle/phi/kernels/gpu/roll_grad_kernel.cu b/paddle/phi/kernels/gpu/roll_grad_kernel.cu index 3a523e58ca862..71d1cd356a269 100644 --- a/paddle/phi/kernels/gpu/roll_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_grad_kernel.cu @@ -22,8 +22,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void RollGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -31,23 +29,23 @@ void RollGradKernel(const Context& dev_ctx, const IntArray& shifts, const std::vector& axis, DenseTensor* x_grad) { - auto* in_data = out_grad.data(); - T* out_data = dev_ctx.template Alloc(x_grad); - int64_t numel = out_grad.numel(); - auto stream = dev_ctx.stream(); + auto* out_grad_data = out_grad.data(); + T* x_grad_data = dev_ctx.template Alloc(x_grad); auto shifts_data = shifts.GetData(); - size_t nums = shifts_data.size(); + int rank = shifts_data.size(); + + int64_t numel = out_grad.numel(); auto input_dim = out_grad.dims(); auto stride_dim = phi::stride(input_dim); - std::vector strides(nums), sizes(nums); + std::vector strides(rank), sizes(rank); if (axis.size() == 0) { strides[0] = 1; sizes[0] = numel; shifts_data[0] = ((-shifts_data[0]) % numel + numel) % numel; } else { - for (size_t i = 0; i < nums; i++) { + for (int i = 0; i < rank; i++) { int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; if (size != 0) { @@ -58,22 +56,14 @@ void RollGradKernel(const Context& dev_ctx, } } - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts_data.size())); - } + LaunchRollKernel(dev_ctx, + out_grad_data, + x_grad_data, + rank, + numel, + shifts_data, + strides, + sizes); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/roll_kernel.cu b/paddle/phi/kernels/gpu/roll_kernel.cu index 0e87713df73aa..cf4f87ac11854 100644 --- a/paddle/phi/kernels/gpu/roll_kernel.cu +++ b/paddle/phi/kernels/gpu/roll_kernel.cu @@ -23,8 +23,6 @@ namespace phi { -using phi::PADDLE_CUDA_NUM_THREADS; - template void RollKernel(const Context& dev_ctx, const DenseTensor& x, @@ -33,22 +31,21 @@ void RollKernel(const Context& dev_ctx, DenseTensor* out) { auto* in_data = x.data(); T* out_data = dev_ctx.template Alloc(out); - int64_t numel = x.numel(); - auto stream = dev_ctx.stream(); auto shifts_data = shifts.GetData(); + int rank = shifts_data.size(); - size_t nums = shifts_data.size(); + int64_t numel = x.numel(); auto input_dim = x.dims(); auto stride_dim = phi::stride(input_dim); - std::vector strides(nums), sizes(nums); + std::vector strides(rank), sizes(rank); if (axis.size() == 0) { strides[0] = 1; sizes[0] = numel; shifts_data[0] = (shifts_data[0] % numel + numel) % numel; } else { - for (size_t i = 0; i < nums; i++) { + for (int i = 0; i < rank; i++) { int dim = axis[i] >= 0 ? 
axis[i] : axis[i] + input_dim.size(); int64_t size = input_dim[dim]; @@ -60,22 +57,8 @@ void RollKernel(const Context& dev_ctx, } } - switch (nums) { - CALL_ROLL_CUDA_KERNEL(1); - CALL_ROLL_CUDA_KERNEL(2); - CALL_ROLL_CUDA_KERNEL(3); - CALL_ROLL_CUDA_KERNEL(4); - CALL_ROLL_CUDA_KERNEL(5); - CALL_ROLL_CUDA_KERNEL(6); - CALL_ROLL_CUDA_KERNEL(7); - CALL_ROLL_CUDA_KERNEL(8); - CALL_ROLL_CUDA_KERNEL(9); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "shifts.size() should be less than 10, But received shifts.size() " - "= %d", - shifts_data.size())); - } + LaunchRollKernel( + dev_ctx, in_data, out_data, rank, numel, shifts_data, strides, sizes); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/roll_kernel_impl.h b/paddle/phi/kernels/gpu/roll_kernel_impl.h index d3aa8798008a9..38e2a6ff669ad 100644 --- a/paddle/phi/kernels/gpu/roll_kernel_impl.h +++ b/paddle/phi/kernels/gpu/roll_kernel_impl.h @@ -22,23 +22,25 @@ namespace phi { using phi::PADDLE_CUDA_NUM_THREADS; -template +template __global__ void RollCudaKernel(const T* input, T* output, - int64_t N, - phi::Array shifts, - phi::Array strides, - phi::Array sizes) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { + const int rank, + const int64_t numel, + phi::Array shifts, + phi::Array strides, + phi::Array sizes) { + int64_t idx = + static_cast(blockIdx.x) * static_cast(blockDim.x) + + static_cast(threadIdx.x); + if (idx >= numel) { return; } int64_t output_idx = idx; int64_t new_dim_idx = 0; -#pragma unroll - for (size_t i = 0; i < Rank; i++) { + for (size_t i = 0; i < rank; i++) { new_dim_idx = (output_idx / strides[i]) % sizes[i] + shifts[i]; if (new_dim_idx >= sizes[i]) { output_idx += (shifts[i] - sizes[i]) * strides[i]; @@ -49,22 +51,33 @@ __global__ void RollCudaKernel(const T* input, output[output_idx] = input[idx]; } -#define CALL_ROLL_CUDA_KERNEL(N) \ - case N: { \ - phi::Array _strides; \ - phi::Array _shifts; \ - phi::Array _sizes; \ - for (size_t idx = 0; idx < N; ++idx) { \ - _strides[idx] = strides[idx]; \ - _shifts[idx] = shifts_data[idx]; \ - _sizes[idx] = sizes[idx]; \ - } \ - RollCudaKernel \ - <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ - PADDLE_CUDA_NUM_THREADS, \ - 0, \ - stream>>>(in_data, out_data, numel, _shifts, _strides, _sizes); \ - break; \ +template +void LaunchRollKernel(const Context& dev_ctx, + const T* input, + T* output, + const int rank, + const int64_t numel, + const std::vector shifts, + const std::vector strides, + const std::vector sizes) { + using phi::PADDLE_CUDA_NUM_THREADS; + + phi::Array strides_array; + phi::Array shifts_array; + phi::Array sizes_array; + for (int i = 0; i < rank; ++i) { + strides_array[i] = strides[i]; + shifts_array[i] = shifts[i]; + sizes_array[i] = sizes[i]; } + auto stream = dev_ctx.stream(); + RollCudaKernel + <<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, + 0, + stream>>>( + input, output, rank, numel, shifts_array, strides_array, sizes_array); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/compare_kernel_impl.h b/paddle/phi/kernels/impl/compare_kernel_impl.h index 92e10afc50a42..907bd5a20a104 100644 --- a/paddle/phi/kernels/impl/compare_kernel_impl.h +++ b/paddle/phi/kernels/impl/compare_kernel_impl.h @@ -30,20 +30,35 @@ inline void CompareKernelImpl(const Context& ctx, int axis, DenseTensor* out); +template +inline void InplaceCompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + 
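The InplaceCompareKernelImpl declaration above serves the in-place variants, where out shares its allocation with x and the kernel re-types that shared buffer to BOOL; the implementations later in this diff therefore keep a holder copy (auto x_origin = x;) alive before writing. A standalone model of that aliasing hazard, sketched with plain standard-library types rather than phi tensors:

#include <cassert>
#include <cstddef>
#include <memory>
#include <vector>

int main() {
  // buf stands in for the allocation shared by x and out.
  auto buf = std::make_shared<std::vector<int>>(std::vector<int>{1, 5, 3});
  auto x_origin = buf;  // mirrors "auto x_origin = x;" in the in-place impls
  std::vector<bool> result(x_origin->size());
  for (std::size_t i = 0; i < x_origin->size(); ++i) {
    result[i] = (*x_origin)[i] < 4;  // e.g. a less_than comparison
  }
  buf.reset();                  // only now may out repurpose the allocation
  assert(x_origin != nullptr);  // input payload stayed alive during compute
  return 0;
}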
template inline void CompareAllKernelImpl(const Context& ctx, const DenseTensor& x, const DenseTensor& y, DenseTensor* out); -#define DEFINE_COMPARE_KERNEL(name, functor, inverse_functor) \ - template \ - void name##Kernel(const Context& ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - CompareKernelImpl, inverse_functor>( \ - ctx, x, y, -1, out); \ +#define DEFINE_COMPARE_KERNEL(name, functor, inverse_functor) \ + template \ + void name##Kernel(const Context& ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + if (out->IsSharedWith(x)) { \ + InplaceCompareKernelImpl, inverse_functor>( \ + ctx, x, y, -1, out); \ + } else { \ + CompareKernelImpl, inverse_functor>( \ + ctx, x, y, -1, out); \ + } \ } DEFINE_COMPARE_KERNEL(LessThan, diff --git a/paddle/phi/kernels/impl/fill_kernel_impl.h b/paddle/phi/kernels/impl/fill_kernel_impl.h index 6894204cd06a4..4e8cda48f6dd6 100644 --- a/paddle/phi/kernels/impl/fill_kernel_impl.h +++ b/paddle/phi/kernels/impl/fill_kernel_impl.h @@ -27,9 +27,9 @@ void FillKernel(const Context& dev_ctx, const DenseTensor& x UNUSED, const Scalar& value, DenseTensor* out) { - T fill_var = value.to(); + double fill_var = value.to(); - PADDLE_ENFORCE_EQ(std::isnan(static_cast(fill_var)), + PADDLE_ENFORCE_EQ(std::isnan(fill_var), false, phi::errors::InvalidArgument("fill value should not be NaN," " but received NaN")); @@ -37,7 +37,7 @@ void FillKernel(const Context& dev_ctx, dev_ctx.template Alloc(out); phi::funcs::SetConstant functor; - functor(dev_ctx, out, fill_var); + functor(dev_ctx, out, value.to()); } } // namespace phi diff --git a/paddle/phi/kernels/impl/matmul_kernel_impl.h b/paddle/phi/kernels/impl/matmul_kernel_impl.h index e680e164e623d..b3b9d82d19eec 100644 --- a/paddle/phi/kernels/impl/matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_kernel_impl.h @@ -954,14 +954,6 @@ struct MatMulDispatcher { } }; -static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, - size_t workspace_size) { - return phi::memory_utils::Alloc( - ctx.GetPlace(), - workspace_size, - phi::Stream(reinterpret_cast(ctx.stream()))); -} - #endif // PADDLE_WITH_CUDA template @@ -979,7 +971,7 @@ void MatMulFunction(const Context& ctx, } template -void MatMulInt8Function(const Context& ctx, +bool MatMulInt8Function(const Context& ctx, const DenseTensor& x, const DenseTensor& y, const std::vector& x_dims, @@ -987,49 +979,245 @@ void MatMulInt8Function(const Context& ctx, DenseTensor* out, bool trans_x, bool trans_y) { - PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(x) used in int8 matmul must be (%s) does not " - "match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); - PADDLE_ENFORCE_EQ( - y.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(y) used in int8 matmul must be (%s) does not " - "match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11020 + return false; +} + +#ifdef PADDLE_WITH_CUDA +template <> +bool inline MatMulInt8Function(const phi::GPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const std::vector& x_dims, + const std::vector& y_dims, + DenseTensor* out, + bool trans_x, + bool trans_y) { + if (x.dtype() != DataType::INT8 || y.dtype() != DataType::INT8) { + return false; + } +#if CUDA_VERSION >= 11060 
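The CUDA_VERSION >= 11060 guard above restricts this cublasLt path to CUDA 11.6 and newer. Independently of that, both matmul registrations earlier in this diff force an INT32 output for INT8 inputs, which is plain integer-range arithmetic: a single K-length dot product of int8 values overflows int8 almost immediately. A minimal standalone illustration (K here is a hypothetical reduction length):

#include <cstdint>
#include <cstdio>

int main() {
  const int K = 64;  // hypothetical reduction length
  std::int32_t acc = 0;
  for (int k = 0; k < K; ++k) {
    acc += std::int32_t(127) * std::int32_t(-128);  // worst-case int8 product
  }
  std::printf("acc = %d\n", acc);  // -1040384, far outside [-128, 127]
  return 0;
}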
const int x_ndim = x_dims.size(); const int y_ndim = y_dims.size(); - PADDLE_ENFORCE_EQ( - x_ndim, - 2, - phi::errors::InvalidArgument("[INT8 GEMM] The number of dims of input(x) " - "must be equal to 2 but received %d", - x_ndim)); - PADDLE_ENFORCE_EQ( - y_ndim, - 2, - phi::errors::InvalidArgument("[INT8 GEMM] The number of dims of input(x) " - "must be equal to 2 but received %d", - y_ndim)); - PADDLE_ENFORCE_EQ( + const int8_t* x_data = x.data(); + const int8_t* y_data = y.data(); + using blaslt = phi::funcs::MatmulWithCublasLt; + + phi::funcs::MatmulPlanner matmul_planner( + x_dims, + y_dims, trans_x, - false, - phi::errors::InvalidArgument("[INT8 GEMM] Input(x) must be not " - "transposed to acheive better performance")); - PADDLE_ENFORCE_EQ( trans_y, - true, - phi::errors::InvalidArgument("[INT8 GEMM] Input(y) must be transposed to " - "acheive better performance")); + phi::CppTypeToDataType::Type(), + funcs::MatmulFusedType::kMatmul, + /* bias_data */ nullptr, + /* reserve_data */ nullptr, + /* use_addto */ false, + /* no_exchange */ true); + + if (x_ndim == 1 && y_ndim == 1) { + const int M = x.numel(); + const int N = y.numel(); + PADDLE_ENFORCE_EQ( + M, + N, + phi::errors::InvalidArgument( + "X's numel must be equal to Y's numel " + "when X and Y are both 1-D. But received X has [%d] elements " + "and Y has [%d] elements", + M, + N)); + if (!(M % 4 == 0)) { + return false; + } + + out->Resize(phi::make_ddim({})); + ctx.template Alloc(out); + blaslt::Run(ctx, + y_data, + x_data, + ctx.template Alloc(out), + 1, + 1, + M, + false, + true, + &matmul_planner); + return true; + } + if (x_ndim == 1) { + const int N = x.numel(); + if (trans_y) { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 1], + N, + phi::errors::InvalidArgument("Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, + N, + y_ndim - 1, + y_dims[y_ndim - 1])); + if (!(N % 4 == 0)) { + return false; + } + } else { + PADDLE_ENFORCE_EQ( + y_dims[y_ndim - 2], + N, + phi::errors::InvalidArgument("Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, + N, + y_ndim - 2, + y_dims[y_ndim - 2])); + const int M = y.numel() / N; + if (!(M == 1 || M % 4 == 0)) { + return false; + } + } + std::vector out_dims(y_ndim - 1); + if (trans_y) { + std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); + } else { + std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); + out_dims.back() = y_dims.back(); + } + out->ResizeAndAllocate(phi::make_ddim(out_dims)); + ctx.template Alloc(out); + if (trans_y) { + const int M = y.numel() / N; + blaslt::Run(ctx, + y_data, + x_data, + ctx.template Alloc(out), + M, + 1, + N, + false, + false, + &matmul_planner); + } else { + const int M = y_dims[y_ndim - 1]; + const int batch_size = y.numel() / (M * N); + if (batch_size == 1) { + blaslt::Run(ctx, + y_data, + x_data, + ctx.template Alloc(out), + M, + 1, + N, + true, + false, + &matmul_planner); + } else { + blaslt::RunWithBatch(ctx, + y_data, + x_data, + ctx.template Alloc(out), + M, + 1, + N, + true, + false, + batch_size, + M * N, + 0, + M, + &matmul_planner); + } + } + return true; + } + + if (y_ndim == 1) { + const int N = y.numel(); + if (trans_x) { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 2], + N, + phi::errors::InvalidArgument("Input(X) has error dim."
+ "X'dims[%d] must be equal to %d" + "But received X'dims[%d] is %d", + x_ndim - 2, + N, + x_ndim - 2, + x_dims[x_ndim - 2])); + const int M = x.numel() / N; + if (!((M == 1 || M % 4 == 0))) { + return false; + } + } else { + PADDLE_ENFORCE_EQ( + x_dims[x_ndim - 1], + N, + phi::errors::InvalidArgument("Input(X) has error dim." + "X'dims[%d] must be equal to %d" + "But received X'dims[%d] is %d", + x_ndim - 1, + N, + x_ndim - 1, + x_dims[x_ndim - 1])); + if (N % 4 != 0) { + return false; + } + } + std::vector out_dims(x_ndim - 1); + if (trans_x) { + std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); + out_dims.back() = x_dims.back(); + } else { + std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); + } + out->ResizeAndAllocate(phi::make_ddim(out_dims)); + ctx.template Alloc(out); + + if (trans_x) { + const int M = x_dims[x_ndim - 1]; + const int batch_size = x.numel() / (M * N); + if (batch_size == 1) { + blaslt::Run(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + 1, + N, + true, + false, + &matmul_planner); + } else { + blaslt::RunWithBatch(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + 1, + N, + true, + false, + batch_size, + M * N, + 0, + M, + &matmul_planner); + } + } else { + const int M = x.numel() / N; + blaslt::Run(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + 1, + N, + false, + false, + &matmul_planner); + } + return true; + } const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; @@ -1057,27 +1245,186 @@ void MatMulInt8Function(const Context& ctx, y_dims[y_ndim - 2])); } const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; + const int ndim = (std::max)(x_ndim, y_ndim); + std::vector x_broadcast_dims(ndim); + std::vector y_broadcast_dims(ndim); + std::vector out_broadcast_dims(ndim); + GetBroadcastFromDims(x_ndim - 2, + x_dims.data(), + y_ndim - 2, + y_dims.data(), + x_broadcast_dims.data(), + y_broadcast_dims.data(), + out_broadcast_dims.data()); + out_broadcast_dims[ndim - 2] = M; + out_broadcast_dims[ndim - 1] = N; - size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); + out->ResizeAndAllocate(phi::make_ddim(out_broadcast_dims)); + ctx.template Alloc(out); - // TODO(wufeisheng): cublaslt_helper is a temp scheme for Int8 GEMM, - // and releted functions need to be integrated into - // phi::funcs::MatmulWithCublasLt - auto cublaslt_helper = CublasLtHelper(M, K, N, ctx.cublaslt_handle()); + const int batch_dim = ndim - 2; + // broadcast message + const bool is_broadcast_dims = + !std::equal(x_broadcast_dims.cbegin(), + x_broadcast_dims.cbegin() + batch_dim, + y_broadcast_dims.cbegin()); - ctx.template Alloc(out); - cublaslt_helper.GEMM(x.data(), - y.data(), - out->data(), - ctx.stream(), - workspace->ptr()); + const std::int64_t x_batch_size = + std::accumulate(x_broadcast_dims.cbegin(), + x_broadcast_dims.cbegin() + batch_dim, + 1LL, + std::multiplies()); + const std::int64_t y_batch_size = + std::accumulate(y_broadcast_dims.cbegin(), + y_broadcast_dims.cbegin() + batch_dim, + 1LL, + std::multiplies()); + const std::int64_t out_batch_size = + std::accumulate(out_broadcast_dims.cbegin(), + out_broadcast_dims.cbegin() + batch_dim, + 1LL, + std::multiplies()); + if (out_batch_size == 0) return true; + if (x_batch_size == 1 && M == 1 && trans_y) { + if (!(K % 4 == 0)) { + return false; + } + } else if (!trans_x && !trans_y) { + if (!(N % 4 == 0 || N == 1) || !(K % 4 == 
0) || (M == 1 && N == 1)) { + return false; + } + } else if (!trans_x && trans_y) { + if (!(K % 4 == 0)) { + return false; + } + } else if (trans_x && !trans_y) { + if (!(M % 4 == 0 || M == 1) || !(N % 4 == 0 || N == 1)) { + return false; + } + } else { + if (!(M % 4 == 0 || M == 1) || !(K % 4 == 0)) { + return false; + } + } + if (x_batch_size == 1 && y_batch_size == 1) { + blaslt::Run(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + N, + K, + trans_x, + trans_y, + &matmul_planner); + } else if (x_batch_size == 1) { + if (M == 1 && trans_y) { + blaslt::Run(ctx, + y_data, + x_data, + ctx.template Alloc(out), + y_batch_size * N, + 1, + K, + false, + false, + &matmul_planner); + } else { + blaslt::RunWithBatch(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + N, + K, + trans_x, + trans_y, + out_batch_size, + 0, + K * N, + M * N, + &matmul_planner); + } + } else if (y_batch_size == 1) { + if (!trans_x) { + blaslt::Run(ctx, + x_data, + y_data, + ctx.template Alloc(out), + x_batch_size * M, + N, + K, + false, + trans_y, + &matmul_planner); + } else { + blaslt::RunWithBatch(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + N, + K, + true, + trans_y, + out_batch_size, + M * K, + 0, + M * N, + &matmul_planner); + } + } else if (!is_broadcast_dims) { + blaslt::RunWithBatch(ctx, + x_data, + y_data, + ctx.template Alloc(out), + M, + N, + K, + trans_x, + trans_y, + out_batch_size, + M * K, + K * N, + M * N, + &matmul_planner); + } else { + // in this case, strided GEMM can't be used + std::vector x_ptr(out_batch_size); + std::vector y_ptr(out_batch_size); + std::vector out_ptr(out_batch_size); + std::vector index(batch_dim, 0); + for (std::int64_t i = 0; i < out_batch_size; ++i) { + // use the index to compute each batch's offset + const std::int64_t x_index = + GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); + const std::int64_t y_index = + GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); + + x_ptr[i] = x_data + x_index * M * K; + y_ptr[i] = y_data + y_index * K * N; + out_ptr[i] = ctx.template Alloc(out) + i * M * N; + IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); + } + blaslt::RunWithBatch(ctx, + x_ptr.data(), + y_ptr.data(), + out_ptr.data(), + M, + N, + K, + trans_x, + trans_y, + out_batch_size, + &matmul_planner); + } + return true; #else - PADDLE_THROW(phi::errors::Unimplemented( - "MatmulInt8 op needs paddle with cuda and cuda version >= 11.2")); + return false; #endif } +#endif template typename std::enable_if::value>::type @@ -1089,6 +1436,11 @@ MatmulJudgeDtypeKernel(const Context& ctx, DenseTensor* out, bool transpose_x, bool transpose_y) { + bool try_matmul_int8 = MatMulInt8Function( + ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); + if (try_matmul_int8) { + return; + } auto x_tmp = phi::Cast(ctx, x, phi::DataType::FLOAT32); auto y_tmp = phi::Cast(ctx, y, phi::DataType::FLOAT32); DenseTensor out_tmp; @@ -1135,35 +1487,12 @@ void MatmulKernel(const Context& ctx, } template -void MatmulInt8Kernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - PADDLE_ENFORCE_NE( - phi::product(x.dims()), - 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( - phi::product(y.dims()), - 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); - const std::vector x_dims = vectorize(x.dims()); - const std::vector y_dims = vectorize(y.dims()); - MatMulInt8Function( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulWithFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { +void MatmulWithFlattenKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { const DenseTensor x_matrix = x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; const DenseTensor y_matrix = @@ -1183,4 +1512,170 @@ void MatmulWithFlattenKernel(const Context& dev_ctx, } } +#ifdef PADDLE_WITH_CUDA + +template +void MatmulWithFlattenKernelInt8Impl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + x.dtype(), + DataType::INT8, + phi::errors::InvalidArgument( + "The type of input(x) used in int8 mul must be (%s), " + "which does not match the " + "type of data (%s) currently contained in the container.", + phi::CppTypeToDataType::Type(), + x.dtype())); + PADDLE_ENFORCE_EQ( + y.dtype(), + DataType::INT8, + phi::errors::InvalidArgument( + "The type of input(y) used in int8 mul must be (%s), " + "which does not match the " + "type of data (%s) currently contained in the container.", + phi::CppTypeToDataType::Type(), + y.dtype())); + + const DenseTensor x_matrix = + x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; + const DenseTensor y_matrix = + y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; + + PADDLE_ENFORCE_EQ( + x_matrix.dims()[1], + y_matrix.dims()[0], + phi::errors::InvalidArgument( + "X's number of columns must be equal to Y's number of rows."
+ "But received X has [%d] columns " + "and Y has [%d] rows", + x_matrix.dims()[1], + y_matrix.dims()[0])); + + PADDLE_ENFORCE_EQ((y_matrix.dims()[1] % 4 == 0 || y_matrix.dims()[1] == 1), + true, + phi::errors::InvalidArgument( + "The dimension size N used in int8 mul must be 1 " + "or a multiple of 4, but received size (%d).", + y_matrix.dims()[1])); + PADDLE_ENFORCE_EQ((x_matrix.dims()[1] % 4 == 0), + true, + phi::errors::InvalidArgument( + "The dimension size K used in int8 mul must be a " + "multiple of 4, but received size (%d).", + x_matrix.dims()[1])); + + dev_ctx.template Alloc(out); + auto z_dim = out->dims(); + if (z_dim.size() != 2) { + out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); + } + +#if CUDA_VERSION >= 11060 + using blaslt = phi::funcs::MatmulWithCublasLt; + + const int8_t* x_data = x_matrix.data(); + const int8_t* y_data = y_matrix.data(); + + std::vector x_dims = {x_matrix.dims()[0], x_matrix.dims()[1]}; + std::vector y_dims = {y_matrix.dims()[0], y_matrix.dims()[1]}; + phi::funcs::MatmulPlanner matmul_planner( + x_dims, + y_dims, + false, + false, + phi::CppTypeToDataType::Type(), + funcs::MatmulFusedType::kMatmul, + /* bias_data */ nullptr, + /* reserve_data */ nullptr, + /* use_addto */ false, + /* no_exchange */ true); + + blaslt::Run(dev_ctx, + x_data, + y_data, + dev_ctx.template Alloc(out), + x_matrix.dims()[0], + y_matrix.dims()[1], + x_matrix.dims()[1], + false, + false, + &matmul_planner); + + if (z_dim.size() != 2) { + out->Resize(z_dim); + } +#endif +} +#endif + +#ifdef PADDLE_WITH_CUDA +template +typename std::enable_if::value, + void>::type +DispatchMatmulWithFlattenInt8Kernel(const phi::GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + MatmulWithFlattenKernelInt8Impl( + dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); +} +#endif + +template +typename std::enable_if::value, + void>::type +DispatchMatmulWithFlattenInt8Kernel(const phi::CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + PADDLE_THROW(phi::errors::Unimplemented( + "MatmulWithFlatten with CPU is NOT implemented " + "yet.")); +} + +template +typename std::enable_if::value, void>::type +DispatchMatmulFlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + DispatchMatmulWithFlattenInt8Kernel( + dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); +} + +template +typename std::enable_if::value, void>::type +DispatchMatmulFlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + MatmulWithFlattenKernelImpl( + dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); +} + +template +void MatmulWithFlattenKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int x_num_col_dims, + int y_num_col_dims, + DenseTensor* out) { + DispatchMatmulFlattenKernel( + dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); +} + } // namespace phi diff --git a/paddle/phi/kernels/kps/compare_kernel.cu b/paddle/phi/kernels/kps/compare_kernel.cu index 728298be2cd83..545a9df2961bf 100644 --- a/paddle/phi/kernels/kps/compare_kernel.cu +++ b/paddle/phi/kernels/kps/compare_kernel.cu @@ -52,16 +52,27 @@ inline void CompareKernelImpl(const 
Context& ctx, const DenseTensor& y, int axis, DenseTensor* out) { - if (!out->IsSharedWith(x)) { - ctx.template Alloc(out); - } + ctx.template Alloc(out); std::vector ins{&x, &y}; std::vector outs{out}; - if (!out->IsSharedWith(x)) { - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); - } else { - funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); - } + funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); +} + +template +inline void InplaceCompareKernelImpl(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + auto x_origin = x; + ctx.template Alloc(out); + out->set_type(phi::DataType::BOOL); + std::vector ins{&x_origin, &y}; + std::vector outs{out}; + funcs::BroadcastKernel(ctx, ins, &outs, Functor(), axis); } #ifndef PADDLE_WITH_XPU_KP @@ -134,18 +145,21 @@ PD_REGISTER_KERNEL(equal_all, kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); } -#define PD_REGISTER_COMPARE_KERNEL(name, func) \ - PD_REGISTER_KERNEL(name, \ - KPS, \ - ALL_LAYOUT, \ - phi::func##Kernel, \ - bool, \ - int16_t, \ - int, \ - int64_t, \ - float, \ - double, \ - phi::dtype::float16) {} +#define PD_REGISTER_COMPARE_KERNEL(name, func) \ + PD_REGISTER_KERNEL(name, \ + KPS, \ + ALL_LAYOUT, \ + phi::func##Kernel, \ + bool, \ + int16_t, \ + int, \ + int64_t, \ + float, \ + double, \ + phi::dtype::float16, \ + phi::dtype::bfloat16) { \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ + } PD_REGISTER_COMPARE_KERNEL(less_than, LessThan) PD_REGISTER_COMPARE_KERNEL(less_equal, LessEqual) diff --git a/paddle/phi/kernels/kps/elementwise_add_kernel.cu b/paddle/phi/kernels/kps/elementwise_add_kernel.cu deleted file mode 100644 index b3fe46a1cd310..0000000000000 --- a/paddle/phi/kernels/kps/elementwise_add_kernel.cu +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
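The add kernels deleted in the file below reappear in elementwise_kernel.cu further down in this diff, including the fp32 + bf16/fp16 mixed-precision path. That path is cheap because bfloat16 is the top 16 bits of an IEEE-754 float32, so widening is a single shift; a standalone sketch of the widening (bit manipulation only, not the phi functor):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Widen a bfloat16 bit pattern to float: bf16 keeps the sign, the full
// 8-bit exponent, and the top 7 mantissa bits of a float32.
static float bf16_to_float(std::uint16_t b) {
  std::uint32_t u = static_cast<std::uint32_t>(b) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

int main() {
  const std::uint16_t y = 0x3FC0;  // bf16 encoding of 1.5f
  const float x = 2.25f;
  std::printf("%g\n", x + bf16_to_float(y));  // 3.75, accumulated in fp32
  return 0;
}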
- -#include "paddle/phi/backends/gpu/gpu_context.h" -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#endif -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/elementwise_add_kernel.h" -#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" - -namespace phi { - -template -void AddCudaFunctor(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::AddFunctor(), axis); -} - -template -void Float32Bfloat16OrFloat16AddCudaFunctor(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - if (y.dtype() == phi::DataType::BFLOAT16) { - funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, funcs::Float32Bfloat16AddFunctor()); - } else if (y.dtype() == phi::DataType::FLOAT16) { - funcs::ElementwiseKernel( - dev_ctx, inputs, &outputs, funcs::Float32Float16AddFunctor()); - } else { - PADDLE_THROW(phi::errors::InvalidArgument( - "Unsupport x dtype:%s, y dtype:%s for add(x, y) operation", - phi::DataTypeToString(x.type()), - phi::DataTypeToString(y.type()))); - } -} - -template -void AddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { -#ifdef PADDLE_WITH_CUDA - if (x.dtype() == phi::DataType::FLOAT32 && - (y.dtype() == phi::DataType::BFLOAT16 || - y.dtype() == phi::DataType::FLOAT16)) { - using Type = DataTypeToCppType::type; - Float32Bfloat16OrFloat16AddCudaFunctor(dev_ctx, x, y, out); - } else { -#endif - AddCudaFunctor(dev_ctx, x, y, -1, out); -#ifdef PADDLE_WITH_CUDA - } -#endif -} - -template -void GradAddKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - AddCudaFunctor(dev_ctx, x, y, -1, out); -} - -} // namespace phi - -#ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {} -#else - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(add, - KPS, - ALL_LAYOUT, - phi::AddKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} - -PD_REGISTER_KERNEL(grad_add, - KPS, - ALL_LAYOUT, - phi::GradAddKernel, - float, - double, - int16_t, - int, - int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - complex64, - complex128) {} -#endif diff --git a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu b/paddle/phi/kernels/kps/elementwise_divide_kernel.cu deleted file mode 100644 index 1f4e4ad05adde..0000000000000 --- a/paddle/phi/kernels/kps/elementwise_divide_kernel.cu +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/gpu/gpu_context.h" -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#endif -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" - -namespace phi { - -template -void DivideKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::DivideFunctor(), -1); -} - -} // namespace phi - -#ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {} -#else - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(divide, - KPS, - ALL_LAYOUT, - phi::DivideKernel, - float, - double, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} - -#endif diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index e88714c370be9..e4eacda7d98fb 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -18,11 +18,129 @@ #include "paddle/phi/common/float16.h" #endif #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" #include "paddle/phi/kernels/legacy/elementwise_kernel.h" namespace phi { +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector inputs; + inputs.reserve(2); + std::vector outputs; + outputs.reserve(1); + inputs.emplace_back(&x); + inputs.emplace_back(&y); + outputs.emplace_back(out); + dev_ctx.template Alloc(out); + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, funcs::SubtractFunctor(), -1); +} + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector inputs; + inputs.reserve(2); + std::vector outputs; + outputs.reserve(1); + inputs.emplace_back(&x); + inputs.emplace_back(&y); + outputs.emplace_back(out); + dev_ctx.template Alloc(out); + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, funcs::MultiplyFunctor(), -1); +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector inputs; + inputs.reserve(2); + std::vector outputs; + outputs.reserve(1); + inputs.emplace_back(&x); + inputs.emplace_back(&y); + outputs.emplace_back(out); + dev_ctx.template Alloc(out); + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, funcs::DivideFunctor(), -1); +} + +template +void AddKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + std::vector 
inputs = {&x, &y}; + std::vector outputs = {out}; + dev_ctx.template Alloc(out); + funcs::BroadcastKernel( + dev_ctx, inputs, &outputs, funcs::AddFunctor(), axis); +} + +template +void MultiPrecisionAddKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector inputs = {&x, &y}; + std::vector outputs = {out}; + if (y.dtype() == phi::DataType::BFLOAT16) { + funcs::ElementwiseKernel( + dev_ctx, + inputs, + &outputs, + funcs::MultiPrecisionAddFunctor()); + } else if (y.dtype() == phi::DataType::FLOAT16) { + funcs::ElementwiseKernel( + dev_ctx, + inputs, + &outputs, + funcs::MultiPrecisionAddFunctor()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Unsupported x dtype:%s, y dtype:%s for add(x, y) operation", + phi::DataTypeToString(x.type()), + phi::DataTypeToString(y.type()))); + } +} + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { +#ifdef PADDLE_WITH_CUDA + if (x.dtype() == phi::DataType::FLOAT32 && + (y.dtype() == phi::DataType::BFLOAT16 || + y.dtype() == phi::DataType::FLOAT16)) { + MultiPrecisionAddKernelImpl(dev_ctx, x, y, out); + } else { +#endif + AddKernelImpl(dev_ctx, x, y, -1, out); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +template +void GradAddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + AddKernelImpl(dev_ctx, x, y, -1, out); +} + template void MaximumKernel(const Context& dev_ctx, const DenseTensor& x, @@ -58,6 +176,7 @@ void FloorDivideKernel(const Context& dev_ctx, int axis = -1; FloorDivideRawKernel(dev_ctx, x, y, axis, out); } + // Create the definition of Heaviside template void HeavisideKernel(const Context& dev_ctx, @@ -148,6 +267,10 @@ PD_REGISTER_KERNEL(elementwise_pow, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(maximum, KPS, ALL_LAYOUT, phi::MaximumKernel, float) {} PD_REGISTER_KERNEL(minimum, KPS, ALL_LAYOUT, phi::MinimumKernel, float) {} +PD_REGISTER_KERNEL(divide, KPS, ALL_LAYOUT, phi::DivideKernel, float) {} +PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {} +PD_REGISTER_KERNEL(add, KPS, ALL_LAYOUT, phi::AddKernel, float) {} +PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {} PD_REGISTER_KERNEL(floor_divide, KPS, ALL_LAYOUT, phi::FloorDivideKernel, int) { } PD_REGISTER_KERNEL( @@ -191,4 +314,74 @@ PD_REGISTER_KERNEL(heaviside, float16, bfloat16, int64_t) {} + +PD_REGISTER_KERNEL(add, + KPS, + ALL_LAYOUT, + phi::AddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(grad_add, + KPS, + ALL_LAYOUT, + phi::GradAddKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(divide, + KPS, + ALL_LAYOUT, + phi::DivideKernel, + float, + double, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} + +PD_REGISTER_KERNEL(multiply, + KPS, + ALL_LAYOUT, + phi::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128, + bfloat16) {} + +PD_REGISTER_KERNEL(subtract, + KPS, + ALL_LAYOUT, + phi::SubtractKernel, + float, + double, + int16_t, + int, + int64_t, + float16, + bfloat16, + complex64, + complex128) {} + #endif diff --git a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu b/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu deleted file mode 100644 index 
120f49e0d0b7d..0000000000000 --- a/paddle/phi/kernels/kps/elementwise_multiply_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/gpu/gpu_context.h" -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#endif -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" - -namespace phi { - -template -void MultiplyKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::MultiplyFunctor(), -1); -} - -} // namespace phi - -#ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(multiply, KPS, ALL_LAYOUT, phi::MultiplyKernel, float) {} -#else - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(multiply, - KPS, - ALL_LAYOUT, - phi::MultiplyKernel, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128, - bfloat16) {} - -#endif diff --git a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu b/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu deleted file mode 100644 index 4f6015990e216..0000000000000 --- a/paddle/phi/kernels/kps/elementwise_subtract_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
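The multiply kernel deleted above and the subtract kernel deleted below are thin wrappers over funcs::BroadcastKernel, whose semantics are NumPy-style broadcasting of mismatched shapes. A tiny standalone model of that rule, restricted to a (3,1) + (1,4) pair for brevity:

#include <cstdio>
#include <vector>

int main() {
  const int M = 3, N = 4;
  std::vector<float> x(M, 1.0f);  // shape (3,1)
  std::vector<float> y(N, 2.0f);  // shape (1,4)
  std::vector<float> out(M * N);  // broadcast result, shape (3,4)
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      out[i * N + j] = x[i] + y[j];  // size-1 axes are stretched
    }
  }
  std::printf("out[0] = %g\n", out[0]);  // 3
  return 0;
}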
- -#include "paddle/phi/backends/gpu/gpu_context.h" -#ifndef PADDLE_WITH_XPU_KP -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#endif -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/elementwise_kernel_impl.h" - -namespace phi { - -template -void SubtractKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - std::vector inputs; - inputs.reserve(2); - std::vector outputs; - outputs.reserve(1); - inputs.emplace_back(&x); - inputs.emplace_back(&y); - outputs.emplace_back(out); - dev_ctx.template Alloc(out); - funcs::BroadcastKernel( - dev_ctx, inputs, &outputs, funcs::SubtractFunctor(), -1); -} - -} // namespace phi - -#ifdef PADDLE_WITH_XPU_KP -PD_REGISTER_KERNEL(subtract, KPS, ALL_LAYOUT, phi::SubtractKernel, float) {} -#else - -using float16 = phi::dtype::float16; -using bfloat16 = phi::dtype::bfloat16; -using complex64 = ::phi::dtype::complex; -using complex128 = ::phi::dtype::complex; - -PD_REGISTER_KERNEL(subtract, - KPS, - ALL_LAYOUT, - phi::SubtractKernel, - float, - double, - int16_t, - int, - int64_t, - float16, - bfloat16, - complex64, - complex128) {} - -#endif diff --git a/paddle/phi/kernels/kps/logical_kernel.cu b/paddle/phi/kernels/kps/logical_kernel.cu index f7c390e65d0ff..5e62ab2684f7a 100644 --- a/paddle/phi/kernels/kps/logical_kernel.cu +++ b/paddle/phi/kernels/kps/logical_kernel.cu @@ -25,24 +25,45 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - if (!out->IsSharedWith(x)) { \ - dev_ctx.template Alloc(out); \ - } \ - \ - funcs::Logical##type##Functor binary_func; \ - std::vector ins = {&x, &y}; \ - std::vector outs = {out}; \ - if (!out->IsSharedWith(x)) { \ - funcs::BroadcastKernel(dev_ctx, ins, &outs, binary_func); \ - } else { \ - funcs::BroadcastKernel(dev_ctx, ins, &outs, binary_func); \ - } \ +template +void LogicalKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + Functor binary_func; + std::vector ins = {&x, &y}; + std::vector outs = {out}; + funcs::BroadcastKernel(dev_ctx, ins, &outs, binary_func); +} + +template +void InplaceLogicalKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto x_origin = x; + dev_ctx.template Alloc(out); + out->set_type(phi::DataType::BOOL); + Functor binary_func; + std::vector ins = {&x_origin, &y}; + std::vector outs = {out}; + funcs::BroadcastKernel(dev_ctx, ins, &outs, binary_func); +} + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + if (out->IsSharedWith(x)) { \ + InplaceLogicalKernelImpl>( \ + dev_ctx, x, y, out); \ + } else { \ + LogicalKernelImpl>( \ + dev_ctx, x, y, out); \ + } \ } DEFINE_LOGICAL_BINARY_KERNEL(And) @@ -56,14 +77,18 @@ void LogicalNotKernel(const Context& dev_ctx, DenseTensor* out) { if (!out->IsSharedWith(x)) { dev_ctx.template Alloc(out); - } - funcs::LogicalNotFunctor unary_func; - std::vector ins = {&x}; - std::vector outs = {out}; - if (!out->IsSharedWith(x)) { + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x}; + std::vector outs = {out}; funcs::BroadcastKernel(dev_ctx, ins, &outs, unary_func); } else { - funcs::BroadcastKernel(dev_ctx, 
ins, &outs, unary_func); + auto x_origin = x; + out->set_type(phi::DataType::BOOL); + dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x_origin}; + std::vector outs = {out}; + funcs::BroadcastKernel(dev_ctx, ins, &outs, unary_func); } } @@ -99,7 +124,9 @@ PD_REGISTER_KERNEL(logical_xor, KPS, ALL_LAYOUT, phi::LogicalXorKernel, int) { int8_t, \ phi::dtype::complex, \ phi::dtype::complex, \ - int16_t) {} + int16_t) { \ + kernel->OutputAt(0).SetDataType(phi::DataType::BOOL); \ + } REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) diff --git a/paddle/phi/kernels/xpu/adamw_kernel.cc b/paddle/phi/kernels/xpu/adamw_kernel.cc index a74fd2aa9bd4e..7a732878ff64c 100644 --- a/paddle/phi/kernels/xpu/adamw_kernel.cc +++ b/paddle/phi/kernels/xpu/adamw_kernel.cc @@ -20,11 +20,30 @@ #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" namespace phi { +template +float GetAbsMax(const Context& dev_ctx, + const float* input, + float* buffer_xpu, + int64_t numel) { + float buffer_cpu[6]; + // int findmax(Context* ctx, const T* x, float* maxptr, int64_t len); + int r = xpu::findmax(dev_ctx.x_context(), input, buffer_xpu, numel); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "findmax"); + memory_utils::Copy(CPUPlace(), + static_cast(buffer_cpu), + dev_ctx.GetPlace(), + static_cast(buffer_xpu), + sizeof(float) * 6); + float* max_value = std::max_element(buffer_cpu, buffer_cpu + 6); + return *max_value; +} + template void AdamwDenseKernel(const Context& dev_ctx, const DenseTensor& param, @@ -52,6 +71,98 @@ void AdamwDenseKernel(const Context& dev_ctx, DenseTensor* beta1_pow_out, DenseTensor* beta2_pow_out, DenseTensor* master_param_outs) { + // check moment_dtype + auto moment1_dtype = moment1.dtype(); + auto moment2_dtype = moment2.dtype(); + PADDLE_ENFORCE_EQ(moment1_dtype, + moment1_out->dtype(), + errors::InvalidArgument( + "moment1.dtype does not match moment1_out->dtype")); + PADDLE_ENFORCE_EQ(moment2_dtype, + moment2_out->dtype(), + errors::InvalidArgument( + "moment2.dtype does not match moment2_out->dtype")); + PADDLE_ENFORCE_EQ( + moment1_dtype, + moment2_dtype, + errors::InvalidArgument("moment1.dtype does not match moment2.dtype")); + + bool moment_in_fp16 = false; + if (moment1_dtype == phi::DataType::FLOAT16) { + moment_in_fp16 = true; + } else { + PADDLE_ENFORCE_EQ( + moment1_dtype, + phi::DataType::FLOAT32, + errors::InvalidArgument("moment1.dtype is neither fp32 nor fp16")); + } + + float* moment1_input_for_xdnn = nullptr; + float* moment2_input_for_xdnn = nullptr; + float* moment1_output_for_xdnn = nullptr; + float* moment2_output_for_xdnn = nullptr; + + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + if (moment_in_fp16) { + // allocate temp buffer on XPU + moment1_input_for_xdnn = RAII_GUARD.alloc_l3_or_gm(moment1.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(moment1_input_for_xdnn); + moment2_input_for_xdnn = RAII_GUARD.alloc_l3_or_gm(moment2.numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(moment2_input_for_xdnn); + moment1_output_for_xdnn = + RAII_GUARD.alloc_l3_or_gm(moment1_out->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(moment1_output_for_xdnn); + moment2_output_for_xdnn = + RAII_GUARD.alloc_l3_or_gm(moment2_out->numel()); + PADDLE_ENFORCE_XDNN_NOT_NULL(moment2_output_for_xdnn); + + int r = 0; + using XPUType16 = typename XPUTypeTrait::Type; + + // cast 
moment1 and moment2, from fp16 to fp32 + // int cast(Context* ctx, const TX* x, TY* y, int64_t len); + r = xpu::cast( + dev_ctx.x_context(), + reinterpret_cast( + moment1.template data()), + moment1_input_for_xdnn, + moment1.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast moment1 from fp16 to float"); + r = xpu::cast( + dev_ctx.x_context(), + reinterpret_cast( + moment2.template data()), + moment2_input_for_xdnn, + moment2.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast moment2 from fp16 to float"); + + // de-scale using meta's scale_value + // int scale(Context* ctx, const T* x, T* y, int64_t len, bool + // bias_after_scale, float _scale, float _bias); + phi::DenseTensorMeta moment1_meta = moment1.meta(); + if (moment1_meta.scale_value > 0) { + r = xpu::scale(dev_ctx.x_context(), + moment1_input_for_xdnn, + moment1_input_for_xdnn, + moment1.numel(), + false, + 1.0f / moment1_meta.scale_value, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "de-scale for moment1"); + } + phi::DenseTensorMeta moment2_meta = moment2.meta(); + if (moment2_meta.scale_value > 0) { + r = xpu::scale(dev_ctx.x_context(), + moment2_input_for_xdnn, + moment2_input_for_xdnn, + moment2.numel(), + false, + 1.0f / moment2_meta.scale_value, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "de-scale for moment2"); + } + } + using XPUType = typename XPUTypeTrait::Type; bool skip_update_ = false; if (skip_update.is_initialized()) { @@ -94,7 +205,7 @@ void AdamwDenseKernel(const Context& dev_ctx, if (!with_decay) { coeff = static_cast(0.0); } - xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + float* new_lr = RAII_GUARD.alloc_l3_or_gm(learning_rate.numel()); PADDLE_ENFORCE_XDNN_NOT_NULL(new_lr); int r = 0; @@ -107,17 +218,23 @@ void AdamwDenseKernel(const Context& dev_ctx, 0.0f); PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale"); + // int adamw(Context* ctx, const T* g, const float* mom1, const float* mom2, + // const T* param, const float* beta1_pow, const float* beta2_pow, const + // float* lr, float* moment1_out, float* moment2_out, T* param_out, float + // beta1, float beta2, float epsilon, float coeff, int64_t n); r = xpu::adamw( dev_ctx.x_context(), reinterpret_cast(grad.template data()), - moment1.template data(), - moment2.template data(), + moment_in_fp16 ? moment1_input_for_xdnn : moment1.template data(), + moment_in_fp16 ? moment2_input_for_xdnn : moment2.template data(), reinterpret_cast(param.template data()), beta1_pow_ptr, beta2_pow_ptr, new_lr, - dev_ctx.template Alloc(moment1_out), - dev_ctx.template Alloc(moment2_out), + moment_in_fp16 ? moment1_output_for_xdnn + : dev_ctx.template Alloc(moment1_out), + moment_in_fp16 ? 
moment2_output_for_xdnn + : dev_ctx.template Alloc(moment2_out), reinterpret_cast(dev_ctx.template Alloc(param_out)), beta1_, beta2_, @@ -126,6 +243,75 @@ void AdamwDenseKernel(const Context& dev_ctx, param.numel()); PADDLE_ENFORCE_XDNN_SUCCESS(r, "adamw"); + if (moment_in_fp16) { + int r = 0; + using XPUType16 = typename XPUTypeTrait::Type; + + // findmax and calculate scale_value for moment1 and moment2 + float* buffer_for_findmax = RAII_GUARD.alloc_l3_or_gm(6); + + // for moment1 + float moment1_max = GetAbsMax(dev_ctx, + moment1_output_for_xdnn, + buffer_for_findmax, + moment1_out->numel()); + float moment1_scale_value = 65504.0f / moment1_max / 2.0f; + // int scale(Context* ctx, const T* x, T* y, int64_t len, bool + // bias_after_scale, float _scale, float _bias); + r = xpu::scale(dev_ctx.x_context(), + moment1_output_for_xdnn, + moment1_output_for_xdnn, + moment1_out->numel(), + false, + moment1_scale_value, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS( + r, "scale before convert to fp16, for moment1_output_for_xdnn"); + // write to meta info + phi::DenseTensorMeta moment1_out_meta = moment1_out->meta(); + moment1_out_meta.scale_value = moment1_scale_value; + moment1_out->set_meta(moment1_out_meta); + + // for moment2 + float moment2_max = GetAbsMax(dev_ctx, + moment2_output_for_xdnn, + buffer_for_findmax, + moment2_out->numel()); + float moment2_scale_value = 65504.0f / moment2_max / 2.0f; + // int scale(Context* ctx, const T* x, T* y, int64_t len, bool + // bias_after_scale, float _scale, float _bias); + r = xpu::scale(dev_ctx.x_context(), + moment2_output_for_xdnn, + moment2_output_for_xdnn, + moment2_out->numel(), + false, + moment2_scale_value, + 0.0f); + PADDLE_ENFORCE_XDNN_SUCCESS( + r, "scale before convert to fp16, for moment2_output_for_xdnn"); + // write to meta info + phi::DenseTensorMeta moment2_out_meta = moment2_out->meta(); + moment2_out_meta.scale_value = moment2_scale_value; + moment2_out->set_meta(moment2_out_meta); + + // cast moment1 and moment2 output, from fp32 to fp16 + // int cast(Context* ctx, const TX* x, TY* y, int64_t len); + r = xpu::cast( + dev_ctx.x_context(), + moment1_output_for_xdnn, + reinterpret_cast( + dev_ctx.template Alloc(moment1_out)), + moment1.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast moment1_out from float to fp16"); + r = xpu::cast( + dev_ctx.x_context(), + moment2_output_for_xdnn, + reinterpret_cast( + dev_ctx.template Alloc(moment2_out)), + moment2.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast moment2_out from float to fp16"); + } + if (!use_global_beta_pow) { // update in cpu if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { diff --git a/paddle/ir/CMakeLists.txt b/paddle/pir/CMakeLists.txt similarity index 91% rename from paddle/ir/CMakeLists.txt rename to paddle/pir/CMakeLists.txt index 5a778466b4c19..1f87a16ff36a6 100644 --- a/paddle/ir/CMakeLists.txt +++ b/paddle/pir/CMakeLists.txt @@ -43,31 +43,31 @@ add_subdirectory(dialect) if(WIN32) if(WITH_SHARED_IR) set(IR_NAME - ir.dll + pir.dll CACHE INTERNAL "" FORCE) else() set(IR_NAME - ir.lib + pir.lib CACHE INTERNAL "" FORCE) endif() elseif(APPLE) if(WITH_SHARED_IR) set(IR_NAME - libir.dylib + libpir.dylib CACHE INTERNAL "" FORCE) else() set(IR_NAME - libir.a + libpir.a CACHE INTERNAL "" FORCE) endif() else() if(WITH_SHARED_IR) set(IR_NAME - libir.so + libpir.so CACHE INTERNAL "" FORCE) else() set(IR_NAME - libir.a + libpir.a CACHE INTERNAL "" FORCE) endif() endif() @@ -78,7 +78,7 @@ set(IR_LIB get_property(ir_modules GLOBAL PROPERTY IR_MODULES) 
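Stepping back to the XPU AdamW change above before the IR rename continues: fp16 moments stay representable because each step stores them scaled, with scale = 65504 / max|moment| / 2 leaving a factor-of-two headroom below the fp16 maximum, and 1/scale applied when the values are read back. A standalone round-trip sketch (plain float buffers stand in for the XPU ones):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> m = {1e-4f, -3e-3f, 2e-2f};  // moment values
  float mx = 0.0f;
  for (float v : m) mx = std::max(mx, std::fabs(v));
  const float scale = 65504.0f / mx / 2.0f;  // 65504 is the fp16 max
  for (float& v : m) v *= scale;  // scaled copy is what gets cast to fp16
  for (float& v : m) v /= scale;  // de-scaled on the next step's read
  std::printf("%g %g %g\n", m[0], m[1], m[2]);  // original values recovered
  return 0;
}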
if(WITH_SHARED_IR) - add_library(ir SHARED ${ir_modules}) + add_library(pir SHARED ${ir_modules}) else() - add_library(ir STATIC ${ir_modules}) + add_library(pir STATIC ${ir_modules}) endif() diff --git a/paddle/pir/core/CMakeLists.txt b/paddle/pir/core/CMakeLists.txt new file mode 100644 index 0000000000000..0fffc4285e376 --- /dev/null +++ b/paddle/pir/core/CMakeLists.txt @@ -0,0 +1,9 @@ +set(NEWIR_SOURCE_DIR "${PADDLE_SOURCE_DIR}/paddle/pir") +set(NEWIR_BINARY_DIR "${PADDLE_BINARY_DIR}/paddle/pir") + +file(GLOB IR_SRCS "*.cc") + +file(GLOB IR_PARSER_SRCS "parser/*.cc") +list(APPEND IR_SRCS ${IR_PARSER_SRCS}) + +ir_library(pir_core SRCS ${IR_SRCS} DEPS ddim) diff --git a/paddle/ir/core/attribute.cc b/paddle/pir/core/attribute.cc similarity index 86% rename from paddle/ir/core/attribute.cc rename to paddle/pir/core/attribute.cc index 0eff9964292df..993076880fdda 100644 --- a/paddle/ir/core/attribute.cc +++ b/paddle/pir/core/attribute.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/attribute_base.h" -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/attribute_base.h" +#include "paddle/pir/core/dialect.h" -namespace ir { +namespace pir { IrContext *Attribute::ir_context() const { return dialect().ir_context(); } TypeId Attribute::type_id() { return storage_->abstract_attribute().type_id(); } @@ -29,4 +29,4 @@ const Dialect &Attribute::dialect() const { return storage_->abstract_attribute().dialect(); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/attribute.h b/paddle/pir/core/attribute.h similarity index 87% rename from paddle/ir/core/attribute.h rename to paddle/pir/core/attribute.h index d83ea3b3c6045..86d6a62ceddfd 100644 --- a/paddle/ir/core/attribute.h +++ b/paddle/pir/core/attribute.h @@ -14,13 +14,13 @@ #pragma once -#include "paddle/ir/core/cast_utils.h" -#include "paddle/ir/core/type_id.h" +#include "paddle/pir/core/cast_utils.h" +#include "paddle/pir/core/type_id.h" constexpr char kAttrStopGradients[] = "stop_gradient"; constexpr char kAttrIsPersisable[] = "is_persisable"; -namespace ir { +namespace pir { class AttributeStorage; class AbstractAttribute; class IrContext; @@ -77,12 +77,12 @@ class IR_API Attribute { template bool isa() const { - return ir::isa(*this); + return pir::isa(*this); } template U dyn_cast() const { - return ir::dyn_cast(*this); + return pir::dyn_cast(*this); } friend struct std::hash; @@ -92,13 +92,13 @@ class IR_API Attribute { }; IR_API std::ostream &operator<<(std::ostream &os, Attribute attr); -} // namespace ir +} // namespace pir namespace std { template <> -struct hash { - std::size_t operator()(const ir::Attribute &obj) const { - return std::hash()(obj.storage_); +struct hash { + std::size_t operator()(const pir::Attribute &obj) const { + return std::hash()(obj.storage_); } }; } // namespace std diff --git a/paddle/ir/core/attribute_base.h b/paddle/pir/core/attribute_base.h similarity index 91% rename from paddle/ir/core/attribute_base.h rename to paddle/pir/core/attribute_base.h index daa3fed14f8a3..e0cbb0253700a 100644 --- a/paddle/ir/core/attribute_base.h +++ b/paddle/pir/core/attribute_base.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/storage_manager.h" -#include "paddle/ir/core/type_id.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/storage_manager.h" 
+#include "paddle/pir/core/type_id.h" -namespace ir { +namespace pir { class Dialect; /// @@ -155,7 +155,7 @@ struct IR_API AttributeManager { template static T get(IrContext *ctx, Args &&...args) { return get( - ctx, ir::TypeId::get(), std::forward(args)...); + ctx, pir::TypeId::get(), std::forward(args)...); } /// @@ -204,7 +204,7 @@ struct IR_API AttributeManager { /// template static void RegisterAttribute(IrContext *ctx) { - RegisterAttribute(ctx, ir::TypeId::get()); + RegisterAttribute(ctx, pir::TypeId::get()); } /// @@ -242,25 +242,25 @@ struct IR_API AttributeManager { /// /// \brief Add some necessary functions to the custom Attribute class. /// -#define DECLARE_ATTRIBUTE_UTILITY_FUNCTOR(concrete_attribute, storage_type) \ - using Storage = storage_type; \ - \ - const Storage *storage() const { \ - return static_cast(this->storage_); \ - } \ - \ - static ir::TypeId type_id() { \ - return ir::TypeId::get(); \ - } \ - \ - template \ - static bool classof(T val) { \ - return val.type_id() == type_id(); \ - } \ - \ - template \ - static concrete_attribute get(ir::IrContext *ctx, Args... args) { \ - return ir::AttributeManager::template get(ctx, \ - args...); \ +#define DECLARE_ATTRIBUTE_UTILITY_FUNCTOR(concrete_attribute, storage_type) \ + using Storage = storage_type; \ + \ + const Storage *storage() const { \ + return static_cast(this->storage_); \ + } \ + \ + static pir::TypeId type_id() { \ + return pir::TypeId::get(); \ + } \ + \ + template \ + static bool classof(T val) { \ + return val.type_id() == type_id(); \ + } \ + \ + template \ + static concrete_attribute get(pir::IrContext *ctx, Args... args) { \ + return pir::AttributeManager::template get(ctx, \ + args...); \ } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/block.cc b/paddle/pir/core/block.cc similarity index 93% rename from paddle/ir/core/block.cc rename to paddle/pir/core/block.cc index 04d59e2582ebe..f92d532298150 100644 --- a/paddle/ir/core/block.cc +++ b/paddle/pir/core/block.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
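The DECLARE_ATTRIBUTE_UTILITY_FUNCTOR macro above keys classof and storage lookup on pir::TypeId::get(); a common way such per-type ids are implemented (assumed here, not read from pir's sources) is one static tag object per template instantiation, whose address is process-unique:

#include <cstdio>

// One static byte per instantiation: its address serves as a unique id.
template <typename T>
struct TypeIdModel {
  static const void* get() {
    static const char tag = 0;
    return &tag;
  }
};

struct BoolAttribute {};
struct FloatAttribute {};

int main() {
  const bool distinct =
      TypeIdModel<BoolAttribute>::get() != TypeIdModel<FloatAttribute>::get();
  std::printf("distinct ids: %d\n", distinct);  // 1
  return 0;
}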
-#include "paddle/ir/core/block.h" +#include "paddle/pir/core/block.h" #include -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/region.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/region.h" -namespace ir { +namespace pir { Block::~Block() { assert(use_empty() && "block destroyed still has uses."); clear(); @@ -93,4 +93,4 @@ bool Block::TopoOrderCheck(const OpListType &op_list) { return true; } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/block.h b/paddle/pir/core/block.h similarity index 93% rename from paddle/ir/core/block.h rename to paddle/pir/core/block.h index 7e612d6318d36..3a8b4fafc345d 100644 --- a/paddle/ir/core/block.h +++ b/paddle/pir/core/block.h @@ -17,12 +17,12 @@ #include #include -#include "paddle/ir/core/block_operand.h" -#include "paddle/ir/core/dll_decl.h" -#include "paddle/ir/core/region.h" -#include "paddle/ir/core/use_iterator.h" +#include "paddle/pir/core/block_operand.h" +#include "paddle/pir/core/dll_decl.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/use_iterator.h" -namespace ir { +namespace pir { class Operation; class IR_API Block { @@ -89,4 +89,4 @@ class IR_API Block { Region::iterator position_; BlockOperand first_use_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/block_operand.cc b/paddle/pir/core/block_operand.cc similarity index 91% rename from paddle/ir/core/block_operand.cc rename to paddle/pir/core/block_operand.cc index f64a07fd50dfe..78dd9c0b5d14e 100644 --- a/paddle/ir/core/block_operand.cc +++ b/paddle/pir/core/block_operand.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/ir/core/block_operand.h" -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/block_operand_impl.h" -#include "paddle/ir/core/enforce.h" +#include "paddle/pir/core/block_operand.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/block_operand_impl.h" +#include "paddle/pir/core/enforce.h" -namespace ir { +namespace pir { #define CHECK_BLOCKOPEREND_NULL_IMPL(func_name) \ IR_ENFORCE(impl_, \ @@ -75,7 +75,7 @@ void BlockOperandImpl::set_source(Block *source) { InsertToUdChain(); } -BlockOperandImpl::BlockOperandImpl(Block *source, ir::Operation *owner) +BlockOperandImpl::BlockOperandImpl(Block *source, pir::Operation *owner) : source_(source), owner_(owner) { if (!source) { return; @@ -110,4 +110,4 @@ void BlockOperandImpl::RemoveFromUdChain() { BlockOperandImpl::~BlockOperandImpl() { RemoveFromUdChain(); } } // namespace detail -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/block_operand.h b/paddle/pir/core/block_operand.h similarity index 93% rename from paddle/ir/core/block_operand.h rename to paddle/pir/core/block_operand.h index ec55a90a1c65d..9895af86e7ed7 100644 --- a/paddle/ir/core/block_operand.h +++ b/paddle/pir/core/block_operand.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/ir/core/cast_utils.h" -#include "paddle/ir/core/type.h" +#include "paddle/pir/core/cast_utils.h" +#include "paddle/pir/core/type.h" -namespace ir { +namespace pir { class Operation; class Value; class Block; @@ -70,4 +70,4 @@ class IR_API BlockOperand { detail::BlockOperandImpl *impl_{nullptr}; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/block_operand_impl.h b/paddle/pir/core/block_operand_impl.h similarity index 94% rename from paddle/ir/core/block_operand_impl.h rename to paddle/pir/core/block_operand_impl.h index 53d8257c10032..1e0f8659a9c10 100644 --- a/paddle/ir/core/block_operand_impl.h +++ b/paddle/pir/core/block_operand_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/ir/core/block_operand.h" +#include "paddle/pir/core/block_operand.h" -namespace ir { +namespace pir { class Operation; class Block; @@ -58,4 +58,4 @@ class BlockOperandImpl { }; } // namespace detail -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/builder.cc b/paddle/pir/core/builder.cc similarity index 92% rename from paddle/ir/core/builder.cc rename to paddle/pir/core/builder.cc index 1bfbd2e2a8ca8..a91428ba99080 100644 --- a/paddle/ir/core/builder.cc +++ b/paddle/pir/core/builder.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/region.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/value.h" -namespace ir { +namespace pir { /// Create an operation given the fields represented as an OperationState. 
diff --git a/paddle/ir/core/builder.cc b/paddle/pir/core/builder.cc
similarity index 92%
rename from paddle/ir/core/builder.cc
rename to paddle/pir/core/builder.cc
index 1bfbd2e2a8ca8..a91428ba99080 100644
--- a/paddle/ir/core/builder.cc
+++ b/paddle/pir/core/builder.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/builder.h"
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/region.h"
-#include "paddle/ir/core/value.h"
+#include "paddle/pir/core/builder.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/region.h"
+#include "paddle/pir/core/value.h"
 
-namespace ir {
+namespace pir {
 
 /// Create an operation given the fields represented as an OperationState.
 Operation *Builder::Build(OperationArgument &&argument) {
   return Insert(Operation::Create(std::move(argument)));
@@ -81,4 +81,4 @@ PointerAttribute Builder::pointer_attr(void *value) {
   return PointerAttribute::get(context_, value);
 }
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/builder.h b/paddle/pir/core/builder.h
similarity index 92%
rename from paddle/ir/core/builder.h
rename to paddle/pir/core/builder.h
index f3ae837ea9723..acb621e7808e7 100644
--- a/paddle/ir/core/builder.h
+++ b/paddle/pir/core/builder.h
@@ -16,11 +16,11 @@
 
 #include <list>
 
-#include "paddle/ir/core/block.h"
-#include "paddle/ir/core/ir_context.h"
-#include "paddle/ir/core/operation.h"
+#include "paddle/pir/core/block.h"
+#include "paddle/pir/core/ir_context.h"
+#include "paddle/pir/core/operation.h"
 
-namespace ir {
+namespace pir {
 class Type;
 class UInt8Type;
 class Int8Type;
@@ -97,10 +97,10 @@ class Builder {
   IR_API Operation *Build(OperationArgument &&argument);
 
   /// Creates an operation with the given fields.
-  IR_API Operation *Build(const std::vector<ir::OpResult> &inputs,
+  IR_API Operation *Build(const std::vector<pir::OpResult> &inputs,
                           const AttributeMap &attribute,
-                          const std::vector<ir::Type> &output_types,
-                          ir::OpInfo op_info);
+                          const std::vector<pir::Type> &output_types,
+                          pir::OpInfo op_info);
 
   /// Create an operation of specific op type at the current insertion point.
   template <typename OpTy, typename... Args>
@@ -141,4 +141,4 @@ class Builder {
   Block::iterator insert_point_;
 };
 
-}  // namespace ir
+}  // namespace pir
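A compact sketch of the renamed Builder in use; Build&lt;OpTy&gt; forwards its arguments to OpTy::Build at the insertion point (the Program constructor and float32_type() accessor are assumed from elsewhere in the tree, not shown in this diff):

    pir::IrContext *ctx = pir::IrContext::Instance();
    pir::Program program(ctx);                   // assumed Program ctor
    pir::Builder builder(ctx, program.block());
    // GetParameterOp::Build(builder, argument, name, type) appears later in
    // this diff, so this call forwards ("w", f32) to it.
    pir::Type f32 = builder.float32_type();      // assumed convenience accessor
    auto get_param = builder.Build<pir::GetParameterOp>("w", f32);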
-#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_attribute.h" -namespace ir { +namespace pir { bool BoolAttribute::data() const { return storage()->data(); } @@ -37,7 +37,7 @@ std::string StrAttribute::AsString() const { return storage()->AsString(); } size_t StrAttribute::size() const { return storage()->size(); } -StrAttribute StrAttribute::get(ir::IrContext* ctx, const std::string& value) { +StrAttribute StrAttribute::get(pir::IrContext* ctx, const std::string& value) { return AttributeManager::get(ctx, value); } @@ -79,14 +79,14 @@ ArrayAttributeStorage::~ArrayAttributeStorage() { } } -} // namespace ir - -IR_DEFINE_EXPLICIT_TYPE_ID(ir::StrAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::BoolAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::FloatAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::DoubleAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int32Attribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int64Attribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::ArrayAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::PointerAttribute) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::TypeAttribute) +} // namespace pir + +IR_DEFINE_EXPLICIT_TYPE_ID(pir::StrAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::BoolAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::FloatAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::DoubleAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int32Attribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int64Attribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::ArrayAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::PointerAttribute) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::TypeAttribute) diff --git a/paddle/ir/core/builtin_attribute.h b/paddle/pir/core/builtin_attribute.h similarity index 80% rename from paddle/ir/core/builtin_attribute.h rename to paddle/pir/core/builtin_attribute.h index 3969d962e1f4e..7d3f86144915c 100644 --- a/paddle/ir/core/builtin_attribute.h +++ b/paddle/pir/core/builtin_attribute.h @@ -14,11 +14,11 @@ #pragma once -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/builtin_attribute_storage.h" -#include "paddle/ir/core/utils.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/builtin_attribute_storage.h" +#include "paddle/pir/core/utils.h" -namespace ir { +namespace pir { class IR_API BoolAttribute : public Attribute { public: using Attribute::Attribute; @@ -115,14 +115,14 @@ class IR_API ArrayAttribute : public Attribute { const std::vector& value); }; -} // namespace ir - -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::StrAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::BoolAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::FloatAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::DoubleAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int32Attribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int64Attribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::ArrayAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::PointerAttribute) -IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::TypeAttribute) +} // namespace pir + +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::StrAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::BoolAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::FloatAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::DoubleAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int32Attribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int64Attribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ArrayAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::PointerAttribute) +IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::TypeAttribute) diff --git a/paddle/ir/core/builtin_attribute_storage.h b/paddle/pir/core/builtin_attribute_storage.h similarity 
diff --git a/paddle/ir/core/builtin_attribute_storage.h b/paddle/pir/core/builtin_attribute_storage.h
similarity index 95%
rename from paddle/ir/core/builtin_attribute_storage.h
rename to paddle/pir/core/builtin_attribute_storage.h
index 624abaf004718..fd9dd6eb87128 100644
--- a/paddle/ir/core/builtin_attribute_storage.h
+++ b/paddle/pir/core/builtin_attribute_storage.h
@@ -18,13 +18,13 @@
 #include <string>
 #include <vector>
 
-#include "paddle/ir/core/attribute.h"
-#include "paddle/ir/core/attribute_base.h"
-#include "paddle/ir/core/enforce.h"
-#include "paddle/ir/core/type.h"
-#include "paddle/ir/core/utils.h"
+#include "paddle/pir/core/attribute.h"
+#include "paddle/pir/core/attribute_base.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/type.h"
+#include "paddle/pir/core/utils.h"
 
-namespace ir {
+namespace pir {
 
 #define DECLARE_BASE_TYPE_ATTRIBUTE_STORAGE(ConcreteStorage, BaseType) \
   struct ConcreteStorage : public AttributeStorage {                   \
@@ -147,4 +147,4 @@ struct ArrayAttributeStorage : public AttributeStorage {
   const size_t size_;
 };
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/builtin_dialect.cc b/paddle/pir/core/builtin_dialect.cc
similarity index 87%
rename from paddle/ir/core/builtin_dialect.cc
rename to paddle/pir/core/builtin_dialect.cc
index 375bf90d2b8fd..23ba43c3d292e 100644
--- a/paddle/ir/core/builtin_dialect.cc
+++ b/paddle/pir/core/builtin_dialect.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/builtin_dialect.h"
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/builtin_op.h"
-#include "paddle/ir/core/builtin_type.h"
+#include "paddle/pir/core/builtin_dialect.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_op.h"
+#include "paddle/pir/core/builtin_type.h"
 
-namespace ir {
+namespace pir {
 BuiltinDialect::BuiltinDialect(IrContext *context)
     : Dialect(name(), context, TypeId::get<BuiltinDialect>()) {
   initialize();
@@ -59,6 +59,6 @@ void BuiltinDialect::initialize() {
               ConstantOp>();
 }
 
-}  // namespace ir
+}  // namespace pir
 
-IR_DEFINE_EXPLICIT_TYPE_ID(ir::BuiltinDialect)
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::BuiltinDialect)
diff --git a/paddle/ir/core/builtin_dialect.h b/paddle/pir/core/builtin_dialect.h
similarity index 82%
rename from paddle/ir/core/builtin_dialect.h
rename to paddle/pir/core/builtin_dialect.h
index c5872f8142e7b..13e669102d8cc 100644
--- a/paddle/ir/core/builtin_dialect.h
+++ b/paddle/pir/core/builtin_dialect.h
@@ -14,17 +14,17 @@
 
 #pragma once
 
-#include "paddle/ir/core/dialect.h"
+#include "paddle/pir/core/dialect.h"
 
-namespace ir {
+namespace pir {
 ///
 /// \brief Built-in Dialect: automatically registered into global IrContext,
 /// all built-in types defined in builtin_type.h will be registered in this
 /// Dialect.
 ///
-class IR_API BuiltinDialect : public ir::Dialect {
+class IR_API BuiltinDialect : public pir::Dialect {
  public:
-  explicit BuiltinDialect(ir::IrContext *context);
+  explicit BuiltinDialect(pir::IrContext *context);
   ///
   /// \brief Each Dialect needs to provide a name function to return the name of
   /// the Dialect.
@@ -37,6 +37,6 @@ class IR_API BuiltinDialect : public ir::Dialect {
   void initialize();
 };
 
-}  // namespace ir
+}  // namespace pir
 
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::BuiltinDialect)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::BuiltinDialect)
diff --git a/paddle/ir/core/builtin_op.cc b/paddle/pir/core/builtin_op.cc
similarity index 83%
rename from paddle/ir/core/builtin_op.cc
rename to paddle/pir/core/builtin_op.cc
index 1feb4d691d99b..aba3ff9b282e4 100644
--- a/paddle/ir/core/builtin_op.cc
+++ b/paddle/pir/core/builtin_op.cc
@@ -12,13 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/builtin_op.h"
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/enforce.h"
+#include "paddle/pir/core/builtin_op.h"
 #include "paddle/phi/core/enforce.h"
+#include "paddle/pir/core/builtin_attribute.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/enforce.h"
 
-namespace ir {
+namespace pir {
 
 const char *ModuleOp::attributes_name[attributes_num] = {"program"};  // NOLINT
 
@@ -38,7 +38,7 @@ Block *ModuleOp::block() {
 }
 
 ModuleOp ModuleOp::Create(IrContext *context, Program *pointer) {
-  ir::OpInfo info = context->GetRegisteredOpInfo(name());
+  pir::OpInfo info = context->GetRegisteredOpInfo(name());
   OperationArgument argument(info);
   argument.num_regions = 1;
   argument.AddAttribute("program", PointerAttribute::get(context, pointer));
@@ -77,7 +77,7 @@ void GetParameterOp::Build(Builder &builder,
                            const std::string &name,
                            Type type) {
   argument.attributes[attributes_name[0]] =
-      ir::StrAttribute::get(builder.ir_context(), name);
+      pir::StrAttribute::get(builder.ir_context(), name);
   argument.output_types.emplace_back(type);
 }
 
@@ -105,7 +105,7 @@ void SetParameterOp::Build(Builder &builder,  // NOLINT
                            const std::string &name) {
   argument.AddOperand(parameter);
   argument.AddAttribute(attributes_name[0],
-                        ir::StrAttribute::get(builder.ir_context(), name));
+                        pir::StrAttribute::get(builder.ir_context(), name));
 }
 
 void SetParameterOp::Verify() const {
   VLOG(4) << "Verifying inputs, outputs and attributes for: SetParameterOp.";
@@ -124,14 +124,18 @@ void SetParameterOp::Verify() const {
 
 void CombineOp::Build(Builder &builder,
                       OperationArgument &argument,
-                      const std::vector<ir::OpResult> &inputs) {
+                      const std::vector<pir::OpResult> &inputs) {
   argument.inputs = inputs;
-  std::vector<ir::Type> inputs_type(inputs.size());
-  for (size_t idx = 0; idx < inputs.size(); ++idx) {
-    inputs_type[idx] = inputs[idx].type();
+  if (inputs.size() == 0) {
+    argument.output_types.emplace_back(pir::Type());
+  } else {
+    std::vector<pir::Type> inputs_type(inputs.size());
+    for (size_t idx = 0; idx < inputs.size(); ++idx) {
+      inputs_type[idx] = inputs[idx].type();
+    }
+    argument.output_types.emplace_back(
+        pir::VectorType::get(builder.ir_context(), inputs_type));
   }
-  argument.output_types.emplace_back(
-      ir::VectorType::get(builder.ir_context(), inputs_type));
 }
 
 void CombineOp::Verify() const {
@@ -167,11 +171,11 @@ void CombineOp::Verify() const {
 const char *SliceOp::attributes_name[attributes_num] = {"index"};  // NOLINT
 
 void SliceOp::Build(Builder &builder,
                     OperationArgument &argument,
-                    const ir::OpResult &input,
+                    const pir::OpResult &input,
                     int index) {
   argument.inputs = {input};
   argument.output_types.emplace_back(input.type()
-                                         .dyn_cast<ir::VectorType>()
+                                         .dyn_cast<pir::VectorType>()
                                          .data()[static_cast<size_t>(index)]);
 }
 
@@ -182,7 +186,7 @@ void SliceOp::Verify() const {
       input_size == 1, "The size %d of inputs must be equal to 1.", input_size);
 
   // inputs[0].type == Vector
input_type = (*this)->operand(0).type().dyn_cast(); + auto input_type = (*this)->operand(0).type().dyn_cast(); IR_ENFORCE(input_type, "The type %s of inputs[0] must be equal to VectorType.", input_type); @@ -197,10 +201,10 @@ void SliceOp::Verify() const { auto &attributes = this->attributes(); IR_ENFORCE(attributes.count("index") != 0, "The attributes must contains index."); - const ir::Attribute &attr = attributes.at("index"); - IR_ENFORCE(attr.isa(), + const pir::Attribute &attr = attributes.at("index"); + IR_ENFORCE(attr.isa(), "The attribute index must be INT32."); - auto index = attr.dyn_cast().data(); + auto index = attr.dyn_cast().data(); // index >= 0 and < inputs[0].size() IR_ENFORCE( @@ -222,12 +226,12 @@ void SliceOp::Verify() const { void SplitOp::Build(Builder &builder, OperationArgument &argument, - const ir::OpResult &input) { + const pir::OpResult &input) { argument.inputs = {input}; - for (size_t idx = 0; idx < input.type().dyn_cast().size(); + for (size_t idx = 0; idx < input.type().dyn_cast().size(); ++idx) { argument.output_types.emplace_back( - input.type().dyn_cast().data()[idx]); + input.type().dyn_cast().data()[idx]); } } @@ -277,13 +281,13 @@ void ConstantOp::Verify() const { Attribute ConstantOp::value() const { return attributes().at("value"); } -} // namespace ir +} // namespace pir -IR_DEFINE_EXPLICIT_TYPE_ID(ir::ModuleOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::GetParameterOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::SetParameterOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::CombineOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::SliceOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::SplitOp) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::ConstantLikeTrait) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::ConstantOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::ModuleOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::GetParameterOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SetParameterOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::CombineOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SliceOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::SplitOp) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::ConstantLikeTrait) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::ConstantOp) diff --git a/paddle/ir/core/builtin_op.h b/paddle/pir/core/builtin_op.h similarity index 76% rename from paddle/ir/core/builtin_op.h rename to paddle/pir/core/builtin_op.h index ab2d0cb9efba6..fee0ca406a741 100644 --- a/paddle/ir/core/builtin_op.h +++ b/paddle/pir/core/builtin_op.h @@ -14,17 +14,17 @@ #pragma once -#include "paddle/ir/core/builder.h" -#include "paddle/ir/core/op_base.h" +#include "paddle/pir/core/builder.h" +#include "paddle/pir/core/op_base.h" -namespace ir { +namespace pir { class Program; class Block; /// /// \brief ModuleOp /// -class IR_API ModuleOp : public ir::Op { +class IR_API ModuleOp : public pir::Op { public: using Op::Op; static const char *name() { return "builtin.module"; } @@ -45,7 +45,7 @@ class IR_API ModuleOp : public ir::Op { /// \brief GetParameterOp: OpResult = GetParameterOp({StrAttribute, /// StrAttribute}) /// -class IR_API GetParameterOp : public ir::Op { +class IR_API GetParameterOp : public pir::Op { public: using Op::Op; static const char *name() { return "builtin.get_parameter"; } @@ -62,7 +62,7 @@ class IR_API GetParameterOp : public ir::Op { /// \brief SetParameterOp: SetParameterOp(OpOperand, {StrAttribute, /// StrAttribute}) /// -class IR_API SetParameterOp : public ir::Op { +class IR_API SetParameterOp : public pir::Op { public: using Op::Op; static const char *name() { return "builtin.set_parameter"; } @@ -78,7 +78,7 @@ class IR_API SetParameterOp : public ir::Op { /// /// \brief CombineOp: CombineOp(OpOperand) /// 
diff --git a/paddle/ir/core/builtin_op.h b/paddle/pir/core/builtin_op.h
similarity index 76%
rename from paddle/ir/core/builtin_op.h
rename to paddle/pir/core/builtin_op.h
index ab2d0cb9efba6..fee0ca406a741 100644
--- a/paddle/ir/core/builtin_op.h
+++ b/paddle/pir/core/builtin_op.h
@@ -14,17 +14,17 @@
 
 #pragma once
 
-#include "paddle/ir/core/builder.h"
-#include "paddle/ir/core/op_base.h"
+#include "paddle/pir/core/builder.h"
+#include "paddle/pir/core/op_base.h"
 
-namespace ir {
+namespace pir {
 class Program;
 class Block;
 ///
 /// \brief ModuleOp
 ///
-class IR_API ModuleOp : public ir::Op<ModuleOp> {
+class IR_API ModuleOp : public pir::Op<ModuleOp> {
  public:
   using Op::Op;
   static const char *name() { return "builtin.module"; }
@@ -45,7 +45,7 @@ class IR_API ModuleOp : public pir::Op<ModuleOp> {
 /// \brief GetParameterOp: OpResult = GetParameterOp({StrAttribute,
 /// StrAttribute})
 ///
-class IR_API GetParameterOp : public ir::Op<GetParameterOp> {
+class IR_API GetParameterOp : public pir::Op<GetParameterOp> {
  public:
   using Op::Op;
   static const char *name() { return "builtin.get_parameter"; }
@@ -62,7 +62,7 @@ class IR_API GetParameterOp : public pir::Op<GetParameterOp> {
 /// \brief SetParameterOp: SetParameterOp(OpOperand, {StrAttribute,
 /// StrAttribute})
 ///
-class IR_API SetParameterOp : public ir::Op<SetParameterOp> {
+class IR_API SetParameterOp : public pir::Op<SetParameterOp> {
  public:
   using Op::Op;
   static const char *name() { return "builtin.set_parameter"; }
@@ -78,7 +78,7 @@ class IR_API SetParameterOp : public pir::Op<SetParameterOp> {
 ///
 /// \brief CombineOp: CombineOp(OpOperand)
 ///
-class IR_API CombineOp : public ir::Op<CombineOp> {
+class IR_API CombineOp : public pir::Op<CombineOp> {
  public:
   using Op::Op;
@@ -90,23 +90,23 @@ class IR_API CombineOp : public pir::Op<CombineOp> {
 
   static void Build(Builder &builder,             // NOLINT
                     OperationArgument &argument,  // NOLINT
-                    const std::vector<ir::OpResult> &inputs);
+                    const std::vector<pir::OpResult> &inputs);
 
   void Verify() const;
-  std::vector<ir::Value> inputs() {
-    std::vector<ir::Value> inputs;
+  std::vector<pir::Value> inputs() {
+    std::vector<pir::Value> inputs;
     for (uint32_t idx = 0; idx < num_operands(); idx++) {
       inputs.push_back(operand_source(static_cast<int>(idx)));
     }
     return inputs;
   }
-  ir::OpResult out() { return result(0); }
+  pir::OpResult out() { return result(0); }
 };
 
 ///
 /// \brief SliceOp: SliceOp(OpOperand)
 ///
-class IR_API SliceOp : public ir::Op<SliceOp> {
+class IR_API SliceOp : public pir::Op<SliceOp> {
  public:
   using Op::Op;
@@ -118,17 +118,17 @@ class IR_API SliceOp : public pir::Op<SliceOp> {
 
   static void Build(Builder &builder,             // NOLINT
                     OperationArgument &argument,  // NOLINT
-                    const ir::OpResult &input,
+                    const pir::OpResult &input,
                     int index);
 
   void Verify() const;
-  ir::Value input() { return operand_source(0); }
+  pir::Value input() { return operand_source(0); }
 };
 
 ///
 /// \brief SplitOp: SplitOp(OpOperand)
 ///
-class IR_API SplitOp : public ir::Op<SplitOp> {
+class IR_API SplitOp : public pir::Op<SplitOp> {
  public:
   using Op::Op;
@@ -140,12 +140,12 @@ class IR_API SplitOp : public pir::Op<SplitOp> {
 
   static void Build(Builder &builder,             // NOLINT
                     OperationArgument &argument,  // NOLINT
-                    const ir::OpResult &input);
+                    const pir::OpResult &input);
 
   void Verify() const;
-  ir::Value input() { return operand_source(0); }
-  std::vector<ir::OpResult> outputs() {
-    std::vector<ir::OpResult> outputs;
+  pir::Value input() { return operand_source(0); }
+  std::vector<pir::OpResult> outputs() {
+    std::vector<pir::OpResult> outputs;
     for (uint32_t idx = 0; idx < num_results(); idx++) {
       outputs.push_back(result(static_cast<int>(idx)));
     }
     return outputs;
@@ -180,13 +180,13 @@ class IR_API ConstantOp : public Op<ConstantOp, ConstantLikeTrait> {
   Attribute value() const;
 };
 
-}  // namespace ir
+}  // namespace pir
 
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::ModuleOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::GetParameterOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::SetParameterOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::CombineOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::SliceOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::SplitOp)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::ConstantLikeTrait)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::ConstantOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ModuleOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::GetParameterOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SetParameterOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::CombineOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SliceOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::SplitOp)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ConstantLikeTrait)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::ConstantOp)
-#include "paddle/ir/core/builtin_type.h" +#include "paddle/pir/core/builtin_type.h" -namespace ir { +namespace pir { std::vector VectorType::data() const { return storage()->GetAsKey(); } -const ir::Type& DenseTensorType::dtype() const { return storage()->dtype_; } +const pir::Type& DenseTensorType::dtype() const { return storage()->dtype_; } const DenseTensorTypeStorage::Dim& DenseTensorType::dims() const { return storage()->dims_; @@ -32,20 +32,20 @@ const DenseTensorTypeStorage::LoD& DenseTensorType::lod() const { } const size_t& DenseTensorType::offset() const { return storage()->offset_; } -} // namespace ir - -IR_DEFINE_EXPLICIT_TYPE_ID(ir::UInt8Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int8Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::VectorType) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::BFloat16Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Float16Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Float32Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Float64Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int16Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int32Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Int64Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::IndexType) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::BoolType) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Complex64Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::Complex128Type) -IR_DEFINE_EXPLICIT_TYPE_ID(ir::DenseTensorType) +} // namespace pir + +IR_DEFINE_EXPLICIT_TYPE_ID(pir::UInt8Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int8Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::VectorType) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::BFloat16Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Float16Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Float32Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Float64Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int16Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int32Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Int64Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::IndexType) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::BoolType) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Complex64Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::Complex128Type) +IR_DEFINE_EXPLICIT_TYPE_ID(pir::DenseTensorType) diff --git a/paddle/ir/core/builtin_type.h b/paddle/pir/core/builtin_type.h similarity index 52% rename from paddle/ir/core/builtin_type.h rename to paddle/pir/core/builtin_type.h index a660f065376b2..3f0e7a1471703 100644 --- a/paddle/ir/core/builtin_type.h +++ b/paddle/pir/core/builtin_type.h @@ -15,14 +15,13 @@ #pragma once -#include "paddle/ir/core/builtin_type_storage.h" -#include "paddle/ir/core/type.h" +#include "paddle/pir/core/builtin_type_interfaces.h" +#include "paddle/pir/core/builtin_type_storage.h" +#include "paddle/pir/core/type.h" -namespace ir { +namespace pir { /// -/// \brief Define built-in parameterless types. Please add the necessary -/// interface functions for built-in types through the macro -/// DECLARE_TYPE_UTILITY_FUNCTOR. +/// \brief Define built-in parameterless types. /// /// NOTE(zhangbo9674): If you need to directly /// cache the object of this built-in type in IrContext, please overload the get @@ -31,7 +30,7 @@ namespace ir { /// /// The built-in type object get method is as follows: /// \code{cpp} -/// ir::IrContext *ctx = ir::IrContext::Instance(); +/// pir::IrContext *ctx = pir::IrContext::Instance(); /// Type fp32 = Float32Type::get(ctx); /// \endcode /// @@ -39,11 +38,10 @@ namespace ir { // NOTE(dev): Currently Int8 are not considered as a cached member // in IrContextImpl because it is not widely used. 
diff --git a/paddle/ir/core/builtin_type.h b/paddle/pir/core/builtin_type.h
similarity index 52%
rename from paddle/ir/core/builtin_type.h
rename to paddle/pir/core/builtin_type.h
index a660f065376b2..3f0e7a1471703 100644
--- a/paddle/ir/core/builtin_type.h
+++ b/paddle/pir/core/builtin_type.h
@@ -15,14 +15,13 @@
 
 #pragma once
 
-#include "paddle/ir/core/builtin_type_storage.h"
-#include "paddle/ir/core/type.h"
+#include "paddle/pir/core/builtin_type_interfaces.h"
+#include "paddle/pir/core/builtin_type_storage.h"
+#include "paddle/pir/core/type.h"
 
-namespace ir {
+namespace pir {
 ///
-/// \brief Define built-in parameterless types. Please add the necessary
-/// interface functions for built-in types through the macro
-/// DECLARE_TYPE_UTILITY_FUNCTOR.
+/// \brief Define built-in parameterless types.
 ///
 /// NOTE(zhangbo9674): If you need to directly
 /// cache the object of this built-in type in IrContext, please overload the get
@@ -31,7 +30,7 @@ namespace ir {
 ///
 /// The built-in type object get method is as follows:
 /// \code{cpp}
-///   ir::IrContext *ctx = ir::IrContext::Instance();
+///   pir::IrContext *ctx = pir::IrContext::Instance();
 ///   Type fp32 = Float32Type::get(ctx);
 /// \endcode
 ///
@@ -39,11 +38,10 @@ namespace ir {
 
 // NOTE(dev): Currently Int8 are not considered as a cached member
 // in IrContextImpl because it is not widely used.
-class IR_API VectorType : public Type {
+class IR_API VectorType
+    : public pir::Type::TypeBase<VectorType, Type, VectorTypeStorage> {
  public:
-  using Type::Type;
-
-  DECLARE_TYPE_UTILITY_FUNCTOR(VectorType, VectorTypeStorage);
+  using Base::Base;
 
   std::vector<Type> data() const;
 
@@ -54,13 +52,14 @@ class IR_API VectorType : public Type {
   Type operator[](size_t index) const { return data()[index]; }
 };
 
-class DenseTensorType : public ir::Type {
+class DenseTensorType : public pir::Type::TypeBase<DenseTensorType,
+                                                   pir::Type,
+                                                   DenseTensorTypeStorage,
+                                                   pir::ShapedTypeInterface> {
  public:
-  using Type::Type;
-
-  DECLARE_TYPE_UTILITY_FUNCTOR(DenseTensorType, DenseTensorTypeStorage);
+  using Base::Base;
 
-  const ir::Type &dtype() const;
+  const pir::Type &dtype() const;
 
   const DenseTensorTypeStorage::Dim &dims() const;
 
@@ -71,14 +70,13 @@ class DenseTensorType : public pir::Type {
   const size_t &offset() const;
 };
 
-#define DECLARE_BUILTIN_TYPE(__name)                   \
-  class IR_API __name : public Type {                  \
-   public:                                             \
-    using Type::Type;                                  \
-                                                       \
-    DECLARE_TYPE_UTILITY_FUNCTOR(__name, TypeStorage); \
-                                                       \
-    static __name get(IrContext *context);             \
+#define DECLARE_BUILTIN_TYPE(__name)                                \
+  class IR_API __name : public ::pir::Type::TypeBase<__name,        \
+                                                     ::pir::Type,   \
+                                                     ::pir::TypeStorage> { \
+   public:                                                          \
+    using Base::Base;                                               \
+    static __name get(IrContext *context);                          \
   };
 
 #define FOREACH_BUILTIN_TYPE(__macro) \
@@ -101,20 +99,20 @@ FOREACH_BUILTIN_TYPE(DECLARE_BUILTIN_TYPE)
 #undef FOREACH_BUILTIN_TYPE
 #undef DECLARE_BUILTIN_TYPE
 
-}  // namespace ir
-
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::UInt8Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int8Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::VectorType)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::BFloat16Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Float16Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Float32Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Float64Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int16Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int32Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Int64Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::BoolType)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::IndexType)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Complex64Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::Complex128Type)
-IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(ir::DenseTensorType)
+}  // namespace pir
+
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::UInt8Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int8Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::VectorType)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::BFloat16Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Float16Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Float32Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Float64Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int16Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int32Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Int64Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::BoolType)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::IndexType)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Complex64Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::Complex128Type)
+IR_EXPORT_DECLARE_EXPLICIT_TYPE_ID(pir::DenseTensorType)
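With DECLARE_TYPE_UTILITY_FUNCTOR retired, types now derive from the CRTP TypeBase, as VectorType and DenseTensorType do above. A hypothetical parameterless dialect type under the same pattern (MyTokenType is illustrative, not part of Paddle):

    // Hypothetical example: TypeStorage is the parameterless storage used by
    // the DECLARE_BUILTIN_TYPE macro above; dialect registration is omitted.
    class MyTokenType
        : public pir::Type::TypeBase<MyTokenType, pir::Type, pir::TypeStorage> {
     public:
      using Base::Base;
    };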
diff --git a/paddle/cinn/optim/tensor_write_tell.cc b/paddle/pir/core/builtin_type_interfaces.cc
similarity index 72%
rename from paddle/cinn/optim/tensor_write_tell.cc
rename to paddle/pir/core/builtin_type_interfaces.cc
index 9f0f5747c3f3d..9084bffc7a197 100644
--- a/paddle/cinn/optim/tensor_write_tell.cc
+++ b/paddle/pir/core/builtin_type_interfaces.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,8 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/cinn/optim/tensor_write_tell.h"
+#include "paddle/pir/core/builtin_type_interfaces.h"
+#include "paddle/pir/core/type_id.h"
 
-namespace cinn {
-namespace optim {}  // namespace optim
-}  // namespace cinn
+IR_DEFINE_EXPLICIT_TYPE_ID(pir::ShapedTypeInterface)
diff --git a/paddle/pir/core/builtin_type_interfaces.h b/paddle/pir/core/builtin_type_interfaces.h
new file mode 100644
index 0000000000000..f736c1a631b48
--- /dev/null
+++ b/paddle/pir/core/builtin_type_interfaces.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+
+#include "paddle/phi/core/tensor_base.h"
+#include "paddle/pir/core/cast_utils.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/type.h"
+
+namespace details {
+
+template <typename RangeT>
+constexpr auto begin_impl(RangeT &&range)
+    -> decltype(std::begin(std::forward<RangeT>(range))) {
+  return std::begin(std::forward<RangeT>(range));
+}
+
+template <typename RangeT>
+constexpr auto end_impl(RangeT &&range)
+    -> decltype(std::end(std::forward<RangeT>(range))) {
+  return std::end(std::forward<RangeT>(range));
+}
+
+/// Returns the begin iterator to \p range using `std::begin` and
+/// function found through Argument-Dependent Lookup (ADL).
+template <typename RangeT>
+constexpr auto adl_begin(RangeT &&range)
+    -> decltype(begin_impl(std::forward<RangeT>(range))) {
+  return begin_impl(std::forward<RangeT>(range));
+}
+
+/// Returns the end iterator to \p range using `std::end` and
+/// functions found through Argument-Dependent Lookup (ADL).
+template <typename RangeT>
+constexpr auto adl_end(RangeT &&range)
+    -> decltype(end_impl(std::forward<RangeT>(range))) {
+  return end_impl(std::forward<RangeT>(range));
+}
+
+/// Provide wrappers to std::any_of which take ranges instead of having to pass
+/// begin/end explicitly.
+template <typename R, typename UnaryPredicate>
+bool any_of(R &&Range, UnaryPredicate P) {
+  return std::any_of(adl_begin(Range), adl_end(Range), P);
+}
+
+/// Wrapper function around std::count_if to count the number of times an
+/// element satisfying a given predicate occurs in a range.
+template <typename R, typename UnaryPredicate>
+auto count_if(R &&Range, UnaryPredicate P) {
+  return std::count_if(adl_begin(Range), adl_end(Range), P);
+}
+
+}  // namespace details
+
+namespace pir {
+
+class ShapedTypeInterface
+    : public pir::TypeInterfaceBase<ShapedTypeInterface> {
+ public:
+  using DDim = phi::DDim;
+  using DataType = pir::Type;
+
+  struct Concept {
+    /// Defined these methods with the interface.
+    explicit Concept(DataType (*get_element_type)(pir::Type),
+                     DDim (*get_shape)(pir::Type))
+        : get_element_type_(get_element_type), get_shape_(get_shape) {}
+
+    DataType (*get_element_type_)(pir::Type);
+    DDim (*get_shape_)(pir::Type);
+  };
+
+  template <typename ConcreteType>
+  struct Model : public Concept {
+    static inline DataType getElementType(pir::Type type) {
+      return pir::cast<ConcreteType>(type).dtype();
+    }
+
+    static inline DDim getShape(pir::Type type) {
+      return pir::cast<ConcreteType>(type).dims();
+    }
+
+    Model() : Concept(getElementType, getShape) {}
+  };
+
+  /// Constructor
+  ShapedTypeInterface(pir::Type type, Concept *impl)
+      : pir::TypeInterfaceBase<ShapedTypeInterface>(type), impl_(impl) {}
+
+  /// Get the element type.
+  DataType getElementType() const { return impl_->get_element_type_(*this); }
+
+  /// Get the shape of this type.
+  DDim getShape() const { return impl_->get_shape_(*this); }
+
+  static constexpr int64_t kDynamic = std::numeric_limits<int64_t>::min();
+
+  /// Check whether this type is ranked, currently return true.
+  bool hasRank() const { return true; }
+
+  /// If this is a ranked type, return the rank. Otherwise, abort.
+  int64_t getRank() const {
+    IR_ENFORCE((*this).hasRank(), "Cannot query rank of unranked shaped type.");
+    return (*this).getShape().size();
+  }
+
+  /// Check whether the given dimension size is a dynamic dimension.
+  static constexpr bool isDynamic(int64_t dValue) { return dValue == kDynamic; }
+
+  /// Check whether the given shape has any size indicating a dynamic dimension.
+  static bool isDynamicShape(DDim dSizes) {
+    return ::details::any_of(vectorize(dSizes),
+                             [](int64_t dSize) { return isDynamic(dSize); });
+  }
+
+  /// Check whether shape has any size indicating a dynamic dimension.
+  bool hasStaticShape() const {
+    return (*this).hasRank() &&
+           !pir::ShapedTypeInterface::isDynamicShape((*this).getShape());
+  }
+
+  /// Check whether the given dimension has a dynamic size.
+  /// Aborts for unranked types.
+  bool isDynamicDim(unsigned idx) const {
+    IR_ENFORCE(idx < getRank(), "Invalid index for shaped type.");
+    return pir::ShapedTypeInterface::isDynamic((*this).getShape()[idx]);
+  }
+
+  /// Get the number of dimensions with dynamic size for a ranked type.
+  /// Aborts for unranked types.
+  int64_t getNumDynamicDims() const {
+    return ::details::count_if(vectorize((*this).getShape()),
+                               pir::ShapedTypeInterface::isDynamic);
+  }
+
+  /// Get the size of the specified dimension for a ranked type.
+  /// Aborts for unranked types.
+  int64_t getDimSize(unsigned idx) const {
+    IR_ENFORCE(idx < getRank(), "Invalid index for shaped type.");
+    return (*this).getShape()[idx];
+  }
+
+ private:
+  Concept *impl_;
+};
+
+}  // namespace pir
+
+IR_DECLARE_EXPLICIT_TYPE_ID(pir::ShapedTypeInterface)
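A rough sketch of querying the new interface; it assumes DenseTensorType attaches ShapedTypeInterface (which is what its Model above is written against) and that Type's dyn_cast reaches interfaces:

    pir::Type ty = /* a DenseTensorType built elsewhere */;
    auto shaped = ty.dyn_cast<pir::ShapedTypeInterface>();
    if (shaped) {
      int64_t rank = shaped.getRank();       // aborts on unranked types
      bool fixed = shaped.hasStaticShape();  // false if any dim == kDynamic
    }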
diff --git a/paddle/ir/core/builtin_type_storage.h b/paddle/pir/core/builtin_type_storage.h
similarity index 87%
rename from paddle/ir/core/builtin_type_storage.h
rename to paddle/pir/core/builtin_type_storage.h
index 4488b28b07fa2..b8b18d09ddd26 100644
--- a/paddle/ir/core/builtin_type_storage.h
+++ b/paddle/pir/core/builtin_type_storage.h
@@ -14,11 +14,11 @@
 
 #pragma once
 
-#include "paddle/ir/core/type.h"
-#include "paddle/ir/core/type_base.h"
-#include "paddle/ir/core/utils.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/pir/core/type.h"
+#include "paddle/pir/core/type_base.h"
+#include "paddle/pir/core/utils.h"
 
 namespace std {
 ///
@@ -37,7 +37,7 @@ struct hash<std::vector<T>> {
   }
 }  // namespace std
 
-namespace ir {
+namespace pir {
 
 ///
 /// \brief Define Parametric TypeStorage for DenseTensorType.
 ///
@@ -46,16 +46,16 @@ namespace pir {
 /// (3)define HashValue method, (4)overload operator==.
 ///
-struct DenseTensorTypeStorage : public ir::TypeStorage {
+struct DenseTensorTypeStorage : public pir::TypeStorage {
   ///
   /// \brief Declare ParamKey according to parameter type.
   ///
   using DataLayout = phi::DataLayout;
   using Dim = phi::DDim;
   using LoD = std::vector<std::vector<size_t>>;
-  using ParamKey = std::tuple<ir::Type, Dim, DataLayout, LoD, size_t>;
+  using ParamKey = std::tuple<pir::Type, Dim, DataLayout, LoD, size_t>;
 
-  DenseTensorTypeStorage(const ir::Type& dtype,
+  DenseTensorTypeStorage(const pir::Type& dtype,
                          const Dim& dims,
                          const DataLayout& layout,
                          const LoD& lod,
@@ -85,22 +85,22 @@ struct DenseTensorTypeStorage : public pir::TypeStorage {
     std::size_t hash_value = 0;
     // hash dtype
     hash_value =
-        ir::hash_combine(hash_value, std::hash<ir::Type>()(std::get<0>(key)));
+        pir::hash_combine(hash_value, std::hash<pir::Type>()(std::get<0>(key)));
     // hash dims
     hash_value =
-        ir::hash_combine(hash_value, std::hash<Dim>()(std::get<1>(key)));
+        pir::hash_combine(hash_value, std::hash<Dim>()(std::get<1>(key)));
     // hash layout
-    hash_value = ir::hash_combine(
+    hash_value = pir::hash_combine(
         hash_value,
         std::hash<std::underlying_type<DataLayout>::type>()(
             static_cast<std::underlying_type<DataLayout>::type>(
                 std::get<2>(key))));
     // hash lod
     hash_value =
-        ir::hash_combine(hash_value, std::hash<LoD>()(std::get<3>(key)));
+        pir::hash_combine(hash_value, std::hash<LoD>()(std::get<3>(key)));
     // hash offset
     hash_value =
-        ir::hash_combine(hash_value, std::hash<size_t>()(std::get<4>(key)));
+        pir::hash_combine(hash_value, std::hash<size_t>()(std::get<4>(key)));
     return hash_value;
   }
 
@@ -119,7 +119,7 @@ struct DenseTensorTypeStorage : public pir::TypeStorage {
   /// \brief DenseTensorTypeStorage include five parameters: dims, dtype,
   /// layout, lod, offset.
   ///
-  ir::Type dtype_;
+  pir::Type dtype_;
   Dim dims_;
   DataLayout layout_;
   LoD lod_;
@@ -183,4 +183,4 @@ struct VectorTypeStorage : public TypeStorage {
   size_t size_;
 };
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/cast_utils.h b/paddle/pir/core/cast_utils.h
similarity index 84%
rename from paddle/ir/core/cast_utils.h
rename to paddle/pir/core/cast_utils.h
index dcc4b89fe8b04..db9f864aaabc3 100644
--- a/paddle/ir/core/cast_utils.h
+++ b/paddle/pir/core/cast_utils.h
@@ -14,9 +14,10 @@
 
 #pragma once
 
+#include <memory>
 #include <type_traits>
 
-namespace ir {
+namespace pir {
 ///
 /// \brief The template function actually called by isa_wrap.
 ///
@@ -114,7 +115,7 @@ struct ReturnTypeDuduction {
 ///
 /// cast From to To
 ///
-template <typename To, typename From>
+template <typename To, typename From, typename Enabler = void>
 struct cast_impl {
   // This _is_ a simple type, just cast it.
   static typename ReturnTypeDuduction<To, From>::type call(const From &Val) {
@@ -125,7 +126,15 @@ struct cast_impl {
 };
 
 template <typename To, typename From>
-inline typename ReturnTypeDuduction<To, From>::type cast(From &Val) {  // NOLINT
+inline decltype(auto) cast(const From &Val) {
+  if (!isa<To>(Val)) {
+    throw("cast() argument of incompatible type!");
+  }
+  return cast_impl<To, const From>::call(Val);
+}
+
+template <typename To, typename From>
+inline decltype(auto) cast(From &Val) {  // NOLINT
   if (!isa<To>(Val)) {
     throw("cast() argument of incompatible type!");
   }
@@ -133,25 +142,32 @@ inline decltype(auto) cast(From &Val) {  // NOLINT
 }
 
 template <typename To, typename From>
-inline typename ReturnTypeDuduction<To, From>::type cast(From *Val) {
+inline decltype(auto) cast(From *Val) {
   if (!isa<To>(Val)) {
     throw("cast() argument of incompatible type!");
   }
   return cast_impl<To, From *>::call(Val);
 }
 
+template <typename To, typename From>
+inline decltype(auto) cast(std::unique_ptr<From> &&Val) {
+  if (!isa<To>(Val)) {
+    throw("cast() argument of incompatible type!");
+  }
+  return cast_impl<To, std::unique_ptr<From>>::call(std::move(Val));
+}
+
 ///
 /// \brief dyn_cast From to To.
 ///
 template <typename To, typename From>
-inline std::decay_t<typename ReturnTypeDuduction<To, From>::type> dyn_cast(
-    From &Val) {  // NOLINT
+inline decltype(auto) dyn_cast(From &Val) {  // NOLINT
   return isa<To>(Val) ? cast<To>(Val) : nullptr;
 }
 
 template <typename To, typename From>
-inline typename ReturnTypeDuduction<To, From>::type dyn_cast(From *Val) {
+inline decltype(auto) dyn_cast(From *Val) {
   return isa<To>(Val) ? cast<To>(Val) : nullptr;
 }
-}  // namespace ir
+}  // namespace pir
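A small sketch of the cast helpers after this change; the new overloads mean cast also accepts const references and unique_ptr rvalues:

    pir::Type ty = pir::Float32Type::get(pir::IrContext::Instance());
    if (pir::isa<pir::Float32Type>(ty)) {
      auto f32 = pir::cast<pir::Float32Type>(ty);  // throws on mismatch
    }
    auto i32 = pir::dyn_cast<pir::Int32Type>(ty);  // null handle on mismatch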
diff --git a/paddle/ir/core/dialect.cc b/paddle/pir/core/dialect.cc
similarity index 88%
rename from paddle/ir/core/dialect.cc
rename to paddle/pir/core/dialect.cc
index 0a4a6cc3b3854..e6831e977fa31 100644
--- a/paddle/ir/core/dialect.cc
+++ b/paddle/pir/core/dialect.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/dialect.h"
+#include "paddle/pir/core/dialect.h"
 
-namespace ir {
-Dialect::Dialect(std::string name, ir::IrContext *context, ir::TypeId id)
+namespace pir {
+Dialect::Dialect(std::string name, pir::IrContext *context, pir::TypeId id)
     : name_(std::move(name)), context_(context), id_(id) {}
 
 Dialect::~Dialect() = default;
@@ -32,4 +32,4 @@ IrContext *DialectInterface::ir_context() const {
   return dialect_->ir_context();
 }
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/dialect.h b/paddle/pir/core/dialect.h
similarity index 94%
rename from paddle/ir/core/dialect.h
rename to paddle/pir/core/dialect.h
index f07a4242f362c..07debaf196041 100644
--- a/paddle/ir/core/dialect.h
+++ b/paddle/pir/core/dialect.h
@@ -17,15 +17,15 @@
 #include <memory>
 #include <unordered_map>
 
-#include "paddle/ir/core/attribute.h"
-#include "paddle/ir/core/attribute_base.h"
-#include "paddle/ir/core/dialect_interface.h"
-#include "paddle/ir/core/enforce.h"
-#include "paddle/ir/core/ir_context.h"
-#include "paddle/ir/core/op_base.h"
-#include "paddle/ir/core/type_base.h"
+#include "paddle/pir/core/attribute.h"
+#include "paddle/pir/core/attribute_base.h"
+#include "paddle/pir/core/dialect_interface.h"
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/ir_context.h"
+#include "paddle/pir/core/op_base.h"
+#include "paddle/pir/core/type_base.h"
 
-namespace ir {
+namespace pir {
 
 class Operation;
 class IrPrinter;
@@ -174,4 +174,4 @@ class IR_API Dialect {
   std::unordered_map<TypeId, std::unique_ptr<DialectInterface>>
       registered_interfaces_;
 };
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/dialect_interface.h b/paddle/pir/core/dialect_interface.h
similarity index 96%
rename from paddle/ir/core/dialect_interface.h
rename to paddle/pir/core/dialect_interface.h
index e24b3481f4ef4..7cb2b89de03eb 100644
--- a/paddle/ir/core/dialect_interface.h
+++ b/paddle/pir/core/dialect_interface.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/ir/core/type_id.h"
+#include "paddle/pir/core/type_id.h"
 
-namespace ir {
+namespace pir {
 class Dialect;
 class IrContext;
 ///
@@ -64,4 +64,4 @@ class IR_API DialectInterface {
   TypeId interface_id_;
 };
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/dll_decl.h b/paddle/pir/core/dll_decl.h
similarity index 100%
rename from paddle/ir/core/dll_decl.h
rename to paddle/pir/core/dll_decl.h
diff --git a/paddle/ir/core/enforce.h b/paddle/pir/core/enforce.h
similarity index 95%
rename from paddle/ir/core/enforce.h
rename to paddle/pir/core/enforce.h
index 10735297f305d..a3b1401b64d25 100644
--- a/paddle/ir/core/enforce.h
+++ b/paddle/pir/core/enforce.h
@@ -30,7 +30,7 @@ inline bool is_error(const T& stat) {
   return !stat;
 }
 
-namespace ir {
+namespace pir {
 class IrNotMetException : public std::exception {
  public:
   explicit IrNotMetException(const std::string& str) : err_str_(str) {}
@@ -44,7 +44,7 @@ class IrNotMetException : public std::exception {
 #define IR_THROW(...)                                                   \
   do {                                                                  \
     try {                                                               \
-      throw ir::IrNotMetException(                                      \
+      throw pir::IrNotMetException(                                     \
           paddle::string::Sprintf("Error occured at: %s:%d :\n%s",      \
                                   __FILE__,                             \
                                   __LINE__,                             \
@@ -60,7 +60,7 @@ class IrNotMetException : public std::exception {
     bool __cond__(COND);                                                \
     if (UNLIKELY(is_error(__cond__))) {                                 \
       try {                                                             \
-        throw ir::IrNotMetException(                                    \
+        throw pir::IrNotMetException(                                   \
             paddle::string::Sprintf("Error occured at: %s:%d :\n%s",    \
                                     __FILE__,                           \
                                     __LINE__,                           \
@@ -72,4 +72,4 @@ class IrNotMetException : public std::exception {
     }                                                                   \
   } while (0)
 
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/ir/core/op_base.cc b/paddle/pir/core/interface_support.cc
similarity index 72%
rename from paddle/ir/core/op_base.cc
rename to paddle/pir/core/interface_support.cc
index 6f6dca0cdc125..19cba9de0bd85 100644
--- a/paddle/ir/core/op_base.cc
+++ b/paddle/pir/core/interface_support.cc
@@ -12,20 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/op_base.h"
-namespace ir {
-InterfaceValue::~InterfaceValue() {
+#include "paddle/pir/core/interface_support.h"
+
+namespace pir {
+details::InterfaceValue::~InterfaceValue() {
   if (model_) free(model_);
 }
 
-InterfaceValue::InterfaceValue(InterfaceValue&& val) noexcept {
+details::InterfaceValue::InterfaceValue(InterfaceValue&& val) noexcept {
   type_id_ = val.type_id_;
   model_ = val.model_;
   val.model_ = nullptr;
 }
 
-InterfaceValue& InterfaceValue::operator=(InterfaceValue&& val) noexcept {
+details::InterfaceValue& details::InterfaceValue::operator=(
+    InterfaceValue&& val) noexcept {
   swap(std::move(val));
   return *this;
 }
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/pir/core/interface_support.h b/paddle/pir/core/interface_support.h
new file mode 100644
index 0000000000000..df8f776d7b87b
--- /dev/null
+++ b/paddle/pir/core/interface_support.h
@@ -0,0 +1,122 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/interface_value.h"
+
+namespace pir {
+namespace details {
+template <typename ConcreteT, typename... Args>
+class ConstructInterfacesOrTraits {
+ public:
+  /// Construct method for interfaces.
+  static details::InterfaceValue *interface(
+      details::InterfaceValue *p_interface) {
+    (void)std::initializer_list<int>{
+        0, (PlacementConstrctInterface<Args>(p_interface), 0)...};
+    return p_interface;
+  }
+
+  /// Construct method for traits.
+  static TypeId *trait(TypeId *p_trait) {
+    (void)std::initializer_list<int>{
+        0, (PlacementConstrctTrait<Args>(p_trait), 0)...};
+    return p_trait;
+  }
+
+ private:
+  /// Placement new interface.
+  template <typename T>
+  static void PlacementConstrctInterface(
+      details::InterfaceValue *&p_interface) {  // NOLINT
+    p_interface->swap(details::InterfaceValue::get<ConcreteT, T>());
+    VLOG(6) << "New a interface: id["
+            << (p_interface->type_id()).AsOpaquePointer() << "].";
+    ++p_interface;
+  }
+
+  /// Placement new trait.
+  template <typename T>
+  static void PlacementConstrctTrait(pir::TypeId *&p_trait) {  // NOLINT
+    *p_trait = TypeId::get<T>();
+    VLOG(6) << "New a trait: id[" << p_trait->AsOpaquePointer() << "].";
+    ++p_trait;
+  }
+};
+
+/// Specialized for tuple type.
+template <typename ConcreteT, typename... Args>
+class ConstructInterfacesOrTraits<ConcreteT, std::tuple<Args...>> {
+ public:
+  /// Construct method for interfaces.
+  static details::InterfaceValue *interface(
+      details::InterfaceValue *p_interface) {
+    return ConstructInterfacesOrTraits<ConcreteT, Args...>::interface(
+        p_interface);
+  }
+
+  /// Construct method for traits.
+  static TypeId *trait(TypeId *p_trait) {
+    return ConstructInterfacesOrTraits<ConcreteT, Args...>::trait(p_trait);
+  }
+};
+
+template <typename T>
+void *LookUp(const TypeId &interface_id,
+             const uint32_t num_interfaces,
+             const uint32_t num_traits,
+             const T *t) {
+  if (num_interfaces > 0) {
+    const details::InterfaceValue *p_first_interface =
+        reinterpret_cast<const details::InterfaceValue *>(
+            reinterpret_cast<const char *>(t) - sizeof(TypeId) * num_traits -
+            sizeof(details::InterfaceValue) * num_interfaces);
+    size_t left = 0, right = num_interfaces;
+    while (left < right) {
+      size_t mid = (left + right) / 2;
+      if ((p_first_interface + mid)->type_id() == interface_id) {
+        return (p_first_interface + mid)->model();
+      } else if ((p_first_interface + mid)->type_id() < interface_id) {
+        left = mid + 1;
+      } else {
+        right = mid;
+      }
+    }
+  }
+  return nullptr;
+}
+
+template <typename T>
+std::vector<details::InterfaceValue> GetInterfaceMap() {
+  constexpr size_t interfaces_num =
+      std::tuple_size<typename T::InterfaceList>::value;
+  std::vector<details::InterfaceValue> interfaces_map(interfaces_num);
+  ConstructInterfacesOrTraits<T, typename T::InterfaceList>::interface(
+      interfaces_map.data());
+  return interfaces_map;
+}
+
+template <typename T>
+std::vector<TypeId> GetTraitSet() {
+  constexpr size_t traits_num = std::tuple_size<typename T::TraitList>::value;
+  std::vector<TypeId> trait_set(traits_num);
+  auto p_first_trait = trait_set.data();
+  ConstructInterfacesOrTraits<T, typename T::TraitList>::trait(p_first_trait);
+  return trait_set;
+}
+
+}  // namespace details
+
+}  // namespace pir
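LookUp above only works because of a layout contract: an op's interface table and trait ids are allocated immediately in front of the object it inspects, with interfaces kept sorted by TypeId. A sketch of that assumed layout:

    // Assumed layout behind LookUp (interfaces sorted by type_id()):
    //
    //   [InterfaceValue_0 .. InterfaceValue_{n-1}][TypeId_0 .. TypeId_{m-1}][T]
    //                                                                        ^t
    //
    // Subtracting sizeof(InterfaceValue) * n and sizeof(TypeId) * m from `t`
    // recovers the first interface entry, and the sorted order is what lets
    // the while-loop bisect instead of scanning linearly.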
diff --git a/paddle/pir/core/interface_value.h b/paddle/pir/core/interface_value.h
new file mode 100644
index 0000000000000..fe7bc6d9ca2a8
--- /dev/null
+++ b/paddle/pir/core/interface_value.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pir/core/type_id.h"
+#include "paddle/pir/core/utils.h"
+
+namespace pir {
+
+namespace details {
+class IR_API InterfaceValue {
+ public:
+  template <typename ConcreteT, typename T>
+  static InterfaceValue get() {
+    InterfaceValue val;
+    val.type_id_ = TypeId::get<T>();
+    val.model_ = malloc(sizeof(typename T::template Model<ConcreteT>));
+    if (val.model_ == nullptr) {
+      throw("Alloc memory for interface failed.");
+    }
+    static_assert(std::is_trivially_destructible<
+                      typename T::template Model<ConcreteT>>::value,
+                  "interface models must be trivially destructible");
+    new (val.model_) typename T::template Model<ConcreteT>();
+    return val;
+  }
+  TypeId type_id() const { return type_id_; }
+  void *model() const { return model_; }
+
+  InterfaceValue() = default;
+  explicit InterfaceValue(TypeId type_id) : type_id_(type_id) {}
+  InterfaceValue(const InterfaceValue &) = delete;
+  InterfaceValue(InterfaceValue &&) noexcept;
+  InterfaceValue &operator=(const InterfaceValue &) = delete;
+  InterfaceValue &operator=(InterfaceValue &&) noexcept;
+  ~InterfaceValue();
+  void swap(InterfaceValue &&val) {
+    using std::swap;
+    swap(type_id_, val.type_id_);
+    swap(model_, val.model_);
+  }
+
+  ///
+  /// \brief Comparison operations.
+  ///
+  inline bool operator<(const InterfaceValue &other) const {
+    return type_id_ < other.type_id_;
+  }
+
+ private:
+  TypeId type_id_;
+  void *model_{nullptr};
+};
+
+}  // namespace details
+}  // namespace pir
diff --git a/paddle/ir/core/ir_context.cc b/paddle/pir/core/ir_context.cc
similarity index 82%
rename from paddle/ir/core/ir_context.cc
rename to paddle/pir/core/ir_context.cc
index 9fe79ac84b6a4..bfc05fabcf35b 100644
--- a/paddle/ir/core/ir_context.cc
+++ b/paddle/pir/core/ir_context.cc
@@ -12,19 +12,19 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/ir/core/ir_context.h"
+#include "paddle/pir/core/ir_context.h"
 
 #include <unordered_map>
 
-#include "paddle/ir/core/attribute_base.h"
-#include "paddle/ir/core/builtin_dialect.h"
-#include "paddle/ir/core/builtin_type.h"
-#include "paddle/ir/core/dialect.h"
-#include "paddle/ir/core/op_info_impl.h"
-#include "paddle/ir/core/spin_lock.h"
-#include "paddle/ir/core/type_base.h"
+#include "paddle/pir/core/attribute_base.h"
+#include "paddle/pir/core/builtin_dialect.h"
+#include "paddle/pir/core/builtin_type.h"
+#include "paddle/pir/core/dialect.h"
+#include "paddle/pir/core/op_info_impl.h"
+#include "paddle/pir/core/spin_lock.h"
+#include "paddle/pir/core/type_base.h"
 
-namespace ir {
+namespace pir {
 // The implementation class of the IrContext class, cache registered
 // AbstractType, TypeStorage, AbstractAttribute, AttributeStorage, Dialect.
 class IrContextImpl {
@@ -32,7 +32,7 @@ class IrContextImpl {
   IrContextImpl() = default;
 
   ~IrContextImpl() {
-    std::lock_guard<ir::SpinLock> guard(destructor_lock_);
+    std::lock_guard<pir::SpinLock> guard(destructor_lock_);
     for (auto &abstract_type_map : registed_abstract_types_) {
       delete abstract_type_map.second;
     }
@@ -54,48 +54,48 @@ class IrContextImpl {
     registed_op_infos_.clear();
   }
 
-  void RegisterAbstractType(ir::TypeId type_id, AbstractType *abstract_type) {
-    std::lock_guard<ir::SpinLock> guard(registed_abstract_types_lock_);
+  void RegisterAbstractType(pir::TypeId type_id, AbstractType *abstract_type) {
+    std::lock_guard<pir::SpinLock> guard(registed_abstract_types_lock_);
     VLOG(6) << "Register an abstract_type of: [TypeId_hash="
-            << std::hash<ir::TypeId>()(type_id)
+            << std::hash<pir::TypeId>()(type_id)
             << ", AbstractType_ptr=" << abstract_type << "].";
     registed_abstract_types_.emplace(type_id, abstract_type);
   }
 
-  AbstractType *GetAbstractType(ir::TypeId type_id) {
-    std::lock_guard<ir::SpinLock> guard(registed_abstract_types_lock_);
+  AbstractType *GetAbstractType(pir::TypeId type_id) {
+    std::lock_guard<pir::SpinLock> guard(registed_abstract_types_lock_);
     auto iter = registed_abstract_types_.find(type_id);
     if (iter != registed_abstract_types_.end()) {
       VLOG(6) << "Found a cached abstract_type of: [TypeId_hash="
-              << std::hash<ir::TypeId>()(type_id)
+              << std::hash<pir::TypeId>()(type_id)
              << ", AbstractType_ptr=" << iter->second << "].";
       return iter->second;
     }
     LOG(WARNING) << "No cache found abstract_type of: [TypeId_hash="
-                 << std::hash<ir::TypeId>()(type_id) << "].";
+                 << std::hash<pir::TypeId>()(type_id) << "].";
     return nullptr;
   }
 
-  void RegisterAbstractAttribute(ir::TypeId type_id,
+  void RegisterAbstractAttribute(pir::TypeId type_id,
                                  AbstractAttribute *abstract_attribute) {
-    std::lock_guard<ir::SpinLock> guard(registed_abstract_attributes_lock_);
+    std::lock_guard<pir::SpinLock> guard(registed_abstract_attributes_lock_);
     VLOG(6) << "Register an abstract_attribute of: [TypeId_hash="
-            << std::hash<ir::TypeId>()(type_id)
+            << std::hash<pir::TypeId>()(type_id)
             << ", AbstractAttribute_ptr=" << abstract_attribute << "].";
     registed_abstract_attributes_.emplace(type_id, abstract_attribute);
   }
 
-  AbstractAttribute *GetAbstractAttribute(ir::TypeId type_id) {
-    std::lock_guard<ir::SpinLock> guard(registed_abstract_attributes_lock_);
+  AbstractAttribute *GetAbstractAttribute(pir::TypeId type_id) {
+    std::lock_guard<pir::SpinLock> guard(registed_abstract_attributes_lock_);
     auto iter = registed_abstract_attributes_.find(type_id);
     if (iter != registed_abstract_attributes_.end()) {
       VLOG(4) << "Found a cached abstract_attribute of: [TypeId_hash="
-              << std::hash<ir::TypeId>()(type_id)
+              << std::hash<pir::TypeId>()(type_id)
              << ", AbstractAttribute_ptr=" << iter->second << "].";
       return iter->second;
     }
     LOG(WARNING) << "No cache found abstract_attribute of: [TypeId_hash="
-                 << std::hash<ir::TypeId>()(type_id) << "].";
+                 << std::hash<pir::TypeId>()(type_id) << "].";
     return nullptr;
   }
 
@@ -104,14 +104,14 @@ class IrContextImpl {
   }
 
   void RegisterOpInfo(const std::string &name, OpInfo info) {
-    std::lock_guard<ir::SpinLock> guard(registed_op_infos_lock_);
+    std::lock_guard<pir::SpinLock> guard(registed_op_infos_lock_);
     VLOG(6) << "Register an operation of: [Name=" << name
             << ", OpInfo ptr=" << info.AsOpaquePointer() << "].";
     registed_op_infos_.emplace(name, info);
   }
 
   OpInfo GetOpInfo(const std::string &name) {
-    std::lock_guard<ir::SpinLock> guard(registed_op_infos_lock_);
+    std::lock_guard<pir::SpinLock> guard(registed_op_infos_lock_);
     auto iter = registed_op_infos_.find(name);
     if (iter != registed_op_infos_.end()) {
       VLOG(8) << "Found a cached OpInfo of: [name=" << name
@@ -124,7 +124,7 @@ class IrContextImpl {
   const OpInfoMap &registered_op_info_map() { return registed_op_infos_; }
 
   void RegisterDialect(std::string name, Dialect *dialect) {
std::lock_guard guard(registed_dialect_lock_); + std::lock_guard guard(registed_dialect_lock_); VLOG(6) << "Register a dialect of: [name=" << name << ", dialect_ptr=" << dialect << "]."; registed_dialect_.emplace(name, dialect); @@ -135,7 +135,7 @@ class IrContextImpl { } Dialect *GetDialect(const std::string &name) { - std::lock_guard guard(registed_dialect_lock_); + std::lock_guard guard(registed_dialect_lock_); auto iter = registed_dialect_.find(name); if (iter != registed_dialect_.end()) { VLOG(6) << "Found a cached dialect of: [name=" << name @@ -148,7 +148,7 @@ class IrContextImpl { // Cached AbstractType instances. std::unordered_map registed_abstract_types_; - ir::SpinLock registed_abstract_types_lock_; + pir::SpinLock registed_abstract_types_lock_; // TypeStorage uniquer and cache instances. StorageManager registed_type_storage_manager_; // Cache some built-in type objects. @@ -168,19 +168,19 @@ class IrContextImpl { // Cached AbstractAttribute instances. std::unordered_map registed_abstract_attributes_; - ir::SpinLock registed_abstract_attributes_lock_; + pir::SpinLock registed_abstract_attributes_lock_; // AttributeStorage uniquer and cache instances. StorageManager registed_attribute_storage_manager_; // The dialect registered in the context. std::unordered_map registed_dialect_; - ir::SpinLock registed_dialect_lock_; + pir::SpinLock registed_dialect_lock_; // The Op registered in the context. OpInfoMap registed_op_infos_; - ir::SpinLock registed_op_infos_lock_; + pir::SpinLock registed_op_infos_lock_; - ir::SpinLock destructor_lock_; + pir::SpinLock destructor_lock_; }; IrContext *IrContext::Instance() { @@ -223,7 +223,7 @@ AbstractType *IrContext::GetRegisteredAbstractType(TypeId id) { } void IrContext::RegisterAbstractAttribute( - ir::TypeId type_id, AbstractAttribute &&abstract_attribute) { + pir::TypeId type_id, AbstractAttribute &&abstract_attribute) { if (GetRegisteredAbstractAttribute(type_id) == nullptr) { impl().RegisterAbstractAttribute( type_id, new AbstractAttribute(std::move(abstract_attribute))); @@ -274,7 +274,7 @@ Dialect *IrContext::GetRegisteredDialect(const std::string &dialect_name) { return nullptr; } -void IrContext::RegisterAbstractType(ir::TypeId type_id, +void IrContext::RegisterAbstractType(pir::TypeId type_id, AbstractType &&abstract_type) { if (GetRegisteredAbstractType(type_id) == nullptr) { impl().RegisterAbstractType(type_id, @@ -284,14 +284,15 @@ void IrContext::RegisterAbstractType(ir::TypeId type_id, } } -void IrContext::RegisterOpInfo(Dialect *dialect, - TypeId op_id, - const char *name, - std::vector &&interface_map, - const std::vector &trait_set, - size_t attributes_num, - const char **attributes_name, - VerifyPtr verify) { +void IrContext::RegisterOpInfo( + Dialect *dialect, + TypeId op_id, + const char *name, + std::vector &&interface_map, + const std::vector &trait_set, + size_t attributes_num, + const char **attributes_name, + VerifyPtr verify) { if (impl().IsOpInfoRegistered(name)) { LOG(WARNING) << name << " op already registered."; } else { @@ -361,4 +362,4 @@ Complex128Type Complex128Type::get(IrContext *ctx) { return ctx->impl().complex128_type; } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/ir_context.h b/paddle/pir/core/ir_context.h similarity index 95% rename from paddle/ir/core/ir_context.h rename to paddle/pir/core/ir_context.h index ebec8d202ceb5..a68c87f3bee0b 100644 --- a/paddle/ir/core/ir_context.h +++ b/paddle/pir/core/ir_context.h @@ -18,9 +18,9 @@ #include #include -#include 
"paddle/ir/core/dll_decl.h" +#include "paddle/pir/core/dll_decl.h" -namespace ir { +namespace pir { class IrContextImpl; class StorageManager; class AbstractType; @@ -28,12 +28,13 @@ class AbstractAttribute; class TypeId; class Dialect; class OpInfo; -class InterfaceValue; class Type; class OpResult; class Attribute; class Operation; - +namespace details { +class InterfaceValue; +} using OpInfoMap = std::unordered_map; /// @@ -86,7 +87,7 @@ class IR_API IrContext { /// \param type_id The type id of the AbstractAttribute. /// \param abstract_attribute AbstractAttribute provided by user. /// - void RegisterAbstractAttribute(ir::TypeId type_id, + void RegisterAbstractAttribute(pir::TypeId type_id, AbstractAttribute &&abstract_attribute); /// @@ -109,7 +110,7 @@ class IR_API IrContext { void RegisterOpInfo(Dialect *dialect, TypeId op_id, const char *name, - std::vector &&interface_map, + std::vector &&interface_map, const std::vector &trait_set, size_t attributes_num, const char **attributes_name, @@ -190,4 +191,4 @@ class IR_API IrContext { IrContextImpl *impl_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/ir_printer.cc b/paddle/pir/core/ir_printer.cc similarity index 95% rename from paddle/ir/core/ir_printer.cc rename to paddle/pir/core/ir_printer.cc index 0d0ce64f679de..7fa8e076ad147 100644 --- a/paddle/ir/core/ir_printer.cc +++ b/paddle/pir/core/ir_printer.cc @@ -17,17 +17,17 @@ #include #include -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_type.h" -#include "paddle/ir/core/dialect.h" -#include "paddle/ir/core/ir_printer.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/utils.h" -#include "paddle/ir/core/value.h" - -namespace ir { +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_type.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/ir_printer.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/utils.h" +#include "paddle/pir/core/value.h" + +namespace pir { namespace { constexpr char newline[] = "\n"; // NOLINT @@ -334,4 +334,4 @@ std::ostream& operator<<(std::ostream& os, const Program& prog) { return os; } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/ir_printer.h b/paddle/pir/core/ir_printer.h similarity index 86% rename from paddle/ir/core/ir_printer.h rename to paddle/pir/core/ir_printer.h index c393d2dfbe90a..a845bec52490c 100644 --- a/paddle/ir/core/ir_printer.h +++ b/paddle/pir/core/ir_printer.h @@ -18,15 +18,15 @@ #include #include -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/region.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/value.h" -namespace ir { +namespace pir { class BasicIrPrinter { public: @@ -75,4 +75,4 @@ class IR_API IrPrinter : public BasicIrPrinter { std::unordered_map aliases_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/macros.h b/paddle/pir/core/macros.h similarity index 97% rename from paddle/ir/core/macros.h rename to 
paddle/pir/core/macros.h
index 962ca6d4107f3..25d6dd5a812ab 100644
--- a/paddle/ir/core/macros.h
+++ b/paddle/pir/core/macros.h
@@ -13,7 +13,7 @@
 // limitations under the License.
 #pragma once
-namespace ir {
+namespace pir {
 // TODO(Aurelius84): We also have DISABLE_COPY_AND_ASSIGN in phi/core/macros.h,
 // however it's not recommended to use it in ir namespace. So we define this again
 // here.
@@ -28,4 +28,4 @@ namespace ir {
   classname& operator=(classname&&) = delete
 #endif
-}  // namespace ir
+}  // namespace pir
diff --git a/paddle/pir/core/op_base.h b/paddle/pir/core/op_base.h
new file mode 100644
index 0000000000000..e51018d5c3f57
--- /dev/null
+++ b/paddle/pir/core/op_base.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+
+#include "paddle/pir/core/enforce.h"
+#include "paddle/pir/core/interface_support.h"
+#include "paddle/pir/core/op_result.h"
+#include "paddle/pir/core/operation.h"
+#include "paddle/pir/core/utils.h"
+
+namespace pir {
+
+class IR_API OpBase {
+ public:
+  explicit OpBase(Operation *operation = nullptr) : operation_(operation) {}
+
+  Operation *operation() const {
+    IR_ENFORCE(operation_, "Can't use operation() in a null op.");
+    return operation_;
+  }
+
+  explicit operator bool() const { return operation_ != nullptr; }
+
+  operator Operation *() const { return operation(); }
+
+  Operation *operator->() const { return operation(); }
+
+  IrContext *ir_context() const { return operation()->ir_context(); }
+
+  uint32_t num_results() const { return operation()->num_results(); }
+
+  uint32_t num_operands() const { return operation()->num_operands(); }
+
+  const AttributeMap &attributes() const { return operation()->attributes(); }
+
+  Value operand_source(uint32_t index) const {
+    return operation()->operand_source(index);
+  }
+
+  OpResult result(uint32_t index) const { return operation()->result(index); }
+
+  pir::Attribute attribute(const std::string &name) {
+    return operation()->attribute(name);
+  }
+
+  template <typename T>
+  T attribute(const std::string &name) {
+    return operation()->attribute<T>(name);
+  }
+
+ private:
+  Operation *operation_;  // Not owned
+};
+
+///
+/// \brief OpTrait
+///
+template <class ConcreteTrait>
+class OpTraitBase : public OpBase {
+ public:
+  explicit OpTraitBase(Operation *op) : OpBase(op) {}
+
+  static TypeId GetTraitId() { return TypeId::get<ConcreteTrait>(); }
+
+  static ConcreteTrait dyn_cast(Operation *op) {
+    if (op && op->HasTrait<ConcreteTrait>()) {
+      return ConcreteTrait(op);
+    }
+    return ConcreteTrait(nullptr);
+  }
+};
+
+///
+/// \brief OpInterface
+///
+template <typename ConcreteInterface>
+class OpInterfaceBase : public OpBase {
+ public:
+  explicit OpInterfaceBase(Operation *op) : OpBase(op) {}
+
+  // Accessor for the ID of this interface.
+ static TypeId GetInterfaceId() { return TypeId::get(); } + + static ConcreteInterface dyn_cast(Operation *op) { + if (op && op->HasInterface()) { + return ConcreteInterface( + op, op->info().GetInterfaceImpl()); + } + return ConcreteInterface(nullptr, nullptr); + } +}; + +template +class Op : public OpBase { + public: + using OpBase::OpBase; + + using TraitList = + typename Filter>::Type; + + using InterfaceList = + typename Filter>::Type; + + static ConcreteOp dyn_cast(Operation *op) { + if (op && op->info().id() == TypeId::get()) { + return ConcreteOp(op); + } + return ConcreteOp(nullptr); + } + + static bool classof(const Operation *op) { + return op && op->info().id() == TypeId::get(); + } + + static std::vector GetInterfaceMap() { + return pir::details::GetInterfaceMap(); + } + + static std::vector GetTraitSet() { + return pir::details::GetTraitSet(); + } + + // Checking that the derived class does not define any member by comparing + // its size to an ad-hoc EmptyOp. + static constexpr bool HasNoDataMembers() { + class EmptyOp : public Op {}; + return sizeof(ConcreteOp) == sizeof(EmptyOp); + } + + // Implementation of `VerifyInvariantsFn` OperationName hook. + static void VerifyInvariants(Operation *op) { + static_assert(HasNoDataMembers(), + "Op class shouldn't define new data members"); + op->dyn_cast().Verify(); + } +}; + +} // namespace pir diff --git a/paddle/ir/core/op_info.cc b/paddle/pir/core/op_info.cc similarity index 87% rename from paddle/ir/core/op_info.cc rename to paddle/pir/core/op_info.cc index 6c9b62f56e63f..b018bec30448d 100644 --- a/paddle/ir/core/op_info.cc +++ b/paddle/pir/core/op_info.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/op_info.h" -#include "paddle/ir/core/dialect.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/op_info_impl.h" +#include "paddle/pir/core/op_info.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/op_info_impl.h" -namespace ir { +namespace pir { bool OpInfo::HasTrait(TypeId trait_id) const { return impl_ && impl_->HasTrait(trait_id); } @@ -40,4 +40,4 @@ void OpInfo::Verify(Operation *operation) const { impl_->verify()(operation); } void *OpInfo::GetInterfaceImpl(TypeId interface_id) const { return impl_ ? 
impl_->GetInterfaceImpl(interface_id) : nullptr; } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/op_info.h b/paddle/pir/core/op_info.h similarity index 70% rename from paddle/ir/core/op_info.h rename to paddle/pir/core/op_info.h index f92d37d4b33e0..130c05037d8ae 100644 --- a/paddle/ir/core/op_info.h +++ b/paddle/pir/core/op_info.h @@ -16,9 +16,9 @@ #include #include -#include "paddle/ir/core/type_id.h" +#include "paddle/pir/core/type_id.h" -namespace ir { +namespace pir { class OpInfoImpl; class IrContext; class OpResult; @@ -61,15 +61,15 @@ class IR_API OpInfo { bool HasTrait(TypeId trait_id) const; - template + template bool HasInterface() const { - return HasInterface(TypeId::get()); + return HasInterface(TypeId::get()); } bool HasInterface(TypeId interface_id) const; - template - typename Interface::Concept *GetInterfaceImpl() const; + template + typename InterfaceT::Concept *GetInterfaceImpl() const; void *AsOpaquePointer() const { return impl_; } static OpInfo RecoverFromOpaquePointer(void *pointer) { @@ -84,22 +84,28 @@ class IR_API OpInfo { void *GetInterfaceImpl(TypeId interface_id) const; private: - OpInfoImpl *impl_{nullptr}; // not owned + /// The internal implementation of the operation name. + /// Not owned. + OpInfoImpl *impl_{nullptr}; }; -template -typename Interface::Concept *OpInfo::GetInterfaceImpl() const { - void *model = GetInterfaceImpl(TypeId::get()); - return reinterpret_cast(model); +/// +/// \brief Returns an instance of the concept object for the given interface if +/// it was registered to this operation, null otherwise. +/// +template +typename InterfaceT::Concept *OpInfo::GetInterfaceImpl() const { + void *model = GetInterfaceImpl(TypeId::get()); + return reinterpret_cast(model); } -} // namespace ir +} // namespace pir namespace std { template <> -struct hash { - std::size_t operator()(const ir::OpInfo &obj) const { - return std::hash()(obj.impl_); +struct hash { + std::size_t operator()(const pir::OpInfo &obj) const { + return std::hash()(obj.impl_); } }; } // namespace std diff --git a/paddle/ir/core/op_info_impl.cc b/paddle/pir/core/op_info_impl.cc similarity index 69% rename from paddle/ir/core/op_info_impl.cc rename to paddle/pir/core/op_info_impl.cc index 90469f1731be9..fa91d3173389a 100644 --- a/paddle/ir/core/op_info_impl.cc +++ b/paddle/pir/core/op_info_impl.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
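A usage sketch for the templated accessors in op_info.h above, reusing the hypothetical `InferShapeInterface` from the earlier note; `Operation::info()` is the accessor that appears later in this patch in operation.h:

```cpp
// Query an operation's OpInfo for an interface and dispatch through the
// returned Concept pointer. HasInterface guards the call; calling
// GetInterfaceImpl alone would simply return nullptr for an
// unimplemented interface.
void TryInferShape(pir::Operation *op) {
  pir::OpInfo info = op->info();
  if (!info.HasInterface<InferShapeInterface>()) return;
  InferShapeInterface::Concept *impl =
      info.GetInterfaceImpl<InferShapeInterface>();
  impl->infer_shape(op);
}
```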
-#include "paddle/ir/core/op_info_impl.h" -#include "paddle/ir/core/dialect.h" +#include "paddle/pir/core/op_info_impl.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/interface_support.h" -namespace ir { +namespace pir { OpInfo OpInfoImpl::Create(Dialect *dialect, TypeId op_id, const char *op_name, - std::vector &&interface_map, + std::vector &&interface_map, const std::vector &trait_set, size_t attributes_num, const char *attributes_name[], // NOLINT @@ -29,7 +30,7 @@ OpInfo OpInfoImpl::Create(Dialect *dialect, size_t traits_num = trait_set.size(); VLOG(6) << "Create OpInfoImpl with: " << interfaces_num << " interfaces, " << traits_num << " traits, " << attributes_num << " attributes."; - size_t base_size = sizeof(InterfaceValue) * interfaces_num + + size_t base_size = sizeof(details::InterfaceValue) * interfaces_num + sizeof(TypeId) * traits_num + sizeof(OpInfoImpl); char *base_ptr = static_cast(::operator new(base_size)); VLOG(6) << "Malloc " << base_size << " Bytes at " @@ -37,10 +38,10 @@ OpInfo OpInfoImpl::Create(Dialect *dialect, if (interfaces_num > 0) { std::sort(interface_map.begin(), interface_map.end()); for (size_t index = 0; index < interfaces_num; ++index) { - new (base_ptr + index * sizeof(InterfaceValue)) - InterfaceValue(std::move(interface_map[index])); + new (base_ptr + index * sizeof(details::InterfaceValue)) + details::InterfaceValue(std::move(interface_map[index])); } - base_ptr += interfaces_num * sizeof(InterfaceValue); + base_ptr += interfaces_num * sizeof(details::InterfaceValue); } if (traits_num > 0) { auto p_first_trait = reinterpret_cast(base_ptr); @@ -69,7 +70,7 @@ void OpInfoImpl::Destroy(OpInfo info) { } } -ir::IrContext *OpInfoImpl::ir_context() const { +pir::IrContext *OpInfoImpl::ir_context() const { return dialect_ ? 
dialect_->ir_context() : nullptr; } @@ -77,7 +78,7 @@ bool OpInfoImpl::HasTrait(TypeId trait_id) const { if (num_traits_ > 0) { const TypeId *p_first_trait = reinterpret_cast(reinterpret_cast(this) - - sizeof(ir::TypeId) * num_traits_); + sizeof(pir::TypeId) * num_traits_); return std::binary_search( p_first_trait, p_first_trait + num_traits_, trait_id); } @@ -86,49 +87,32 @@ bool OpInfoImpl::HasTrait(TypeId trait_id) const { bool OpInfoImpl::HasInterface(TypeId interface_id) const { if (num_interfaces_ > 0) { - const InterfaceValue *p_first_interface = - reinterpret_cast( + const details::InterfaceValue *p_first_interface = + reinterpret_cast( reinterpret_cast(this) - - sizeof(ir::TypeId) * num_traits_ - - sizeof(InterfaceValue) * num_interfaces_); + sizeof(pir::TypeId) * num_traits_ - + sizeof(details::InterfaceValue) * num_interfaces_); return std::binary_search(p_first_interface, p_first_interface + num_interfaces_, - InterfaceValue(interface_id)); + details::InterfaceValue(interface_id)); } return false; } void *OpInfoImpl::GetInterfaceImpl(TypeId interface_id) const { - if (num_interfaces_ > 0) { - const InterfaceValue *p_first_interface = - reinterpret_cast( - reinterpret_cast(this) - - sizeof(TypeId) * num_traits_ - - sizeof(InterfaceValue) * num_interfaces_); - size_t left = 0, right = num_interfaces_; - while (left < right) { - size_t mid = (left + right) / 2; - if ((p_first_interface + mid)->type_id() == interface_id) { - return (p_first_interface + mid)->model(); - } else if ((p_first_interface + mid)->type_id() < interface_id) { - left = mid + 1; - } else { - right = mid; - } - } - } - return nullptr; + return pir::details::LookUp( + interface_id, num_interfaces_, num_traits_, this); } void OpInfoImpl::Destroy() { VLOG(10) << "Destroy op_info impl at " << this; // (1) free interfaces char *base_ptr = reinterpret_cast(this) - - sizeof(ir::TypeId) * num_traits_ - - sizeof(InterfaceValue) * num_interfaces_; + sizeof(pir::TypeId) * num_traits_ - + sizeof(details::InterfaceValue) * num_interfaces_; if (num_interfaces_ > 0) { - InterfaceValue *p_interface_val = - reinterpret_cast(base_ptr); + details::InterfaceValue *p_interface_val = + reinterpret_cast(base_ptr); for (size_t i = 0; i < num_interfaces_; i++) { (p_interface_val + i)->~InterfaceValue(); } @@ -138,4 +122,4 @@ void OpInfoImpl::Destroy() { free(base_ptr); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/op_info_impl.h b/paddle/pir/core/op_info_impl.h similarity index 91% rename from paddle/ir/core/op_info_impl.h rename to paddle/pir/core/op_info_impl.h index 52666f1b377c8..410c9ef371989 100644 --- a/paddle/ir/core/op_info_impl.h +++ b/paddle/pir/core/op_info_impl.h @@ -19,11 +19,11 @@ #include #include -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/op_base.h" -#include "paddle/ir/core/type.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/op_base.h" +#include "paddle/pir/core/type.h" -namespace ir { +namespace pir { class Dialect; /// @@ -38,7 +38,7 @@ class OpInfoImpl { static OpInfo Create(Dialect *dialect, TypeId op_id, const char *op_name, - std::vector &&interface_map, + std::vector &&interface_map, const std::vector &trait_set, size_t attributes_num, const char *attributes_name[], @@ -69,7 +69,7 @@ class OpInfoImpl { } private: - OpInfoImpl(ir::Dialect *dialect, + OpInfoImpl(pir::Dialect *dialect, TypeId op_id, const char *op_name, uint32_t num_interfaces, @@ -111,4 +111,4 @@ class OpInfoImpl { VerifyPtr verify_{nullptr}; }; -} // namespace 
ir +} // namespace pir diff --git a/paddle/pir/core/op_operand.cc b/paddle/pir/core/op_operand.cc new file mode 100644 index 0000000000000..b27f02ac23d4c --- /dev/null +++ b/paddle/pir/core/op_operand.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/pir/core/op_operand.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/op_operand_impl.h" + +#define CHECK_NULL_IMPL(class_name, func_name) \ + IR_ENFORCE(impl_, \ + "impl_ pointer is null when call func:" #func_name \ + " , in class: " #class_name ".") + +#define CHECK_OPOPEREND_NULL_IMPL(func_name) \ + CHECK_NULL_IMPL(OpOpernad, func_name) + +namespace pir { + +OpOperand::OpOperand(const detail::OpOperandImpl *impl) + : impl_(const_cast(impl)) {} + +OpOperand &OpOperand::operator=(const OpOperand &rhs) { + impl_ = rhs.impl_; + return *this; +} + +OpOperand &OpOperand::operator=(const detail::OpOperandImpl *impl) { + if (this->impl_ == impl) return *this; + impl_ = const_cast(impl); + return *this; +} +OpOperand::operator bool() const { return impl_ && impl_->source(); } + +OpOperand OpOperand::next_use() const { + CHECK_OPOPEREND_NULL_IMPL(next_use); + return impl_->next_use(); +} + +Value OpOperand::source() const { + CHECK_OPOPEREND_NULL_IMPL(source); + return impl_->source(); +} + +Type OpOperand::type() const { return source().type(); } + +void OpOperand::set_source(Value value) { + CHECK_OPOPEREND_NULL_IMPL(set_source); + impl_->set_source(value); +} + +Operation *OpOperand::owner() const { + CHECK_OPOPEREND_NULL_IMPL(owner); + return impl_->owner(); +} + +void OpOperand::RemoveFromUdChain() { + CHECK_OPOPEREND_NULL_IMPL(RemoveFromUdChain); + return impl_->RemoveFromUdChain(); +} + +} // namespace pir diff --git a/paddle/pir/core/op_operand.h b/paddle/pir/core/op_operand.h new file mode 100644 index 0000000000000..96b355b861ffa --- /dev/null +++ b/paddle/pir/core/op_operand.h @@ -0,0 +1,69 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/dll_decl.h" + +namespace pir { +class Operation; +class Value; +class Type; + +namespace detail { +class OpOperandImpl; +} // namespace detail + +/// +/// \brief OpOperand class represents the op_operand of operation. This class +/// only provides interfaces, for specific implementation, see Impl class. 
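`OpOperand` above is a value-semantic handle over `detail::OpOperandImpl`; the impl (next file) threads every operand of a value into a singly linked use list through `next_use_`. A sketch of walking that list, assuming `pir::Value` exposes a `first_use()` accessor returning the head of the chain (that accessor is not part of this hunk):

```cpp
// Count the users of a value by following the use-def chain that
// OpOperandImpl maintains via InsertToUdChain/RemoveFromUdChain. The
// loop terminates when next_use() yields a handle with a null impl,
// which converts to false through OpOperand::operator bool.
size_t CountUses(pir::Value value) {
  size_t n = 0;
  for (pir::OpOperand use = value.first_use(); use; use = use.next_use()) {
    ++n;  // use.owner() would yield the consuming Operation here
  }
  return n;
}
```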
+/// +class IR_API OpOperand { + public: + OpOperand() = default; + + OpOperand(const OpOperand &other) = default; + + OpOperand(const detail::OpOperandImpl *impl); // NOLINT + + OpOperand &operator=(const OpOperand &rhs); + + OpOperand &operator=(const detail::OpOperandImpl *impl); + + bool operator==(const OpOperand &other) const { return impl_ == other.impl_; } + + bool operator!=(const OpOperand &other) const { return !operator==(other); } + + bool operator!() const { return impl_ == nullptr; } + + operator bool() const; + + OpOperand next_use() const; + + Value source() const; + + Type type() const; + + void set_source(Value value); + + Operation *owner() const; + + void RemoveFromUdChain(); + + friend Operation; + + private: + detail::OpOperandImpl *impl_{nullptr}; +}; +} // namespace pir diff --git a/paddle/pir/core/op_operand_impl.cc b/paddle/pir/core/op_operand_impl.cc new file mode 100644 index 0000000000000..44a3a5f28bb6e --- /dev/null +++ b/paddle/pir/core/op_operand_impl.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/pir/core/op_operand_impl.h" +#include "paddle/pir/core/value_impl.h" + +namespace pir { +namespace detail { + +pir::Operation *OpOperandImpl::owner() const { return owner_; } + +pir::detail::OpOperandImpl *OpOperandImpl::next_use() { return next_use_; } + +pir::Value OpOperandImpl::source() const { return source_; } + +void OpOperandImpl::set_source(Value source) { + RemoveFromUdChain(); + if (!source) { + return; + } + source_ = source; + InsertToUdChain(); +} + +OpOperandImpl::OpOperandImpl(pir::Value source, pir::Operation *owner) + : source_(source), owner_(owner) { + if (!source) { + return; + } + InsertToUdChain(); +} + +void OpOperandImpl::InsertToUdChain() { + prev_use_addr_ = source_.impl()->first_use_addr(); + next_use_ = source_.impl()->first_use(); + if (next_use_) { + next_use_->prev_use_addr_ = &next_use_; + } + source_.impl()->set_first_use(this); +} + +void OpOperandImpl::RemoveFromUdChain() { + if (!source_) return; + if (!prev_use_addr_) return; + if (prev_use_addr_ == source_.impl()->first_use_addr()) { + /// NOTE: In ValueImpl, first_use_offseted_by_index_ use lower three bits + /// storage index information, so need to be updated using the set_first_use + /// method here. + source_.impl()->set_first_use(next_use_); + } else { + *prev_use_addr_ = next_use_; + } + if (next_use_) { + next_use_->prev_use_addr_ = prev_use_addr_; + } + next_use_ = nullptr; + prev_use_addr_ = nullptr; + source_ = nullptr; +} + +OpOperandImpl::~OpOperandImpl() { RemoveFromUdChain(); } + +} // namespace detail +} // namespace pir diff --git a/paddle/pir/core/op_operand_impl.h b/paddle/pir/core/op_operand_impl.h new file mode 100644 index 0000000000000..f1bc9d23c0928 --- /dev/null +++ b/paddle/pir/core/op_operand_impl.h @@ -0,0 +1,60 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/pir/core/value.h" + +namespace pir { + +class Operation; + +namespace detail { +/// +/// \brief OpOperandImpl +/// +class OpOperandImpl { + public: + Operation *owner() const; + + OpOperandImpl *next_use(); + + Value source() const; + + void set_source(Value value); + + /// Remove this op_operand from the current use list. + void RemoveFromUdChain(); + + ~OpOperandImpl(); + + friend Operation; + + private: + OpOperandImpl(Value source, Operation *owner); + + // Insert self to the UD chain holded by source_; + // It is not safe. So set private. + void InsertToUdChain(); + + Value source_; + + OpOperandImpl *next_use_ = nullptr; + + OpOperandImpl **prev_use_addr_ = nullptr; + + Operation *const owner_ = nullptr; +}; + +} // namespace detail +} // namespace pir diff --git a/paddle/pir/core/op_result.cc b/paddle/pir/core/op_result.cc new file mode 100644 index 0000000000000..510f98d99b526 --- /dev/null +++ b/paddle/pir/core/op_result.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/pir/core/op_result.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/op_result_impl.h" + +#define CHECK_NULL_IMPL(class_name, func_name) \ + IR_ENFORCE(impl_, \ + "impl_ pointer is null when call func:" #func_name \ + " , in class: " #class_name ".") + +#define CHECK_OPRESULT_NULL_IMPL(func_name) CHECK_NULL_IMPL(OpResult, func_name) + +namespace pir { + +// OpResult +bool OpResult::classof(Value value) { + return value && pir::isa(value.impl()); +} + +Operation *OpResult::owner() const { + CHECK_OPRESULT_NULL_IMPL(owner); + return impl()->owner(); +} + +uint32_t OpResult::GetResultIndex() const { + CHECK_OPRESULT_NULL_IMPL(GetResultIndex); + return impl()->GetResultIndex(); +} + +detail::OpResultImpl *OpResult::impl() const { + return reinterpret_cast(impl_); +} + +bool OpResult::operator==(const OpResult &other) const { + return impl_ == other.impl_; +} + +uint32_t OpResult::GetValidInlineIndex(uint32_t index) { + uint32_t max_inline_index = + pir::detail::OpResultImpl::GetMaxInlineResultIndex(); + return index <= max_inline_index ? 
index : max_inline_index; +} + +} // namespace pir diff --git a/paddle/pir/core/op_result.h b/paddle/pir/core/op_result.h new file mode 100644 index 0000000000000..1a5f14a9a17fe --- /dev/null +++ b/paddle/pir/core/op_result.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/value.h" +namespace pir { + +namespace detail { +class OpResultImpl; +} // namespace detail + +/// +/// \brief OpResult class represents the value defined by a result of operation. +/// This class only provides interfaces, for specific implementation, see Impl +/// class. +/// +class IR_API OpResult : public Value { + public: + using Value::Value; + + static bool classof(Value value); + + Operation *owner() const; + + uint32_t GetResultIndex() const; + + bool operator==(const OpResult &other) const; + + friend Operation; + + detail::OpResultImpl *impl() const; + + private: + static uint32_t GetValidInlineIndex(uint32_t index); +}; + +} // namespace pir diff --git a/paddle/pir/core/op_result_impl.cc b/paddle/pir/core/op_result_impl.cc new file mode 100644 index 0000000000000..49b9f40259845 --- /dev/null +++ b/paddle/pir/core/op_result_impl.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/pir/core/op_result_impl.h" + +#include + +namespace pir { +namespace detail { + +uint32_t OpResultImpl::GetResultIndex() const { + if (const auto *outline_result = dyn_cast(this)) { + return outline_result->GetResultIndex(); + } + return dyn_cast(this)->GetResultIndex(); +} + +OpResultImpl::~OpResultImpl() { assert(use_empty()); } + +Operation *OpResultImpl::owner() const { + // For inline result, pointer offset index to obtain the address of op. + if (const auto *result = dyn_cast(this)) { + result += result->GetResultIndex() + 1; + return reinterpret_cast( + const_cast(result)); + } + // For outline result, pointer offset outline_index to obtain the address of + // maximum inline result. + const OpOutlineResultImpl *outline_result = + (const OpOutlineResultImpl *)(this); + outline_result += + (outline_result->outline_index_ - GetMaxInlineResultIndex()); + // The offset of the maximum inline result distance op is + // GetMaxInlineResultIndex. 
+ const auto *inline_result = + reinterpret_cast(outline_result); + inline_result += (GetMaxInlineResultIndex() + 1); + return reinterpret_cast( + const_cast(inline_result)); +} + +} // namespace detail +} // namespace pir diff --git a/paddle/pir/core/op_result_impl.h b/paddle/pir/core/op_result_impl.h new file mode 100644 index 0000000000000..99601a27911af --- /dev/null +++ b/paddle/pir/core/op_result_impl.h @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/value_impl.h" + +namespace pir { +namespace detail { +/// +/// \brief OpResultImpl is the implementation of an operation result. +/// +class OpResultImpl : public ValueImpl { + public: + using ValueImpl::ValueImpl; + + static bool classof(const ValueImpl &value) { + return value.kind() <= OUTLINE_OP_RESULT_INDEX; + } + + /// + /// \brief Get the parent operation of this result.(op_ptr = value_ptr + + /// index) + /// + Operation *owner() const; + + /// + /// \brief Get the result index of the operation result. + /// + uint32_t GetResultIndex() const; + + /// + /// \brief Get the maximum number of results that can be stored inline. + /// + static uint32_t GetMaxInlineResultIndex() { + return OUTLINE_OP_RESULT_INDEX - 1; + } + + ~OpResultImpl(); +}; + +/// +/// \brief OpInlineResultImpl is the implementation of an operation result whose +/// index <= 5. +/// +class OpInlineResultImpl : public OpResultImpl { + public: + OpInlineResultImpl(Type type, uint32_t result_index) + : OpResultImpl(type, result_index) { + if (result_index > GetMaxInlineResultIndex()) { + throw("Inline result index should not exceed MaxInlineResultIndex(5)"); + } + } + + static bool classof(const OpResultImpl &value) { + return value.kind() < OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return kind(); } +}; + +/// +/// \brief OpOutlineResultImpl is the implementation of an operation result +/// whose index > 5. 
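For orientation, the result classes in this file combine with the `Operation::Create` ordering described below into the following layout (a sketch; `OUTLINE_OP_RESULT_INDEX == 6` is inferred from `GetMaxInlineResultIndex()` and the "index <= 5" comments above):

```cpp
// Memory picture for an operation with 8 results, as allocated by
// Operation::Create (results sit before the Operation, outline first):
//
//   [OpOutlineResultImpl idx=7]
//   [OpOutlineResultImpl idx=6]  kind() == OUTLINE_OP_RESULT_INDEX
//   [OpInlineResultImpl  idx=5]
//   ...
//   [OpInlineResultImpl  idx=0]  adjacent to the Operation object
//   [Operation]
//   [OpOperandImpl 0 .. n-1]     operands follow the Operation
//
// This is why OpResultImpl::owner() can recover the Operation* with
// pointer arithmetic alone: an inline result at index k lies exactly
// (k + 1) inline-result slots before the Operation.
```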
+/// +class OpOutlineResultImpl : public OpResultImpl { + public: + OpOutlineResultImpl(Type type, uint32_t outline_index) + : OpResultImpl(type, OUTLINE_OP_RESULT_INDEX), + outline_index_(outline_index) {} + + static bool classof(const OpResultImpl &value) { + return value.kind() == OUTLINE_OP_RESULT_INDEX; + } + + uint32_t GetResultIndex() const { return outline_index_; } + + uint32_t outline_index_; +}; + +} // namespace detail +} // namespace pir diff --git a/paddle/ir/core/operation.cc b/paddle/pir/core/operation.cc similarity index 93% rename from paddle/ir/core/operation.cc rename to paddle/pir/core/operation.cc index 3d316847d9fc1..fdb850bc1f415 100644 --- a/paddle/ir/core/operation.cc +++ b/paddle/pir/core/operation.cc @@ -14,18 +14,18 @@ #include -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/block_operand_impl.h" -#include "paddle/ir/core/dialect.h" -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/op_info.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/program.h" -#include "paddle/ir/core/region.h" -#include "paddle/ir/core/utils.h" -#include "paddle/ir/core/value_impl.h" - -namespace ir { +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/block_operand_impl.h" +#include "paddle/pir/core/dialect.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/op_info.h" +#include "paddle/pir/core/op_result_impl.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/utils.h" + +namespace pir { Operation *Operation::Create(OperationArgument &&argument) { return Create(argument.inputs, argument.attributes, @@ -38,10 +38,10 @@ Operation *Operation::Create(OperationArgument &&argument) { // Allocate the required memory based on the size and number of inputs, outputs, // and operators, and construct it in the order of: OpOutlineResult, // OpInlineResult, Operation, operand. -Operation *Operation::Create(const std::vector &inputs, +Operation *Operation::Create(const std::vector &inputs, const AttributeMap &attributes, const std::vector &output_types, - ir::OpInfo op_info, + pir::OpInfo op_info, size_t num_regions, const std::vector &successors) { // 1. 
Calculate the required memory size for OpResults + Operation + @@ -179,7 +179,7 @@ IrContext *Operation::ir_context() const { return info_.ir_context(); } Dialect *Operation::dialect() const { return info_.dialect(); } Operation::Operation(const AttributeMap &attributes, - ir::OpInfo op_info, + pir::OpInfo op_info, uint32_t num_results, uint32_t num_operands, uint32_t num_regions, @@ -191,7 +191,7 @@ Operation::Operation(const AttributeMap &attributes, num_regions_(num_regions), num_successors_(num_successors) {} -ir::OpResult Operation::result(uint32_t index) const { +pir::OpResult Operation::result(uint32_t index) const { if (index >= num_results_) { IR_THROW("index exceeds OP output range."); } @@ -204,10 +204,10 @@ ir::OpResult Operation::result(uint32_t index) const { : reinterpret_cast(this) - (index + 1) * sizeof(detail::OpInlineResultImpl); if (index > max_inline_idx) { - return ir::OpResult( + return pir::OpResult( reinterpret_cast(ptr)); } else { - return ir::OpResult( + return pir::OpResult( reinterpret_cast(ptr)); } } @@ -318,4 +318,4 @@ std::vector Operation::results() const { return res; } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/operation.h b/paddle/pir/core/operation.h similarity index 90% rename from paddle/ir/core/operation.h rename to paddle/pir/core/operation.h index 961e4a5fccc50..28c0b42671c96 100644 --- a/paddle/ir/core/operation.h +++ b/paddle/pir/core/operation.h @@ -16,14 +16,14 @@ #include #include -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/macros.h" -#include "paddle/ir/core/op_info.h" -#include "paddle/ir/core/operation_utils.h" -#include "paddle/ir/core/type.h" - -namespace ir { +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/macros.h" +#include "paddle/pir/core/op_info.h" +#include "paddle/pir/core/operation_utils.h" +#include "paddle/pir/core/type.h" + +namespace pir { class OpBase; class Program; class OpOperand; @@ -41,10 +41,10 @@ class IR_API alignas(8) Operation final { /// NOTE: Similar to new and delete, the destroy() and the create() need to be /// used in conjunction. 
/// - static Operation *Create(const std::vector &inputs, + static Operation *Create(const std::vector &inputs, const AttributeMap &attributes, - const std::vector &output_types, - ir::OpInfo op_info, + const std::vector &output_types, + pir::OpInfo op_info, size_t num_regions = 0, const std::vector &successors = {}); static Operation *Create(OperationArgument &&op_argument); @@ -96,7 +96,7 @@ class IR_API alignas(8) Operation final { return attributes_.find(key) != attributes_.end(); } - ir::OpInfo info() const { return info_; } + pir::OpInfo info() const { return info_; } uint32_t num_results() const { return num_results_; } @@ -164,7 +164,7 @@ class IR_API alignas(8) Operation final { private: DISABLE_COPY_AND_ASSIGN(Operation); Operation(const AttributeMap &attribute, - ir::OpInfo op_info, + pir::OpInfo op_info, uint32_t num_results, uint32_t num_operands, uint32_t num_regions, @@ -203,4 +203,4 @@ class IR_API alignas(8) Operation final { Block::iterator position_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/operation_utils.cc b/paddle/pir/core/operation_utils.cc similarity index 83% rename from paddle/ir/core/operation_utils.cc rename to paddle/pir/core/operation_utils.cc index f975de0c82807..a8eedcfcb8c48 100644 --- a/paddle/ir/core/operation_utils.cc +++ b/paddle/pir/core/operation_utils.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/operation_utils.h" -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/region.h" +#include "paddle/pir/core/operation_utils.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/region.h" -namespace ir { +namespace pir { OperationArgument::OperationArgument(IrContext* ir_context, const std::string& name) { info = ir_context->GetRegisteredOpInfo(name); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/operation_utils.h b/paddle/pir/core/operation_utils.h similarity index 92% rename from paddle/ir/core/operation_utils.h rename to paddle/pir/core/operation_utils.h index 9e317a6510f59..39c41c6eae2c3 100644 --- a/paddle/ir/core/operation_utils.h +++ b/paddle/pir/core/operation_utils.h @@ -15,13 +15,14 @@ #pragma once #include -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/op_info.h" -#include "paddle/ir/core/region.h" -#include "paddle/ir/core/type.h" -#include "paddle/ir/core/value.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/op_info.h" +#include "paddle/pir/core/op_result.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/value.h" -namespace ir { +namespace pir { class Block; using AttributeMap = std::unordered_map; @@ -100,4 +101,4 @@ void OperationArgument::AddAttributes(InputIt first, InputIt last) { } } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/parameter.h b/paddle/pir/core/parameter.h similarity index 92% rename from paddle/ir/core/parameter.h rename to paddle/pir/core/parameter.h index 3dbe48935b09a..332ef23322e01 100644 --- a/paddle/ir/core/parameter.h +++ b/paddle/pir/core/parameter.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/ir/core/type.h" +#include "paddle/pir/core/type.h" -namespace ir { +namespace pir { /// /// \brief Parameter represents the weight in the calculation graph. 
/// class IR_API Parameter { public: - Parameter(void* data, size_t size, ir::Type type) { + Parameter(void* data, size_t size, pir::Type type) { data_ = malloc(size); memcpy(data_, data, size); size_ = size; @@ -67,4 +67,4 @@ class IR_API Parameter { bool is_mutable_ = false; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/parser/ir_parser.cc b/paddle/pir/core/parser/ir_parser.cc similarity index 96% rename from paddle/ir/core/parser/ir_parser.cc rename to paddle/pir/core/parser/ir_parser.cc index 8d7e437635165..3fe336fc63289 100644 --- a/paddle/ir/core/parser/ir_parser.cc +++ b/paddle/pir/core/parser/ir_parser.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/ir_parser.h" +#include "paddle/pir/core/parser/ir_parser.h" -#include "paddle/ir/core/builtin_dialect.h" -#include "paddle/ir/core/builtin_type.h" +#include "paddle/pir/core/builtin_dialect.h" +#include "paddle/pir/core/builtin_type.h" -namespace ir { +namespace pir { IrParser::IrParser(IrContext* ctx, std::istream& is) { lexer.reset(new Lexer{is}); this->ctx = ctx; @@ -216,9 +216,9 @@ Operation* IrParser::ParseOperation() { OpInfo opinfo = ParseOpInfo(); - std::vector inputs = ParseOpRandList(); + std::vector inputs = ParseOprandList(); - ir::AttributeMap attributeMap = ParseAttributeMap(); + pir::AttributeMap attributeMap = ParseAttributeMap(); ConsumeAToken(":"); ConsumeAToken("("); @@ -269,7 +269,7 @@ OpInfo IrParser::ParseOpInfo() { // OprandList := ValueList // ValueList := ValueId(,ValueId)* -std::vector IrParser::ParseOpRandList() { +std::vector IrParser::ParseOprandList() { ConsumeAToken("("); std::vector inputs{}; Token ind_token = ConsumeToken(); @@ -348,4 +348,4 @@ std::unique_ptr Program::Parse(std::istream& is, IrContext* ctx) { return parser.ParseProgram(); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/ir_parser.h b/paddle/pir/core/parser/ir_parser.h similarity index 80% rename from paddle/ir/core/ir_parser.h rename to paddle/pir/core/parser/ir_parser.h index dbba3e2aaba80..c10e88225984b 100644 --- a/paddle/ir/core/ir_parser.h +++ b/paddle/pir/core/parser/ir_parser.h @@ -13,16 +13,16 @@ // limitations under the License. 
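`operator<<(std::ostream&, const Program&)` from ir_printer.cc and `Program::Parse` above together give the textual IR a round-trip; a sketch, assuming the relevant dialects are already registered in the context:

```cpp
#include <memory>
#include <sstream>

// Serialize a Program through IrPrinter and re-parse it with IrParser.
// Program::Parse is the entry point defined at the end of ir_parser.cc.
std::unique_ptr<pir::Program> RoundTrip(const pir::Program &program,
                                        pir::IrContext *ctx) {
  std::stringstream ss;
  ss << program;
  return pir::Program::Parse(ss, ctx);
}
```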
#pragma once -#include "paddle/ir/core/ir_context.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/parser/lexer.h" -#include "paddle/ir/core/program.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/parser/lexer.h" +#include "paddle/pir/core/program.h" -using OpResultMap = std::map; -using AttributeMap = std::unordered_map; +using OpResultMap = std::map; +using AttributeMap = std::unordered_map; using OpAttributeInfoMap = std::map; -namespace ir { +namespace pir { class IrParser { public: std::unique_ptr lexer; @@ -51,7 +51,7 @@ class IrParser { std::vector ParseOpResultList(); - std::vector ParseOpRandList(); + std::vector ParseOprandList(); AttributeMap ParseAttributeMap(); @@ -68,4 +68,4 @@ class IrParser { void ConsumeAToken(std::string expect_token_val); }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/parser/lexer.cc b/paddle/pir/core/parser/lexer.cc similarity index 99% rename from paddle/ir/core/parser/lexer.cc rename to paddle/pir/core/parser/lexer.cc index af1530a5b2961..c7f037de9927d 100644 --- a/paddle/ir/core/parser/lexer.cc +++ b/paddle/pir/core/parser/lexer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/parser/lexer.h" +#include "paddle/pir/core/parser/lexer.h" Token Lexer::ConsumeToken() { SkipWhitespace(); diff --git a/paddle/ir/core/parser/lexer.h b/paddle/pir/core/parser/lexer.h similarity index 96% rename from paddle/ir/core/parser/lexer.h rename to paddle/pir/core/parser/lexer.h index 0561e1f60caa8..24694eb761317 100644 --- a/paddle/ir/core/parser/lexer.h +++ b/paddle/pir/core/parser/lexer.h @@ -16,7 +16,7 @@ #include #include -#include "paddle/ir/core/parser/token.h" +#include "paddle/pir/core/parser/token.h" class Lexer { private: diff --git a/paddle/ir/core/parser/token.h b/paddle/pir/core/parser/token.h similarity index 100% rename from paddle/ir/core/parser/token.h rename to paddle/pir/core/parser/token.h diff --git a/paddle/ir/core/program.cc b/paddle/pir/core/program.cc similarity index 90% rename from paddle/ir/core/program.cc rename to paddle/pir/core/program.cc index baf6a3cbdd57c..d4197a4a9bc4b 100644 --- a/paddle/ir/core/program.cc +++ b/paddle/pir/core/program.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/ir/core/program.h" -#include "paddle/ir/core/ir_context.h" +#include "paddle/pir/core/program.h" +#include "paddle/pir/core/ir_context.h" -namespace ir { +namespace pir { Program::Program(IrContext* context) { module_ = ModuleOp::Create(context, this); @@ -39,4 +39,4 @@ void Program::SetParameter(const std::string& name, parameters_[name].reset(parameter.release()); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/program.h b/paddle/pir/core/program.h similarity index 89% rename from paddle/ir/core/program.h rename to paddle/pir/core/program.h index bf9c37210967e..8756b3aa70e1c 100644 --- a/paddle/ir/core/program.h +++ b/paddle/pir/core/program.h @@ -18,14 +18,14 @@ #include #include -#include "paddle/ir/core/attribute.h" -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/builtin_op.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/parameter.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builtin_attribute.h" +#include "paddle/pir/core/builtin_op.h" +#include "paddle/pir/core/operation.h" +#include "paddle/pir/core/parameter.h" -namespace ir { +namespace pir { class IrContext; /// @@ -75,4 +75,4 @@ class IR_API Program { std::ostream& operator<<(std::ostream& os, const Program& prog); -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/region.cc b/paddle/pir/core/region.cc similarity index 91% rename from paddle/ir/core/region.cc rename to paddle/pir/core/region.cc index e9fdb91758219..0f02e3d19e04e 100644 --- a/paddle/ir/core/region.cc +++ b/paddle/pir/core/region.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
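`Parameter` (parameter.h above) deep-copies the caller's buffer in its constructor, and `Program::SetParameter` takes ownership of the `unique_ptr`, so handing over short-lived stack data is safe. A sketch, where `Float32Type` is assumed from builtin_type.h:

```cpp
#include <memory>
#include <vector>

// Register a weight with a Program; Parameter memcpy's the buffer in
// its constructor, so `data` does not need to outlive this call.
void AddWeight(pir::Program *program, pir::IrContext *ctx) {
  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
  auto param = std::make_unique<pir::Parameter>(
      data.data(), data.size() * sizeof(float), pir::Float32Type::get(ctx));
  program->SetParameter("w0", std::move(param));  // program owns it now
}
```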
-#include "paddle/ir/core/region.h" -#include "paddle/ir/core/block.h" -#include "paddle/ir/core/enforce.h" -#include "paddle/ir/core/operation.h" +#include "paddle/pir/core/region.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/enforce.h" +#include "paddle/pir/core/operation.h" -namespace ir { +namespace pir { Region::~Region() { clear(); } void Region::push_back(Block *block) { insert(blocks_.end(), block); } @@ -61,4 +61,4 @@ IrContext *Region::ir_context() const { IR_ENFORCE(parent_, "Region is not attached to a container."); return parent_->ir_context(); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/region.h b/paddle/pir/core/region.h similarity index 96% rename from paddle/ir/core/region.h rename to paddle/pir/core/region.h index cc1c1ab791df5..06272f82a4378 100644 --- a/paddle/ir/core/region.h +++ b/paddle/pir/core/region.h @@ -17,9 +17,9 @@ #include #include -#include "paddle/ir/core/dll_decl.h" +#include "paddle/pir/core/dll_decl.h" -namespace ir { +namespace pir { class Block; class Operation; @@ -68,4 +68,4 @@ class IR_API Region { Operation *parent_{nullptr}; // not owned std::list blocks_; // owned }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/spin_lock.h b/paddle/pir/core/spin_lock.h similarity index 97% rename from paddle/ir/core/spin_lock.h rename to paddle/pir/core/spin_lock.h index 4150f419c3159..5cba96823a817 100644 --- a/paddle/ir/core/spin_lock.h +++ b/paddle/pir/core/spin_lock.h @@ -23,7 +23,7 @@ #include #include -namespace ir { +namespace pir { static inline void CpuRelax() { #if defined(__PADDLE_x86__) _mm_pause(); @@ -63,4 +63,4 @@ class SpinLock { std::atomic mlock_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/storage_manager.cc b/paddle/pir/core/storage_manager.cc similarity index 86% rename from paddle/ir/core/storage_manager.cc rename to paddle/pir/core/storage_manager.cc index 0dcc7ca0ad855..07cc4e07cce2c 100644 --- a/paddle/ir/core/storage_manager.cc +++ b/paddle/pir/core/storage_manager.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/ir/core/storage_manager.h" +#include "paddle/pir/core/storage_manager.h" #include #include -#include "paddle/ir/core/enforce.h" +#include "paddle/pir/core/enforce.h" -namespace ir { +namespace pir { // This is a structure for creating, caching, and looking up Storage of // parametric types. 
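`SpinLock` (spin_lock.h above) is what the registries in `IrContextImpl` and `StorageManager` guard themselves with throughout this patch, always through `std::lock_guard<pir::SpinLock>`. A minimal sketch, assuming `SpinLock` exposes the usual `lock()`/`unlock()` pair:

```cpp
#include <mutex>

// pir::SpinLock satisfies BasicLockable, so std::lock_guard composes
// with it exactly as it does with std::mutex; this is the guard pattern
// used by the registration and lookup paths above.
pir::SpinLock registry_lock;

void WithRegistryLocked() {
  std::lock_guard<pir::SpinLock> guard(registry_lock);
  // ... read or mutate the shared registry while the lock is held ...
}
```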
struct ParametricStorageManager { @@ -75,9 +75,9 @@ StorageManager::StorageBase *StorageManager::GetParametricStorageImpl( std::size_t hash_value, std::function equal_func, std::function constructor) { - std::lock_guard guard(parametric_instance_lock_); + std::lock_guard guard(parametric_instance_lock_); VLOG(6) << "Try to get a parametric storage of: [TypeId_hash=" - << std::hash()(type_id) << ", param_hash=" << hash_value + << std::hash()(type_id) << ", param_hash=" << hash_value << "]."; if (parametric_instance_.find(type_id) == parametric_instance_.end()) { IR_THROW("The input data pointer is null."); @@ -88,9 +88,9 @@ StorageManager::StorageBase *StorageManager::GetParametricStorageImpl( StorageManager::StorageBase *StorageManager::GetParameterlessStorageImpl( TypeId type_id) { - std::lock_guard guard(parameterless_instance_lock_); + std::lock_guard guard(parameterless_instance_lock_); VLOG(6) << "Try to get a parameterless storage of: [TypeId_hash=" - << std::hash()(type_id) << "]."; + << std::hash()(type_id) << "]."; if (parameterless_instance_.find(type_id) == parameterless_instance_.end()) IR_THROW("TypeId not found in IrContext."); StorageBase *parameterless_instance = parameterless_instance_[type_id]; @@ -99,21 +99,21 @@ StorageManager::StorageBase *StorageManager::GetParameterlessStorageImpl( void StorageManager::RegisterParametricStorageImpl( TypeId type_id, std::function destroy) { - std::lock_guard guard(parametric_instance_lock_); + std::lock_guard guard(parametric_instance_lock_); VLOG(6) << "Register a parametric storage of: [TypeId_hash=" - << std::hash()(type_id) << "]."; + << std::hash()(type_id) << "]."; parametric_instance_.emplace( type_id, std::make_unique(destroy)); } void StorageManager::RegisterParameterlessStorageImpl( TypeId type_id, std::function constructor) { - std::lock_guard guard(parameterless_instance_lock_); + std::lock_guard guard(parameterless_instance_lock_); VLOG(6) << "Register a parameterless storage of: [TypeId_hash=" - << std::hash()(type_id) << "]."; + << std::hash()(type_id) << "]."; if (parameterless_instance_.find(type_id) != parameterless_instance_.end()) IR_THROW("storage class already registered"); parameterless_instance_.emplace(type_id, constructor()); } -} // namespace ir +} // namespace pir diff --git a/paddle/ir/core/storage_manager.h b/paddle/pir/core/storage_manager.h similarity index 96% rename from paddle/ir/core/storage_manager.h rename to paddle/pir/core/storage_manager.h index f2cda194ce215..1853207f5953f 100644 --- a/paddle/ir/core/storage_manager.h +++ b/paddle/pir/core/storage_manager.h @@ -18,10 +18,10 @@ #include #include -#include "paddle/ir/core/spin_lock.h" -#include "paddle/ir/core/type_id.h" +#include "paddle/pir/core/spin_lock.h" +#include "paddle/pir/core/type_id.h" -namespace ir { +namespace pir { /// /// \brief The implementation of the class StorageManager. /// @@ -141,12 +141,12 @@ class IR_API StorageManager { std::unordered_map> parametric_instance_; - ir::SpinLock parametric_instance_lock_; + pir::SpinLock parametric_instance_lock_; // This map is a mapping between type id and parameterless type storage. 
std::unordered_map parameterless_instance_; - ir::SpinLock parameterless_instance_lock_; + pir::SpinLock parameterless_instance_lock_; }; -} // namespace ir +} // namespace pir diff --git a/paddle/pir/core/storage_manager_support.h b/paddle/pir/core/storage_manager_support.h new file mode 100644 index 0000000000000..a54e066a0e2a6 --- /dev/null +++ b/paddle/pir/core/storage_manager_support.h @@ -0,0 +1,106 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pir/core/interface_support.h" +#include "paddle/pir/core/ir_context.h" +#include "paddle/pir/core/type.h" +#include "paddle/pir/core/type_base.h" +#include "paddle/pir/core/type_id.h" + +namespace pir { +template +class TypeInterfaceBase; + +namespace detail { + +namespace storage_helper_base_impl { +/// Returns true if this given Trait ID matches the IDs of any of the provided +/// trait types `Traits`. +template