diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b98a2288682660..08055cd9a5407b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) -cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) +cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass) cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index d3cf3319adfc5e..0bf30c29f32793 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(fc_elementwise_layernorm_fuse_pass, paddle::framework::ir::FCElementwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc new file mode 100644 index 00000000000000..9dcbbb9c9856e5 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
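The capability registration added above gates the fuse pass on operator versions: the pass is declared compatible only when `fc` is at version 0, `elementwise_add` at version 1 or lower, and `layer_norm` at version 0. A toy sketch of the predicate that this EQ/LE combination encodes (a hypothetical stand-in, not the real op_version_registry API):

```cpp
#include <map>
#include <string>

// True when every op version in the model satisfies its comparator,
// mirroring the EQ/LE combination registered above.
bool PassCompatible(const std::map<std::string, int>& op_versions) {
  return op_versions.at("fc") == 0 &&               // .EQ("fc", 0)
         op_versions.at("elementwise_add") <= 1 &&  // .LE("elementwise_add", 1)
         op_versions.at("layer_norm") == 0;         // .EQ("layer_norm", 0)
}
```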
+ +#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AvgShardPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter AvgShardPass::ApplyImpl"; + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + if (ipu_backend->GetIpuStrategy()->need_avg_shard) { + VLOG(10) << "start AvgShardPass"; + auto nodes = ir::TopologySortOperations(*graph); + auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + + int shard_position = nodes.size() / num_ipus; + int index_and_stage = -1; + for (int i = 0; i < nodes.size(); i++) { + if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) { + index_and_stage++; + } + nodes[i]->Op()->SetAttr("ipu_index", index_and_stage); + nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage); + } + VLOG(10) << "end AvgShardPass"; + } + + VLOG(10) << "leave AvgShardPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h new file mode 100644 index 00000000000000..b13acbd198dd52 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AvgShardPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc new file mode 100644 index 00000000000000..5dcfddf6187f2b --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl"; + + std::unordered_map> all_ops{ + {OpRole::kForward, {}}, {OpRole::kBackward, {}}, + {OpRole::kOptimize, {}}, {OpRole::kRPC, {}}, + {OpRole::kDist, {}}, {OpRole::kLRSched, {}}, + {OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}}; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role")); + if (op_role == static_cast(OpRole::kForward)) { + all_ops[OpRole::kForward].insert(node); + } else if (op_role == static_cast(OpRole::kBackward)) { + all_ops[OpRole::kBackward].insert(node); + } else if (op_role == static_cast(OpRole::kOptimize)) { + all_ops[OpRole::kOptimize].insert(node); + } else if (op_role == static_cast(OpRole::kRPC)) { + } else if (op_role == static_cast(OpRole::kDist)) { + } else if (op_role == static_cast(OpRole::kLRSched)) { + } else if (op_role == static_cast(OpRole::kLoss)) { + all_ops[OpRole::kLoss].insert(node); + } else if (op_role == static_cast(OpRole::kNotSpecified)) { + LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified "; + } + } + + std::unordered_set forward_vars; + std::unordered_set backward_vars; + std::unordered_set control_vars; + // forward_vars + for (auto& nodes : std::array, 2>{ + all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) { + for (auto* node : nodes) { + for (auto* in_node : node->inputs) { + forward_vars.insert(in_node); + } + for (auto* out_node : node->outputs) { + forward_vars.insert(out_node); + } + } + } + // control_vars & backward_vars + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + if (node->IsCtrlVar()) { + control_vars.insert(node); + } + for (auto* in_node : node->inputs) { + if (all_ops[OpRole::kOptimize].count(in_node)) { + backward_vars.insert(node); + } + } + } + // all removed node + std::unordered_set rm_nodes; + for (auto* node : graph->Nodes()) { + if (backward_vars.count(node)) { + rm_nodes.insert(node); + } else if (control_vars.count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kBackward].count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kForward].count(node) == 0 && + all_ops[OpRole::kLoss].count(node) == 0 && + forward_vars.count(node) == 0) { + rm_nodes.insert(node); + } else if (node->Name() == "feed" || node->Name() == "fetch") { + rm_nodes.insert(node); + } + } + + VLOG(10) << "Remove Node: "; + for (auto* node : rm_nodes) { + // rm node releations + for (auto* node_in : node->inputs) { + for (size_t i = 0; i < node_in->outputs.size(); ++i) { + if (node_in->outputs[i] == node) { + node_in->outputs.erase(node_in->outputs.begin() + i); + break; + } + } + } + for (auto* node_out : node->outputs) { + for (size_t i = 0; i < node_out->inputs.size(); ++i) { + if (node_out->inputs[i] == node) { + node_out->inputs.erase(node_out->inputs.begin() + i); + break; + } + } + } + VLOG(10) << "\t" << node->Name(); + graph->RemoveNode(node); + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + + VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(forward_graph_extract_pass, + paddle::framework::ir::ForwardGraphExtractPass); diff --git 
a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h new file mode 100644 index 00000000000000..afa9f1c15f2ab8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ForwardGraphExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc new file mode 100644 index 00000000000000..ceef27ac1ce3c0 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
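The relation-removal loop in `forward_graph_extract_pass` above unlinks each doomed node from its neighbors' adjacency vectors by index. A minimal sketch of equivalent unlinking with the erase-remove idiom (toy `Node` type, not the Paddle class):

```cpp
#include <algorithm>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
};

// Detach `node` from every neighbor's adjacency list, as the pass does
// before calling graph->RemoveNode(node).
void Unlink(Node* node) {
  for (Node* in : node->inputs) {
    auto& outs = in->outputs;
    outs.erase(std::remove(outs.begin(), outs.end(), node), outs.end());
  }
  for (Node* out : node->outputs) {
    auto& ins = out->inputs;
    ins.erase(std::remove(ins.begin(), ins.end(), node), ins.end());
  }
}

int main() {
  Node a{"a"}, b{"b"};
  a.outputs.push_back(&b);
  b.inputs.push_back(&a);
  Unlink(&b);
  return a.outputs.empty() ? 0 : 1;  // 0: unlink succeeded
}
```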
+ +#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferShapePass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferShapePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; + + auto feed_list = Get>("feed_list"); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + bool is_feed = std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end(); + if (is_feed) { + auto input_shape = node->Var()->GetShape(); + if (input_shape[0] <= -1) { + input_shape[0] = batch_size; + node->Var()->SetShape(input_shape); + } + // int64->int32 + if (node->Var()->GetDataType() == proto::VarType::INT64) { + node->Var()->SetDataType(proto::VarType::INT32); + } + } + } + + // temp scope for shape inference + std::shared_ptr scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + + auto tensor = ptr->GetMutable(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } + + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + auto op_desc = node->Op(); + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + } + } + } + } + } + // release the temp scope + scope.reset(); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferShapePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass) + .RequirePassAttr("feed_list"); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h new file mode 100644 index 00000000000000..3e8148b7f066d9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferShapePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc new file mode 100644 index 00000000000000..616139a52ac06c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const { + VLOG(10) << "enter InferencePostprocessPass::ApplyImpl"; + + std::vector feed_list; + feed_list = Get>("feed_list"); + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + auto *feed_var = new paddle::framework::VarDesc("feed"); + feed_var->SetType(proto::VarType::FEED_MINIBATCH); + auto *feed_var_node = graph->CreateVarNode(feed_var); + + auto *fetch_var = new paddle::framework::VarDesc("fetch"); + fetch_var->SetType(proto::VarType::FETCH_LIST); + auto *fetch_var_node = graph->CreateVarNode(fetch_var); + + for (int i = 0; i < feed_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == feed_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("feed"); + op->SetInput("X", {"feed"}); + op->SetOutput("Out", {node->Name()}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + node->inputs.push_back(op_node); + op_node->outputs.push_back(node); + feed_var_node->outputs.push_back(op_node); + op_node->inputs.push_back(feed_var_node); + break; + } + } + } + + for (int i = 0; i < fetch_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == fetch_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("fetch"); + op->SetInput("X", {node->Name()}); + op->SetOutput("Out", {"fetch"}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + 
node->outputs.push_back(op_node); + op_node->inputs.push_back(node); + fetch_var_node->inputs.push_back(op_node); + op_node->outputs.push_back(fetch_var_node); + break; + } + } + } + + VLOG(10) << "leave InferencePostprocessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_postprocess_pass, + paddle::framework::ir::InferencePostprocessPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h new file mode 100644 index 00000000000000..e80e1905d4ad79 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferencePostprocessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc new file mode 100644 index 00000000000000..d02dcce0cc62c8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
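A small sketch of the feed-shape normalization step in `infer_shape_pass` above: a dynamic batch dimension (-1) is pinned to the strategy's batch size before shapes are propagated (hypothetical values):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical feed var with a dynamic batch dimension.
  std::vector<long> input_shape = {-1, 3, 224, 224};
  const long batch_size = 8;  // from IpuStrategy in the real pass

  // Same rule as InferShapePass: only the leading (batch) dim is fixed.
  if (input_shape[0] <= -1) {
    input_shape[0] = batch_size;
  }
  for (long d : input_shape) std::printf("%ld ", d);  // prints: 8 3 224 224
  return 0;
}
```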
+ +#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; + + // Get a new instance of ipu_backend + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetNewInstance(); + + // Set scope + auto& scope = graph->Get(kParamScopeAttr); + ipu_backend->SetScope(scope); + + // Set ipu_strategy + static std::shared_ptr ipu_strategy_instance_( + new platform::ipu::IpuStrategy()); + ipu_strategy_instance_->is_training = false; + auto num_ipus = graph->Get("num_ipus"); + ipu_strategy_instance_->num_ipus = num_ipus; + if (num_ipus > 1) { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + + auto enable_pipelining = graph->Get("enable_pipelining"); + ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + if (enable_pipelining) { + auto batches_per_step = graph->Get("batches_per_step"); + PADDLE_ENFORCE_GE( + batches_per_step, num_ipus, + platform::errors::InvalidArgument("Batched per step should be equal or " + "greater than the number of IPUs")); + ipu_strategy_instance_->batches_per_step = batches_per_step; + } + ipu_strategy_instance_->batch_size = graph->Get("batch_size"); + ipu_strategy_instance_->need_avg_shard = graph->Get("need_avg_shard"); + + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); + + // Get feed_list and fetch list + std::vector feed_list = {}; + std::vector fetch_list = {}; + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list.push_back(""); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list.push_back(""); + } + } + } + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->outputs[0]->Name(); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->inputs[0]->Name(); + } + } + } + + // Run passes + std::vector graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass"}; + std::vector compile_pass = { + "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", + "inference_postprocess_pass"}; + for (auto pass_name : graph_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + if (pass_name == "infer_shape_pass") { + pass->Set("feed_list", new std::vector(feed_list.begin(), + feed_list.end())); + } + pass->Apply(graph); + } + + for (auto pass_name : compile_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + pass->Set("feed_list", + new std::vector(feed_list.begin(), feed_list.end())); + pass->Set("fetch_list", new std::vector(fetch_list.begin(), + fetch_list.end())); + pass->Apply(graph); + } + + VLOG(10) << "leave InferenceProcessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
+REGISTER_PASS(inference_process_pass, + paddle::framework::ir::InferenceProcessPass); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h new file mode 100644 index 00000000000000..bac0e88377f7c6 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferenceProcessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc new file mode 100644 index 00000000000000..5a53466089bc88 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
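The feed/fetch collection in `inference_process_pass` above is two-phase: one scan over the graph sizes the lists with placeholders, and a second scan writes each variable name at its `col` index, so list order follows the feed/fetch columns rather than the unordered graph traversal. A minimal sketch of that placement (hypothetical data):

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct FeedOp { int col; std::string var; };  // toy stand-in for a feed op

int main() {
  // Feed ops as they might be discovered in unordered traversal.
  std::vector<FeedOp> found = {{2, "z"}, {0, "x"}, {1, "y"}};

  // Phase 1: size the list; phase 2: place each name by its col attr.
  std::vector<std::string> feed_list(found.size(), "");
  for (const auto& op : found) feed_list[op.col] = op.var;

  for (const auto& n : feed_list) std::printf("%s ", n.c_str());  // x y z
  return 0;
}
```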
+ +#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + ipu_backend->Compile(graph, feed_list, fetch_list); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_graph_builder_pass, + paddle::framework::ir::IpuGraphBuilderPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h new file mode 100644 index 00000000000000..6237df36480335 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuGraphBuilderPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc new file mode 100644 index 00000000000000..d3f1f1633ffc94 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenerateVarName(Node *node) { + return node->Name() + "_" + std::to_string(node->id()); +} + +void IpuInplacePass::ApplyImpl(ir::Graph *graph) const { + // use this pass after forward_graph_extract_pass + // raise error if the inplaced var both in feed_list & fetch_list + VLOG(10) << "enter IpuInplacePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::map var_name; + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name.find(node->Name()) == var_name.end()) { + var_name.emplace(node->Name(), 1); + } else { + var_name[node->Name()]++; + } + } + } + + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name[node->Name()] > 1) { + auto is_feed = (std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end()) && + (node->inputs.size() == 0); + auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(), + node->Name()) != fetch_list.end()) && + (node->outputs.size() == 0); + if (!is_feed && !is_fetch && !node->Var()->Persistable()) { + auto old_name = node->Name(); + auto new_name = GenerateVarName(node); + node->RenameVar(new_name); + for (auto *op_in : node->inputs) { + op_in->Op()->RenameOutput(old_name, new_name); + } + for (auto *op_out : node->outputs) { + op_out->Op()->RenameInput(old_name, new_name); + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuInplacePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h new file mode 100644 index 00000000000000..86756276c8c3dc --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuInplacePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc new file mode 100644 index 00000000000000..ba9233eeb8cb95 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IPUPassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.h b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h new file mode 100644 index 00000000000000..b56d3e4c65b1c0 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IPUPassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const; + virtual ~IPUPassBase() {} + + protected: + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc new file mode 100644 index 00000000000000..a3e020714e1db9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
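A compact sketch of the de-duplication rule in `ipu_inplace_pass` above: when several var nodes share a name, each non-feed, non-fetch, non-persistable node is renamed to `name_id` (the `GenerateVarName` rule), and its producer/consumer ops are updated to match. Toy types and hypothetical names:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical var nodes as (name, node id): "tmp" is inplaced twice.
  std::vector<std::pair<std::string, int>> var_nodes = {
      {"tmp", 3}, {"w", 4}, {"tmp", 7}};

  // First loop of the pass: count occurrences per name.
  std::map<std::string, int> var_name;
  for (const auto& n : var_nodes) ++var_name[n.first];

  // Second loop: rename duplicates to name_id, as GenerateVarName does.
  for (auto& n : var_nodes) {
    if (var_name[n.first] > 1) {
      n.first += "_" + std::to_string(n.second);
    }
    std::printf("%s\n", n.first.c_str());  // tmp_3, w, tmp_7
  }
  return 0;
}
```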
+ +#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + framework::OpDesc ipu_rt_op_desc; + ipu_rt_op_desc.SetType("ipu_runtime"); + ipu_rt_op_desc.SetInput("FeedList", feed_list); + ipu_rt_op_desc.SetOutput("FetchList", fetch_list); + ipu_rt_op_desc.Flush(); + + // Create a new node for the ipu_runtime_op. + auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); + + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto feed : feed_list) { + if (node->Name() == feed) { + IR_NODE_LINK_TO(node, ipu_rt_node); + } + } + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + IR_NODE_LINK_TO(ipu_rt_node, node); + } + } + } + } + + // set ipu_runtime_op dtype attr + if (fetch_list.size() == 1) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); + } + } + } + } + } + + // Remove unneeded nodes. + std::unordered_set marked_nodes; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() != "ipu_runtime") { + marked_nodes.insert(node); + } + } + } + + GraphSafeRemoveNodes(graph, marked_nodes); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_runtime_replacer_pass, + paddle::framework::ir::IpuRuntimeReplacerPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h new file mode 100644 index 00000000000000..ba2cc8702fa473 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuRuntimeReplacerPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc new file mode 100644 index 00000000000000..c6be2c775bd211 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + // a graph usually has multiple optimizer nodes for different parameters, + // and these nodes usually share the same type and attr values + if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) { + ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); + VLOG(10) << "found optimizer type: " << node->Op()->Type(); + + for (const std::string& attr_name : node->Op()->AttrNames()) { + auto attr_type = node->Op()->GetAttrType(attr_name); + // with adam, attrs are floats + if (attr_type == proto::AttrType::FLOAT) { + auto attr_value = + BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); + ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); + } else { + VLOG(10) << "Skip " << attr_type; + } + } + + auto lr_var_name = node->Op()->Input("LearningRate"); + PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find input(LearningRate) failed.", + node->Op()->Type())); + + ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + } + + if ((op_role == static_cast<int>(framework::OpRole::kLoss))) { + VLOG(10) << "found loss op type: " << node->Op()->Type(); + auto outputs = node->Op()->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + + auto losses_name = outputs.begin()->second; + PADDLE_ENFORCE_EQ(losses_name.size(), 1, + platform::errors::InvalidArgument( + "Can only support one loss name")); + + ipu_backend->GetExecutor().SetLoss(losses_name[0]); + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10)
<< DebugString(graph); + VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_extract_pass, + paddle::framework::ir::IpuOptimizerExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h new file mode 100644 index 00000000000000..fd274ded8f5bd1 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuOptimizerExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc new file mode 100644 index 00000000000000..c23bfdcb154f16 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
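A minimal sketch of the attribute filtering in `optimizer_extract_pass` above: only float-typed optimizer attributes are forwarded to the IPU executor, everything else is skipped (toy attribute store, hypothetical values):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <variant>

int main() {
  // Hypothetical adam attrs: floats are extracted, other types skipped.
  std::map<std::string, std::variant<float, int, bool>> attrs = {
      {"beta1", 0.9f},      {"beta2", 0.999f}, {"epsilon", 1e-8f},
      {"lazy_mode", false}, {"op_role", 2}};

  for (const auto& [name, value] : attrs) {
    if (std::holds_alternative<float>(value)) {
      // Mirrors SetOptimizerAttr(attr_name, attr_value) in the pass.
      std::printf("set %s = %g\n", name.c_str(), std::get<float>(value));
    }
  }
  return 0;
}
```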
+ +#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +using paddle::platform::ipu::IpuBackend; +using framework::ir::Graph; +using framework::ir::Node; + +void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = IpuBackend::GetInstance(); + const auto* scope_ = ipu_backend->GetScope(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + if ((op_role == static_cast(framework::OpRole::kOptimize))) { + auto inputs = node->Op()->Inputs(); + if (inputs.count(platform::ipu::sBeta1Pow)) { + auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]); + auto data = var->GetMutable()->data(); + auto beta = BOOST_GET_CONST( + float, node->Op()->GetAttr(platform::ipu::sBeta1)); + + // ensure current save with beta1pow, rather than step. + // beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart + // support single Step__ + bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f); + float step = 0; + float beta_acc = beta; + while (beta_acc > data[0] && save_with_beta1pow) { + beta_acc *= beta; + step += 1; + } + + if (save_with_beta1pow) { + data[0] = step; + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_state_align_pass, + paddle::framework::ir::IpuOptimizerStateAlignPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h new file mode 100644 index 00000000000000..21a1017d88452a --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * This pass should only affect optimizer that need bias correction, + * include Adam/Lamb. 
+ */ + +class IpuOptimizerStateAlignPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc new file mode 100644 index 00000000000000..c97b7fd5bcb0cb --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h" + +namespace paddle { +namespace framework { +namespace ir { + +using framework::ir::Graph; +using framework::ir::Node; +using platform::ipu::SymbolHandler; + +void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto nodes = graph->Nodes(); + for (auto* node : nodes) { + if (!node->IsOp()) { + continue; + } + auto* op = node->Op(); + auto op_type = op->Type(); + + ir::Node* new_node = nullptr; + SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (handler) { + VLOG(11) << "Raw Paddle Node:"; + VLOG(11) << node->Op()->Proto()->DebugString(); + new_node = handler(graph, node); + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } else { + LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(popart_canonicalization_pass, + paddle::framework::ir::PopartCanonicalizationPass); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h new file mode 100644 index 00000000000000..6690873f2a9ac2 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PopartCanonicalizationPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 523c2161326466..f3d96c38506564 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( LOG(WARNING) << "Pass in op compat failed."; return; } + const int kNumFields = 5; const int kTransOffset = 1; const int kTransOutOffset = 2; const int kFlattenOffset = 3; const int kFlattenOutOffset = 4; - std::vector<Node*> nodes; + std::vector<Node*> nodes; + std::vector<int> trans_axis0; + int flatten_axis0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), @@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( platform::errors::NotFound("Can not find %s in subgraph.", input_nodes[i]->name())); + if (i == 0) { + trans_axis0 = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + flatten_axis0 = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + } else { + std::vector<int> trans_axis = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All transpose axes should be the same + if (trans_axis0 != trans_axis) return; + + int flatten_axis = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All flatten axes should be the same + if (flatten_axis0 != flatten_axis) return; + } + nodes.push_back(subgraph.at(input_nodes[i])); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index b13166cff60aa3..8d2ee2f01008bf 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce) cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) -cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) +cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context) if (WITH_TESTING) cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index
368fb4a5fd8c96..0e157ae7d79f38 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -29,55 +29,32 @@ namespace paddle { namespace framework { namespace paddle2cinn { +using GraphHashStrategy = CinnCacheKey::GraphHashStrategy; + +CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) {} + CinnCacheKey::CinnCacheKey( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str) { + const std::string& arch_str, GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_tensors, arch_str); } CinnCacheKey::CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str) { + const std::string& arch_str, + GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_shapes, arch_str); } -size_t CinnCacheKey::HashGraph(const ir::Graph& graph) { - // using Dot to unqiue graph - inference::analysis::Dot dot; - std::unordered_map<const ir::Node*, std::string> node2dot; - int id = 0; - // Create nodes - // graph.Nodes() return unordered_set, the same graph may - // return different result? - for (const ir::Node* n : graph.Nodes()) { - std::string node_id = std::to_string(id++); - dot.AddNode(node_id, {}, n->Name(), true); - node2dot[n] = node_id; - } - - // Create edges - for (const ir::Node* n : graph.Nodes()) { - const auto& src_id = node2dot.at(n); - for (auto* out : n->outputs) { - const auto& dest_id = node2dot.at(out); - dot.AddEdge(src_id, dest_id, {}); - } - } - - const std::string& viz_graph = dot.Build(); - VLOG(1) << "The hash graph:\n" << viz_graph; - - size_t hash_val = std::hash<std::string>()(viz_graph); - VLOG(4) << "The graph's hash value is: " << hash_val; - return hash_val; -} - void CinnCacheKey::SetKey( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); for (const auto& name_tensor : input_tensors) { input_shapes_[name_tensor.first] = name_tensor.second->dims(); } @@ -87,7 +64,7 @@ void CinnCacheKey::SetKey( void CinnCacheKey::SetKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); input_shapes_ = input_shapes; arch_str_ = arch_str; } @@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { } bool CinnCacheKey::operator==(const CinnCacheKey& other) const { - return graph_serialize_str_ == other.graph_serialize_str_ && + return graph_hash_val_ == other.graph_hash_val_ && input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } @@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); } - ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = hash_combine(ret, key.graph_hash_val_); ret = hash_combine(ret, string_hasher(key.arch_str_)); return ret; } +size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { + // sort graph nodes by name and id. + auto compare = [](ir::Node* n1, ir::Node* n2) { + return (n1->Name() == n2->Name()) ?
(n1->id() < n2->id()) + : (n1->Name() < n2->Name()); + }; + + // graph.Nodes() returns an unordered_set; iterate over a sorted set here + // so that the same graph always produces the same hash string + std::set<ir::Node *, decltype(compare)> node_set(compare), + output_set(compare); + node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); + + std::string hash_str; + for (ir::Node* n : node_set) { + hash_str.append(n->Name()); + + output_set.clear(); + output_set.insert(n->outputs.begin(), n->outputs.end()); + for (auto* out : output_set) { + hash_str.append(out->Name()); + } + } + + VLOG(1) << "The hash graph:\n" << hash_str; + + size_t hash_val = std::hash<std::string>()(hash_str); + VLOG(4) << "The graph's hash value by graph structure is: " << hash_val; + return hash_val; +} + +size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) { + size_t hash_val = reinterpret_cast<size_t>(&graph); + VLOG(4) << "The graph's hash value by graph address is: " << hash_val; + return hash_val; +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 941f8e0cdecc18..67325297c47724 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -14,6 +14,7 @@ #pragma once +#include <functional> #include <map> #include "paddle/fluid/framework/ddim.h" @@ -33,14 +34,18 @@ namespace paddle2cinn { // shapes. class CinnCacheKey { public: + using GraphHashStrategy = std::function<size_t(const ir::Graph&)>; + + explicit CinnCacheKey(GraphHashStrategy graph_hash); + CinnCacheKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); - ~CinnCacheKey() {} + ~CinnCacheKey() = default; void SetKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, @@ -58,13 +63,38 @@ class CinnCacheKey { }; private: - size_t HashGraph(const ir::Graph& graph); - - std::string graph_serialize_str_; + GraphHashStrategy graph_hash_; + size_t graph_hash_val_; std::map<std::string, DDim> input_shapes_; std::string arch_str_; }; +#define CINN_CACHE_KEY_CREATE(NAME) \ + class NAME : public CinnCacheKey { \ + public: \ + NAME() : CinnCacheKey(HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, const LoDTensor*>& input_tensors, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, DDim>& input_shapes, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \ + \ + private: \ + static size_t HashGraph(const ir::Graph& graph); \ + }; + +// Class to store the keys by graph address for compiling CINN. +CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) +// Class to store the keys by graph structure for compiling CINN.
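+// (Both key classes below are generated by the CINN_CACHE_KEY_CREATE macro +// above; they differ only in their HashGraph strategy, while equality and +// Hash behavior is inherited from CinnCacheKey.)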
+CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure) + +#undef CINN_CACHE_KEY_CREATE + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index f13f44998211f4..f9b48ef4b5ec0c 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -26,8 +26,8 @@ namespace paddle { namespace framework { namespace paddle2cinn { -TEST(CinnCacheKeyTest, TestAsUnorderedKey) { - std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) { + std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; ProgramDesc empty_program; ir::Graph empty_graph(empty_program); @@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { DDim ddim = paddle::framework::make_ddim({1, 2, 3}); std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; - CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); - CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86"); EXPECT_EQ(cache_key0, cache_key1); - CinnCacheKey cache_key2(graph, feed_shapes, "x86"); - CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); - CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu"); EXPECT_NE(cache_key2, cache_key3); EXPECT_EQ(cache_key3, cache_key4); - CinnCacheKey cache_key5(empty_graph, - std::map<std::string, const LoDTensor*>(), "unk"); - CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk"); + CinnCacheKeyByStructure cache_key5( + empty_graph, std::map<std::string, const LoDTensor*>(), "unk"); + CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(), + "unk"); EXPECT_EQ(cache_key5, cache_key6); EXPECT_NE(cache_key1, cache_key3); @@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { EXPECT_EQ(test_set.find(cache_key6), test_set.end()); } +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) { + std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map<std::string, const LoDTensor *> feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; + + CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKeyByAddress cache_key5( + empty_graph, std::map<std::string, const LoDTensor *>(), "unk"); + CinnCacheKeyByAddress cache_key6(empty_graph, std::map<std::string, DDim>(), + "unk"); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2,
cache_key6); + + test_set.insert(cache_key0); + test_set.insert(cache_key1); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key0); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +TEST(CinnCacheKeyTest, TestSameGraph) { + ProgramDesc program1; + auto *global_block1 = program1.MutableBlock(0); + auto *x1 = global_block1->Var("X"); + x1->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph1(program1); + + ProgramDesc program2; + auto *global_block2 = program2.MutableBlock(0); + auto *x2 = global_block2->Var("X"); + x2->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph2(program2); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map<std::string, const LoDTensor *> feed_tensors = { + {"X", tensor_pointer}}; + + CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86"); + EXPECT_NE(cache_key_by_address1, cache_key_by_address2); + + CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86"); + EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2); +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 7fc8eff3d31c9e..54167d95899d6f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -41,6 +41,7 @@ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" @@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile( const std::map<std::string, const LoDTensor*>& input_tensors, const Target& target, void* stream) { VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); - CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors, + target.arch_str()); + CinnCacheKeyByStructure cur_key_by_struct; + bool exist = false; { AutoRDLock r_guard{&rwlock_}; - exist = cache_.count(cur_key) != 0; + exist = cache_by_address_.count(cur_key_by_address) != 0; + // if the graph cannot be found by address, check whether its structure + // has been stored in the cache. + if (!exist) { + // generate the structure cache key + cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str()); + + // if the graph structure can be found, store the graph address in the + // cache for the next query.
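+ // (the address-keyed cache stores raw pointers owned by the + // structure-keyed cache, so later lookups of the same graph object + // take the cheaper address fast path)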
+ if (cache_by_struct_.count(cur_key_by_struct) != 0) { + exist = true; + cache_by_address_[cur_key_by_address] = + cache_by_struct_.at(cur_key_by_struct).get(); + } + } } if (!exist) { std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); AutoWRLock w_guard{&rwlock_}; - if (!cache_.count(cur_key)) { - cache_[cur_key] = std::move(compiled_res); + if (!cache_by_struct_.count(cur_key_by_struct)) { + cache_by_address_[cur_key_by_address] = compiled_res.get(); + cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_[cur_key]; + const auto& cached_obj = *cache_by_address_[cur_key_by_address]; return cached_obj; } @@ -181,7 +200,8 @@ void CinnCompiler::Clear() { { AutoWRLock guard{&rwlock_}; graphs_.clear(); - cache_.clear(); + cache_by_address_.clear(); + cache_by_struct_.clear(); } real_compiled_num_.store(0); } @@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; + compiled_obj->launch_context = + std::make_unique<operators::details::CinnLaunchContext>( + compiled_obj->paddle2cinn_varmap, compiled_obj->scope); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 71119acf1fb49e..3bc60e55557070 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -31,6 +31,13 @@ #include "paddle/fluid/platform/macros.h" namespace paddle { + +namespace operators { +namespace details { +class CinnLaunchContext; +} +} + namespace framework { namespace paddle2cinn { @@ -39,6 +46,7 @@ struct CinnCompiledObject { std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map<std::string, std::string> paddle2cinn_varmap; + std::unique_ptr<operators::details::CinnLaunchContext> launch_context; }; // Entrance to use CINN. @@ -87,9 +95,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_; - std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>, + std::unordered_map<CinnCacheKey, CinnCompiledObject*, CinnCacheKey::Hash> - cache_; + cache_by_address_; + std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash> + cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; mutable RWLock rwlock_; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 55254c65fad598..9521df651f9de9 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License.
*/ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::string KernelSignatureToString(const KernelSignature& signature) { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8875ef74bce14e..54f46e49c4f730 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -487,6 +487,14 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 90a4ca73399cf5..f6337a754f91ae 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -32,7 +32,30 @@ CinnLaunchContext::CinnLaunchContext( [](const auto& name_view) { return std::string(name_view.data()); }); } -bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) { +void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place) { + if (std::addressof(scope) == cached_scope_ && + std::addressof(place) == cached_place_) { + VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_ + << " are not changed"; + return; + } + VLOG(4) << "Captured env is updated, scope:" << cached_scope_ << "->" + << std::addressof(scope) << ", place:" << cached_place_ << "->" + << std::addressof(place); + cached_scope_ = std::addressof(scope); + cached_place_ = std::addressof(place); + cached_temp_scope_ = scope.NewTmpScope(); +} + +bool CinnLaunchContext::IsArgumentsInitialized() const { + if (hold_buffers_.empty() || name2argument_.empty()) { + return false; + } + return true; +} + +bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) const { return paddle2cinn_varmap_.count(paddle_name) > 0 && cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0; } @@ -67,85 +90,83 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, // TODO(CtfGo): check the underlying data type after CINN ready } -void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { +void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, platform::errors::InvalidArgument( "Paddle variable(%s) not used by cinn", paddle_name)); const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); + const auto& paddle_tensor = + cached_scope_->GetVar(paddle_name)->Get<LoDTensor>(); CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { -
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + if (paddle_tensor.IsInitialized()) { + CheckTensorEquivalent(paddle_name, paddle_tensor, cinn_tensor); } - CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ false, - paddle_tensor); + + auto cinn_buffer = std::make_unique<cinn_buffer_t>(); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>( + [this, paddle_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_scope_->GetVar(paddle_name)->GetMutable<LoDTensor>(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast<uint8_t*>( + tensor->mutable_data<float>(*cached_place_)); + return 0; + }); + + // external variables will be recycled by global gc, so do nothing here + cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); + + return SetArgument(cinn_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { +void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name) { PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, platform::errors::InvalidArgument( "Variable(%s) not found in cinn scope.", cinn_name)); CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); - } - CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ true, - paddle_tensor); -} - -std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { - // convert paddle dimensions array to cinn format - std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size()); - for (auto i = 0; i < tensor->dims().size(); ++i) { - cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i)); - } - auto cinn_buffer = std::make_unique<cinn_buffer_t>(); - // assign size and memory - cinn_buffer->resize(cinn_dims.data(), cinn_dims.size()); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>( - [place, tensor](void* ctx, cinn_buffer_t* buffer) { - buffer->memory = - reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place)); + [this, cinn_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->Var(cinn_name)->GetMutable<LoDTensor>(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast<uint8_t*>( + tensor->mutable_data<float>(*cached_place_)); return 0; }); - if (free_mem_callback) { - cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( - [tensor](void* ctx, cinn_buffer_t* buffer) { - tensor->clear(); - return 0; - }); - return cinn_buffer; - } - + // internal variables should release their buffers immediately + // once no instruction uses them cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( - [](void* ctx, cinn_buffer_t* buffer) { - // Do nothing + [this, cinn_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->GetVar(cinn_name)->GetMutable<LoDTensor>(); + tensor->clear(); return 0; }); - return cinn_buffer; + return SetArgument(cinn_name,
std::move(cinn_buffer)); } void CinnLaunchContext::SetArgument(const std::string& cinn_name, - const platform::Place& place, - bool free_mem_callback, - LoDTensor* paddle_tensor) { - auto buffer = - ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); + std::unique_ptr<cinn_buffer_t>&& buffer) { + VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" << cinn_name + << "), dims(" << framework::DDim(buffer->dims, buffer->dimensions) + << ")."; + name2argument_.emplace(cinn_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() - << ")."; } const std::map<std::string, cinn_pod_value_t>& diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index c990255d68253d..7bf70c9b9d0227 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -24,7 +24,7 @@ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace operators { @@ -40,16 +40,22 @@ class CinnLaunchContext { const std::unordered_map<std::string, std::string>& paddle2cinn_varmap, const std::shared_ptr<CinnScope>& cinn_scope); + // Explicitly update the environment variables captured + // by the callbacks of the execution arguments + void UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place); + + // Return whether the execution arguments have been initialized + bool IsArgumentsInitialized() const; + // Return whether a Paddle variable is used by the compiled kernels - bool IsVariableUsed(const std::string& var_name); + bool IsVariableUsed(const std::string& paddle_name) const; // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignExternalVariable(const std::string& paddle_name); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignInternalVariable(const std::string& cinn_name); // Extract internal variable names from CinnScope // by excluding used input and output variables @@ -58,10 +64,6 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const; - std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() { - return std::move(hold_buffers_); - } - private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -72,16 +74,15 @@ class CinnLaunchContext { const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor); - // Share the buffer of a Paddle tensor to CINN by delivering memory address - // to a cinn_buffer_t object - std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); - - // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, const platform::Place& place, - bool free_mem_callback, LoDTensor* paddle_tensor); + // Set an argument with (cinn name)->(cinn_buffer_t) pair + void SetArgument(const std::string& cinn_name, + std::unique_ptr<cinn_buffer_t>&& buffer); private: + const framework::Scope* cached_scope_ = nullptr; + const platform::Place* cached_place_ = nullptr; + std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr; + + // a
variable name map from paddle to cinn const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_; // the variable scope of cinn diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index d922e8355b44c5..da7640c3c0f682 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() { return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope); } -TEST(CinnLaunchContextTest, TestIsVariableUsed) { +TEST(CinnLaunchContextTest, TestBasic) { auto launch_context = CreateDefaultLaunchContext(); - + // test IsVariableUsed ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); -} - -TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { - auto launch_context = CreateDefaultLaunchContext(); - auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 3); - EXPECT_NE(internal_variable_names.find("cinn_var2"), - internal_variable_names.end()); + // test UpdateCapturedEnv + platform::CPUPlace place; + framework::Scope scope; + ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); + // test IsArgumentsInitialized + ASSERT_FALSE(launch_context->IsArgumentsInitialized()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1"), paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4"), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW( - launch_context->AssignExternalVariable("cinn_var4", place, tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { + platform::CPUPlace cpu_place; + platform::Place place(cpu_place); + framework::Scope scope; auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); - platform::CPUPlace place; - framework::Scope scope; + // assign external variables auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>(); float* data1 = tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - // assign external variable - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var1", place, tensor1)); - auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>(); - tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place); -
ASSERT_NO_THROW( - launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); - // FinalizeArguments not missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>(); tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var3", place, tensor3)); + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); + + // FinalizeArguments should throw while internal variables are still missing + ASSERT_THROW(launch_context->FinalizeArguments(), + paddle::platform::EnforceNotMet); + // test get internal variables + auto internal_variable_names = launch_context->GetInternalVariableNames(); + ASSERT_EQ(internal_variable_names.size(), 1); + EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); + auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>(); + tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place); + ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); + + // check argument is set correctly and alloc/free callbacks work well auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); ASSERT_EQ(name2argument.count("cinn_var1"), 1); - // check ShareTensorWithCinnBuffer + ASSERT_TRUE(launch_context->IsArgumentsInitialized()); + auto* cinn_buffer = static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1")); - ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 813e7b1152f87e..ea36a19202ef06 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -31,26 +31,6 @@ namespace operators { namespace details { #ifdef PADDLE_WITH_CUDA -void CUDART_CB ReleaseScope(void* data) { - auto* temp_scope = static_cast<framework::Scope*>(data); - delete temp_scope; -} - -void CUDART_CB ReleaseBuffers(void* data) { - auto* buffers = - static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data); - delete buffers; -} - -template <> -void ReleaseResource<platform::CUDADeviceContext>( - const std::vector<void*>& resources, void* stream) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast<cudaStream_t>(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast<cudaStream_t>(stream), ReleaseBuffers, resources[1])); -} - template <> void* GetStream<platform::CUDADeviceContext>( const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 3a272916332bea..8a5ca166941df0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags(); -template <typename DeviceContext> -void ReleaseResource(const std::vector<void*>& resources, void* stream) { - auto* temp_scope = static_cast<framework::Scope*>(resources[0]); - auto* buffers = - static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]); - delete temp_scope; - delete buffers; -} - template <typename DeviceContext> void* GetStream(const framework::ExecutionContext& ctx) { return nullptr; } #ifdef PADDLE_WITH_CUDA -template <> -void ReleaseResource<platform::CUDADeviceContext>( - const std::vector<void*>& resources, void* stream); - template <> void* GetStream<platform::CUDADeviceContext>( const framework::ExecutionContext& ctx); @@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel { compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto launch_context = std::make_unique<details::CinnLaunchContext>( - cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope); - + const auto& launch_context = cinn_compiled_object.launch_context; // Step 3. Prepare arguments needed for the compiled executable program. - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_variable_names) { - if (!launch_context->IsVariableUsed(var_name)) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; - continue; + launch_context->UpdateCapturedEnv(scope, place); + if (!launch_context->IsArgumentsInitialized()) { + VLOG(4) << "CinnLaunchOp prepare arguments"; + + // 3.1 Prepare input variables: tensors of input variables have + // been initialized before the graph is compiled, so just check the + // equality between the tensors of paddle and cinn. + for (const auto& var_name : input_variable_names) { + if (!launch_context->IsVariableUsed(var_name)) { + // some input variables are not needed by cinn because they are + // eliminated by optimization passes or some cinn operators use + // fewer variables + VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; + continue; + } + + launch_context->AssignExternalVariable(var_name); } - launch_context->AssignExternalVariable( - var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>()); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>(); - launch_context->AssignExternalVariable(var_name, place, tensor); - } + // 3.2 Prepare output variables: all output variables should + // be initialized and have buffers allocated before + // the runtime program starts execution; the compilation result + // includes details of their buffer assignment and we use that to + // allocate space in Paddle.
For those variables already allocated, + // like persistable parameters, just check the equality between + // the Paddle allocation and the CINN buffer assignment. + auto output_variable_names = ctx.OutputNames(kOutputs); + for (const auto& var_name : output_variable_names) { + PADDLE_ENFORCE_EQ( + launch_context->IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Output variable(%s) not used by cinn", var_name)); + + launch_context->AssignExternalVariable(var_name); + } - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->GetInternalVariableNames(); - framework::Scope* temp_scope = scope.NewTmpScope().release(); - for (const auto& var_name : internal_variable_names) { - auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>(); - launch_context->AssignInternalVariable(var_name, place, tensor); + // 3.3 Prepare internal or temporary variables: Create a temporary + // scope to keep the internal variables within the graph, as well as + // the temporary variables needed by the compiled runtime program. + // Here we directly use the names from CinnScope as Paddle variable + // names, because they will not be used outside the graph + // and should be destructed after the computation finishes. + auto internal_variable_names = launch_context->GetInternalVariableNames(); + for (const auto& var_name : internal_variable_names) { + launch_context->AssignInternalVariable(var_name); + } } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. @@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel { VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); VLOG(4) << "CinnLaunchOp launch execution done."; - - // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
- auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{ - launch_context->HandoverBuffers()}; - details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder}, - stream); } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 02373c38184fca..e10fdf522ff7c5 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { scope.Var(test_out_name)->GetMutable<LoDTensor>(); scope.Var(expected_out_name)->GetMutable<LoDTensor>(); - cinn_launch_op->Run(scope, place); - elementwise_add_op->Run(scope, place); + platform::Place run_place(place); + cinn_launch_op->Run(scope, run_place); + elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(), diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ed06fac298a8fd..155eb1ebbe3db3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp { const framework::ExecutionContext &ctx) const override { auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor"); if (multi_inputs.size() > 0) { - return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"}, + return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"}, {}, {"Out"}); } else if (ctx.HasInput("Shape")) { - return framework::KernelSignature("reshape.host", {"X", "Shape"}, {}, + return framework::KernelSignature("reshape_host", {"X", "Shape"}, {}, {"Out"}); } else { return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index d443e78ed874f3..3e070da546b2ae 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); } +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast<float16 *>(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exist in the lower or higher 16 bits // of the 32-bit address.
@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif +#endif CUDA_ATOMIC_WRAPPER(Add, complex<float>) { float *real = reinterpret_cast<float *>(address); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 25629ba74d9152..9be12cbf6d4376 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,5 +1,5 @@ -# IPU IF(WITH_IPU) + FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) cc_library(ipu_device SRCS device.cc DEPS enforce popart) cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc new file mode 100644 index 00000000000000..5793c4c0e3ca69 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) { + auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)}, + node->outputs); + return new_node; +} + +Node *relu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_relu"); +} + +Node *tanh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tanh"); +} + +Node *log_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_log"); +} + +Node *sigmoid_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sigmoid"); +} + +Node *sqrt_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sqrt"); +} + +Node *gelu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_gelu_v2"); +} + +Node *log_softmax_handler(Graph *graph, Node *node) { + auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis")); + auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis); + return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs, + node->outputs); +} + +REGISTER_HANDLER(relu, relu_handler); +REGISTER_HANDLER(tanh, tanh_handler); +REGISTER_HANDLER(log, log_handler); +REGISTER_HANDLER(sigmoid, sigmoid_handler); +REGISTER_HANDLER(sqrt, sqrt_handler); +REGISTER_HANDLER(gelu, gelu_handler); +REGISTER_HANDLER(log_softmax, log_softmax_handler); + +} // namespace
+} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc new file mode 100644 index 00000000000000..d46fc55ec6ce0d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// This avoids the static initialisation order fiasco. +std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() { + static std::unordered_map<std::string, SymbolHandler> symbol_handlers; + return symbol_handlers; +} + +bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { + if (SymbolHandlers().count(symbol) != 0) { + LOG(WARNING) << "Trying to register popart handler twice for operator: " + << symbol; + return false; + } + bool new_handler = SymbolHandlers().emplace(symbol, handler).second; + return new_handler; +} + +// Return the handler if one is registered for this kind of node, or +// an empty std::function otherwise.
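+// (Handlers are added to SymbolHandlers() during static initialisation +// via the REGISTER_HANDLER macro declared in canonicalization_utils.h.)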
+SymbolHandler GetHandler(const std::string &kind) { + auto it = SymbolHandlers().find(kind); + if (it != SymbolHandlers().end()) { + return it->second; + } + return {}; +} + +void ConnectNodes(Node *first_node, Node *next_node) { + first_node->outputs.push_back(next_node); + next_node->inputs.push_back(first_node); +} + +void DisConnectNodes(Node *first_node, Node *next_node) { + auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + rm_by_value(first_node->outputs, next_node); + rm_by_value(next_node->inputs, first_node); + rm_by_value(first_node->inputs, next_node); + rm_by_value(next_node->outputs, first_node); +} + +void ClearNode(Node *node) { + auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + for (auto *node_in : node->inputs) { + rm_by_value(node_in->outputs, node); + } + for (auto *node_out : node->outputs) { + rm_by_value(node_out->inputs, node); + } +} + +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override) { + if (new_op->HasAttr(attr_name) && !override) { + return; + } + if (op->HasAttr(attr_name)) { + VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type() + << " to " << new_op->Type(); + new_op->SetAttr(attr_name, op->GetAttr(attr_name)); + new_op->Flush(); + } +} + +const int VarType2OnnxDtype(const int type) { + auto dtype = static_cast<framework::proto::VarType::Type>(type); + switch (dtype) { + case framework::proto::VarType::BOOL: + return static_cast<int>(ONNXDataType::BOOL); + case framework::proto::VarType::INT16: + return static_cast<int>(ONNXDataType::INT16); + case framework::proto::VarType::INT32: + return static_cast<int>(ONNXDataType::INT32); + case framework::proto::VarType::INT64: + return static_cast<int>(ONNXDataType::INT64); + case framework::proto::VarType::FP16: + return static_cast<int>(ONNXDataType::FLOAT16); + case framework::proto::VarType::FP32: + return static_cast<int>(ONNXDataType::FLOAT); + case framework::proto::VarType::FP64: + return static_cast<int>(ONNXDataType::DOUBLE); + case framework::proto::VarType::UINT8: + return static_cast<int>(ONNXDataType::UINT8); + case framework::proto::VarType::INT8: + return static_cast<int>(ONNXDataType::INT8); + case framework::proto::VarType::BF16: + return static_cast<int>(ONNXDataType::BFLOAT16); + case framework::proto::VarType::COMPLEX64: + return static_cast<int>(ONNXDataType::COMPLEX64); + case framework::proto::VarType::COMPLEX128: + return static_cast<int>(ONNXDataType::COMPLEX128); + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); + } +} + +const std::string VarType2PopStr(const int type) { + auto dtype = static_cast<framework::proto::VarType::Type>(type); + switch (dtype) { + case framework::proto::VarType::UINT8: + return "UINT8"; + case framework::proto::VarType::INT8: + return "INT8"; + case framework::proto::VarType::INT16: + return "INT16"; + case framework::proto::VarType::INT32: + return "INT32"; + case framework::proto::VarType::INT64: + return "INT64"; + case framework::proto::VarType::BOOL: + return "BOOL"; + case framework::proto::VarType::FP64: + return "DOUBLE"; + case framework::proto::VarType::FP32: + return "FLOAT"; + case framework::proto::VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW( + paddle::platform::errors::Unavailable("Unsupported data type.")); + } +} + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Input(input_name).at(id); + return
GetInputVarNodeByVarName(var_name, op_node); +} + +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Output(output_name).at(id); + return GetOutputVarNodeByVarName(var_name, op_node); +} + +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->inputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->outputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +const bool is_float_equal(float a, float b, float eps) { + return std::fabs(a - b) <= eps; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h new file mode 100644 index 00000000000000..c1b2bd0c8b5fd4 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::OpDesc; + +#define REGISTER_HANDLER(name, func) \ + static bool __UNUSED_##name = \ + paddle::platform::ipu::RegisterHandler(#name, func) + +using SymbolHandler = std::function<Node *(Graph *, Node *)>; + +std::unordered_map<std::string, SymbolHandler> &SymbolHandlers(); + +bool RegisterHandler(const std::string &, const SymbolHandler &); + +SymbolHandler GetHandler(const std::string &); + +void ConnectNodes(Node *first_node, Node *next_node); +void DisConnectNodes(Node *first_node, Node *next_node); +void ClearNode(Node *node); +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override = false); + +const int VarType2OnnxDtype(const int type); +const std::string VarType2PopStr(const int type); + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id = 0); +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id = 0); +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node); +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node); + +const bool is_float_equal(float a, float b, float eps = 1e-8); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc new file mode 100644 index 00000000000000..f0c19cac3a6c3f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *elementwise_op_handler(Graph *graph, Node *node, + const std::string &type) { + auto *op = node->Op(); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + int64_t x_rank = x_shape.size(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + int64_t y_rank = y_shape.size(); + + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) { + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs); + return new_node; + } else { + auto y_new_shape = std::vector<int64_t>(x_rank, 1); + for (int i = axis; i < axis + y_rank; ++i) { + y_new_shape[i] = y_shape[i - axis]; + } + auto attrs = AttributeMap{ + {"value", y_new_shape}, + {"dims", std::vector<int64_t>{x_rank}}, + {"dtype", ONNXDataType::INT64}, + }; + // constant + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + // reshape + auto new_node_reshape = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("Y", node), new_node_const->outputs[0]}, {}); + // elementwise_op + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), new_node_reshape->outputs[0]}, + node->outputs); + return new_node; + } +} + +Node *elementwise_add_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_add"); +} + +Node *elementwise_sub_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_sub"); +} + +Node *elementwise_div_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_div"); +} + +Node *elementwise_mul_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mul"); +} + +Node *elementwise_min_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_min"); +} + +Node *elementwise_max_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_max"); +} + +Node *elementwise_pow_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_pow"); +} + +Node *elementwise_mod_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mod"); +} + +REGISTER_HANDLER(elementwise_add, elementwise_add_handler); +REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler); +REGISTER_HANDLER(elementwise_div, elementwise_div_handler); +REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler); +REGISTER_HANDLER(elementwise_min, elementwise_min_handler); +REGISTER_HANDLER(elementwise_max, elementwise_max_handler); +REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler); +REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc new file mode 100644 index 00000000000000..92362ebf5be7d5 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *equal_handler(Graph *graph, Node *node) { + auto new_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs); + return new_node; +} + +REGISTER_HANDLER(equal, equal_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc new file mode 100644 index 00000000000000..af7e4d0c7dbe9d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *mean_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_reducemean", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, + { + {"keepdims", int64_t{0}}, + }); +} + +Node *pow_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, + node->outputs); + } else { + // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) + auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); + auto attrs = + MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT); + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), + new_node_const->outputs[0]}, + node->outputs); + } +} + +Node *mul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims")); + auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims")); + auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape(); + + // build the shape for reshape + std::vector<int64_t> reshape_shape_{}; + for (int left = 0; left < x_num_col_dims; left++) { + reshape_shape_.push_back(int64_t(x_shape_[left])); + } + for (int right = y_num_col_dims; right < y_shape_.size(); right++) { + reshape_shape_.push_back(int64_t(y_shape_[right])); + } + auto x_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)}, + {}, {{"axis", int64_t(x_num_col_dims)}}); + auto y_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)}, + {}, {{"axis", int64_t(y_num_col_dims)}}); + auto matmul = + CreateBaseOp(graph, node, "popart_matmul", + {x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {}); + + auto reshape_const = CreateConst( + graph, node, {}, {}, + {{"value", reshape_shape_}, + {"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + return CreateBaseOp(graph, node, "popart_reshape", + {matmul->outputs[0], reshape_const->outputs[0]}, + node->outputs, {}); +} + +Node *matmul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); + auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); + auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + + int x_rank = x_shape.size(); + std::vector<int64_t> perm; + if (x_rank == 1) { + perm = std::vector<int64_t>{0}; + } else if (x_rank == 2) { + return CreateGemm(graph, node, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs, transpose_x, transpose_y, alpha); + } else if (x_rank == 3) { + perm = std::vector<int64_t>{0, 2, 1}; + } else if (x_rank == 4) { + perm = std::vector<int64_t>{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", x_rank)); + } + + Node *x_node = GetInputVarNode("X", node); + Node
+  Node *y_node = GetInputVarNode("Y", node);
+  if (transpose_x) {
+    x_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("X", node)}, {}, {{"perm", perm}});
+    x_node = x_node->outputs[0];
+  }
+  if (transpose_y) {
+    y_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
+    y_node = y_node->outputs[0];
+  }
+  // only insert an explicit scaling op when alpha != 1.0
+  if (!is_float_equal(alpha, 1.0)) {
+    auto o_node =
+        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
+    auto attr =
+        MakeConstAttrMapFromValue<float>(alpha, {1}, ONNXDataType::FLOAT);
+    auto const_node = CreateConst(graph, node, {}, {}, attr);
+    return CreateBaseOp(graph, node, "popart_mul",
+                        {o_node->outputs[0], const_node->outputs[0]},
+                        node->outputs);
+  } else {
+    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
+                        node->outputs);
+  }
+}
+
+Node *sum_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
+}
+
+Node *softmax_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
+  return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
+}
+
+Node *scale_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
+  auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
+  auto bias_after_scale_ =
+      BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
+  auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
+
+  auto new_node_bias_var =
+      CreateConst(graph, node, {}, {},
+                  {{"value", std::vector<float>{bias_}},
+                   {"dims", std::vector<int64_t>{1}},
+                   {"dtype", ONNXDataType::FLOAT}});
+  new_node_bias_var = new_node_bias_var->outputs[0];
+
+  Node *new_node_scale_var = nullptr;
+  if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
+    new_node_scale_var = GetInputVarNode("ScaleTensor", node);
+  } else {
+    new_node_scale_var =
+        CreateConst(graph, node, {}, {},
+                    {{"value", std::vector<float>{scale_}},
+                     {"dims", std::vector<int64_t>{1}},
+                     {"dtype", ONNXDataType::FLOAT}});
+    new_node_scale_var = new_node_scale_var->outputs[0];
+  }
+
+  // compute in float32, then cast back to the input data type at the end
+  auto new_node_cast =
+      CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
+                 static_cast<int>(framework::proto::VarType::FP32));
+  Node *result = nullptr;
+  if (bias_after_scale_) {
+    auto new_node_mul =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_cast->outputs[0], new_node_scale_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_mul->outputs[0], new_node_bias_var}, {}, {});
+  } else {
+    auto new_node_add =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_cast->outputs[0], new_node_bias_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_add->outputs[0], new_node_scale_var}, {}, {});
+  }
+  auto result_after_cast =
+      CreateCast(graph, node, result->outputs, node->outputs,
+                 static_cast<int>(data_type_));
+  return result_after_cast;
+}
+
+Node *cross_entropy2_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
+  auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
+                             framework::proto::VarType::INT32);
+  auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
+  if (label_shape_.size() == 1) {
+    return CreateBaseOp(graph, node, "popart_nllloss",
+                        {GetInputVarNode("X", node), new_cast->outputs[0]},
+                        {GetOutputVarNode("Y", node)},
+                        {
+                            {"ignoreIndex", ignoreIndex},
+                        });
+  } else {
+    std::vector<int64_t> new_shape_{label_shape_[0]};
+    auto const_before_loss = CreateBaseOp(
+        graph, node, "popart_constant", {}, {},
+        {{"value", new_shape_},
+         {"dims",
+          std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
+         {"dtype", ONNXDataType::INT64}});
+
+    auto reshape_before_loss = CreateBaseOp(
+        graph, node, "popart_reshape",
+        {new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {});
+
+    auto nllloss = CreateBaseOp(
+        graph, node, "popart_nllloss",
+        {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
+        {
+            {"ignoreIndex", ignoreIndex},
+        });
+
+    auto const_after_loss = CreateBaseOp(
+        graph, node, "popart_constant", {}, {},
+        {{"value", label_shape_},
+         {"dims",
+          std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
+         {"dtype", ONNXDataType::INT64}});
+
+    auto reshape_after_loss =
+        CreateBaseOp(graph, node, "popart_reshape",
+                     {nllloss->outputs[0], const_after_loss->outputs[0]},
+                     {GetOutputVarNode("Y", node)}, {});
+    return reshape_after_loss;
+  }
+}
+
+REGISTER_HANDLER(mean, mean_handler);
+REGISTER_HANDLER(pow, pow_handler);
+REGISTER_HANDLER(mul, mul_handler);
+REGISTER_HANDLER(matmul, matmul_handler);
+REGISTER_HANDLER(sum, sum_handler);
+REGISTER_HANDLER(softmax, softmax_handler);
+REGISTER_HANDLER(scale, scale_handler);
+REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
+
+}  // namespace
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
new file mode 100644
index 00000000000000..58f3e42b7387a7
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -0,0 +1,301 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
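Editor's note: a worked shape example for the mul_handler decomposition in math_ops.cc above, with illustrative values X = [2, 3, 4], x_num_col_dims = 1, Y = [12, 5], y_num_col_dims = 1:

// popart_flatten(X, axis=1) -> [2, 12]
// popart_flatten(Y, axis=1) -> [12, 5]
// popart_matmul             -> [2, 5]
// popart_reshape to x_shape[:x_num_col_dims] + y_shape[y_num_col_dims:]
//                           -> [2] + [5] = [2, 5]

The final reshape restores the leading dims of X and the trailing dims of Y around the flattened matmul result.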
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *conv2d_handler(Graph *graph, Node *node) { + OpDesc *op = node->Op(); + auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); + auto dilations = std::vector{dilations_.begin(), dilations_.end()}; + auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto pads_ = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + if (pads_.size() == 2) { + pads_.push_back(pads_[0]); + pads_.push_back(pads_[1]); + } + auto pads = std::vector{pads_.begin(), pads_.end()}; + auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto stride = std::vector{stride_.begin(), stride_.end()}; + if (op->HasInput("Bias") && !op->Input("Bias").empty()) { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } else { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } +} + +Node *batch_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + std::vector inputs; + inputs.push_back(GetInputVarNode("X", node)); + inputs.push_back(GetInputVarNode("Scale", node)); + inputs.push_back(GetInputVarNode("Bias", node)); + inputs.push_back(GetInputVarNode("Mean", node)); + inputs.push_back(GetInputVarNode("Variance", node)); + int64_t num_outputs = 1; + std::vector outputs; + auto is_test_type = op->GetAttrType("is_test"); + bool is_test; + if (is_test_type == 0) { + // int + is_test = BOOST_GET_CONST(int, op->GetAttr("is_test")); + } else { + // bool + is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test")); + } + outputs.push_back(GetOutputVarNode("Y", node)); + if (!is_test) { + outputs.push_back(GetOutputVarNode("MeanOut", node)); + outputs.push_back(GetOutputVarNode("VarianceOut", node)); + outputs.push_back(GetOutputVarNode("SavedMean", node)); + outputs.push_back(GetOutputVarNode("SavedVariance", node)); + num_outputs = 5; + } + // outputs.push_back(GetOutputVarNode("ReserveSpace", node)); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + // data_layout + return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs, + { + {"momentum", momentum}, + {"epsilon", epsilon}, + {"num_outputs", num_outputs}, + }); +} + +Node *pool2d_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); + auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (global_pooling) { + if (pooling_type == "max") { + return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, + node->outputs); + } else if (pooling_type == "avg") { + return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs, + node->outputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } + } + if (op->HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm")); + if (padding_algorithm != 
"EXPLICIT") { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn padding_algorithm: %s", padding_algorithm)); + } + } + + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + auto kernel_shape = std::vector{ksize.begin(), ksize.end()}; + auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode")); + auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0); + auto paddings = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + auto pads = std::vector{paddings.begin(), paddings.end()}; + if (pads.size() == 2) { + pads.push_back(paddings[0]); + pads.push_back(paddings[1]); + } + auto strides_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto strides = std::vector{strides_.begin(), strides_.end()}; + if (pooling_type == "max") { + int64_t num_outputs = 1; + auto dilations = std::vector{}; + int64_t storage_order = 0; + return CreateBaseOp(graph, node, "popart_maxpool", node->inputs, + node->outputs, { + {"num_outputs", num_outputs}, + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"dilations", dilations}, + {"pads", pads}, + {"storage_order", storage_order}, + {"strides", strides}, + }); + } else if (pooling_type == "avg") { + int64_t count_include_pad = 0; + return CreateBaseOp(graph, node, "popart_averagepool", node->inputs, + node->outputs, + { + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"count_include_pad", count_include_pad}, + {"pads", pads}, + {"strides", strides}, + }); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } +} + +Node *group_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto groups = int64_t{groups_}; + auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node), + GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}; + return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_, + outputs_, attrs_); +} + +Node *instance_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto attrs_ = AttributeMap{{"epsilon", epsilon_}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node)}; + return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_, + outputs_, attrs_); +} + +Node *layer_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis")); + auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + + std::vector norm_shape_{1, 1}; + for (int i = 0; i < input_shape_.size(); i++) { + if (i < begin_norm_axis_) { + norm_shape_[0] *= input_shape_[i]; + } else { + norm_shape_[1] *= input_shape_[i]; + } + } + + auto attrs1 = AttributeMap{ + {"value", norm_shape_}, + {"dims", std::vector{static_cast(norm_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto reshape1_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1); + auto new_node_reshape1 = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("X", node), 
+  auto new_node_reshape1 = CreateBaseOp(
+      graph, node, "popart_reshape",
+      {GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});
+
+  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
+  int64_t groups_ = 1;
+  auto groupnorm_attrs_ =
+      AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
+  auto out_Y_ = MakeVarNode(graph, node);
+  CreateBaseOp(graph, node, "popart_groupnormalization_v2",
+               {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
+                GetInputVarNode("Bias", node)},
+               {out_Y_, GetOutputVarNode("Mean", node),
+                GetOutputVarNode("Variance", node)},
+               groupnorm_attrs_);
+
+  auto attrs2 = AttributeMap{
+      {"value", input_shape_},
+      {"dims",
+       std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
+      {"dtype", ONNXDataType::INT64}};
+  auto reshape2_const =
+      CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
+  auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape",
+                                        {out_Y_, reshape2_const->outputs[0]},
+                                        {GetOutputVarNode("Y", node)}, {});
+  return new_node_reshape2;
+}
+
+Node *dropout_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
+  auto dropout_implementation_ =
+      BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
+  auto is_test_type_ = op->GetAttrType("is_test");
+  bool is_test_;
+  if (is_test_type_ == 0) {
+    // is_test was stored as an int attribute
+    is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
+  } else {
+    // is_test was stored as a bool attribute
+    is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
+  }
+
+  if (is_test_) {
+    if (dropout_implementation_ == "upscale_in_train") {
+      return CreateBaseOp(graph, node, "popart_identity",
+                          {GetInputVarNode("X", node)},
+                          {GetOutputVarNode("Out", node)}, {});
+    } else if (dropout_implementation_ == "downgrade_in_infer") {
+      auto scale =
+          CreateConst(graph, node, {}, {},
+                      {{"value", std::vector<float>{1 - dropout_prob_}},
+                       {"dims", std::vector<int64_t>{1}},
+                       {"dtype", ONNXDataType::FLOAT}});
+      return CreateBaseOp(graph, node, "popart_mul",
+                          {GetInputVarNode("X", node), scale->outputs[0]},
+                          {GetOutputVarNode("Out", node)}, {});
+    } else {
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("Invalid dropout_implementation"));
+    }
+  } else {
+    if (dropout_implementation_ == "upscale_in_train") {
+      auto attrs_ =
+          AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}};
+      return CreateBaseOp(graph, node, "popart_dropout",
+                          {GetInputVarNode("X", node)},
+                          {GetOutputVarNode("Out", node)}, attrs_);
+    } else if (dropout_implementation_ == "downgrade_in_infer") {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "downgrade_in_infer is not supported in training"));
+    } else {
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("Invalid dropout_implementation"));
+    }
+  }
+}
+
+REGISTER_HANDLER(pool2d, pool2d_handler);
+REGISTER_HANDLER(batch_norm, batch_norm_handler);
+REGISTER_HANDLER(group_norm, group_norm_handler);
+REGISTER_HANDLER(instance_norm, instance_norm_handler);
+REGISTER_HANDLER(layer_norm, layer_norm_handler);
+REGISTER_HANDLER(conv2d, conv2d_handler);
+REGISTER_HANDLER(dropout, dropout_handler);
+
+}  // namespace
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
new file mode 100644
index 00000000000000..b7a3a8ca7c60f5
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>   // std::make_unique
+#include <numeric>  // std::iota
+
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+
+// file-scope counters used to generate unique var/op names
+static int var_count = 0;
+static int op_count = 0;
+
+const std::string GenerateVarName() {
+  return std::string("_gen_var_") + std::to_string(var_count++);
+}
+
+const std::string GenerateOpName() {
+  return std::string("_gen_op_") + std::to_string(op_count++);
+}
+
+const std::string CreateOpIdentifyId(Node *node) {
+  // format: op_type|out_var0|out_var1|...|_gen_*
+  // this name will be used as the op name when exporting an onnx model from
+  // popart
+  auto op_type = node->Name();
+  std::string op_out = "";
+  for (auto *out_node : node->outputs) {
+    op_out += "|";
+    op_out += out_node->Name();
+  }
+  return {op_type + op_out + "|" + GenerateOpName()};
+}
+
+Node *MakeVarNode(Graph *graph, Node *node) {
+  auto var_name = GenerateVarName();
+  auto var_desc = std::make_unique<framework::VarDesc>(var_name);
+
+  auto var = graph->CreateVarNode(var_desc.get());
+  return var;
+}
+
+Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
+                 const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs) {
+  auto op_desc = std::make_unique<framework::OpDesc>();
+  op_desc->SetType(type);
+  auto op = graph->CreateOpNode(op_desc.get());
+
+  for (auto *in : inputs) {
+    ConnectNodes(in, op);
+  }
+  if (outputs.empty()) {
+    auto var = MakeVarNode(graph, node);
+    ConnectNodes(op, var);
+  } else {
+    for (auto *out : outputs) {
+      ConnectNodes(op, out);
+    }
+  }
+
+  // record the op's i/o var names on the OpDesc
+  std::vector<std::string> input_names;
+  for (auto node : op->inputs) {
+    input_names.push_back(node->Name());
+  }
+  op->Op()->SetInput("__inputs__", input_names);
+  std::vector<std::string> output_names;
+  for (auto node : op->outputs) {
+    output_names.push_back(node->Name());
+  }
+  op->Op()->SetOutput("__outputs__", output_names);
+  op->Op()->Flush();
+
+  return op;
+}
+
+Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
+                   const std::vector<Node *> &inputs,
+                   const std::vector<Node *> &outputs,
+                   const AttributeMap &attrs) {
+  auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
+  if (!attrs.empty()) {
+    new_node->Op()->SetAttrMap(attrs);
+  }
+  // deal with the special attrs: inherit ipu_index/ipu_stage from the
+  // original op when the new op has not set them
+  if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
+    CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
+  }
+  if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
+    CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
+  }
+  {
+    new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
+    new_node->Op()->Flush();
+  }
+
+  return new_node;
+}
+
+Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                  const std::vector<Node *> &outputs,
+                  const AttributeMap &attrs) {
+  return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
+}
+
+Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, const int otype) {
+  auto to = VarType2PopStr(otype);
+  return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
+                      {{"to", to}});
+}
+
+Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, int64_t transA,
+                 int64_t transB, float alpha, float beta) {
+  return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
+                      {
+                          {"alpha", alpha},
+                          {"beta", beta},
+                          {"transA", transA},
+                          {"transB", transB},
+                      });
+}
+
+Node *CreateReshape(Graph *graph, Node *node,
+                    const std::vector<Node *> &inputs,
+                    const std::vector<Node *> &outputs,
+                    const std::vector<int64_t> &oshape) {
+  auto attr = AttributeMap{
+      {"value", oshape},
+      {"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
+      {"dtype", ONNXDataType::INT64}};
+  auto new_node_const =
+      CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
+  auto new_node_reshape =
+      CreateBaseOp(graph, node, "popart_reshape",
+                   {inputs[0], new_node_const->outputs[0]}, outputs);
+  return new_node_reshape;
+}
+
+Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs,
+                 const std::vector<int64_t> &dilations, int64_t group,
+                 const std::vector<int64_t> &kernel_shape,
+                 const std::vector<int64_t> &pads,
+                 const std::vector<int64_t> &strides) {
+  auto attrs = AttributeMap{
+      {"dilations", dilations},       {"group", group},
+      {"kernel_shape", kernel_shape}, {"pads", pads},
+      {"strides", strides},
+  };
+  return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
+}
+
+Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
+                           const std::vector<Node *> &inputs,
+                           const std::vector<Node *> &outputs, int64_t axis) {
+  PADDLE_ENFORCE_EQ(inputs.size(), 1,
+                    platform::errors::InvalidArgument(
+                        "The softmax op only supports one input"));
+  auto x_shape = inputs[0]->Var()->GetShape();
+  int x_rank = x_shape.size();
+  if (axis < 0) {
+    axis = axis + x_rank;
+  }
+  if (axis == x_rank - 1) {
+    return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
+                        {{"axis", int64_t{-1}}});
+  } else {
+    // move the softmax axis to the last dim, apply softmax, then move it back
+    auto perm = std::vector<int64_t>(x_rank);
+    std::iota(perm.begin(), perm.end(), 0);
+    perm[x_rank - 1] = axis;
+    perm[axis] = x_rank - 1;
+    auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
+                                          inputs, {}, {{"perm", perm}});
+    auto new_softmax =
+        CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs,
+                     {}, {{"axis", int64_t{-1}}});
+    return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs,
+                        outputs, {{"perm", perm}});
+  }
+}
+
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
new file mode 100644
index 00000000000000..7e70e56ef9166c
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
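Editor's note: a worked example for the CreateSoftmaxOpset11 fallback above, with illustrative values x_rank = 4 and axis = 1: perm starts as {0, 1, 2, 3}, and the two assignments swap entries 1 and 3, giving {0, 3, 2, 1}. The emitted graph is then

// transpose(perm={0,3,2,1}) -> softmax(axis=-1) -> transpose(perm={0,3,2,1})

which computes softmax over the original axis 1; the same perm can be reused for the second transpose because a single swap is its own inverse.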
+
+#pragma once
+
+#include "paddle/fluid/platform/device/ipu/common.h"
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+
+using paddle::framework::AttributeMap;
+
+template <typename T>
+AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
+                              int dtype) {
+  return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
+}
+
+template <typename T>
+AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
+                                       int dtype) {
+  size_t size = 1;
+  for (auto &dim : dims) {
+    size *= dim;
+  }
+  return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
+}
+
+const std::string GenerateVarName();
+const std::string CreateOpIdentifyId(Node *node);
+
+Node *MakeVarNode(Graph *graph, Node *node);
+Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
+                 const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs);
+
+Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
+                   const std::vector<Node *> &inputs,
+                   const std::vector<Node *> &outputs,
+                   const AttributeMap &attrs = {});
+
+Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                  const std::vector<Node *> &outputs,
+                  const AttributeMap &attrs);
+
+// otype is proto::VarType::Type
+Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, const int otype);
+
+Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, int64_t transA = 0,
+                 int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
+
+Node *CreateReshape(Graph *graph, Node *node,
+                    const std::vector<Node *> &inputs,
+                    const std::vector<Node *> &outputs,
+                    const std::vector<int64_t> &oshape);
+
+Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs,
+                 const std::vector<int64_t> &dilations = {1, 1},
+                 int64_t group = 1,
+                 const std::vector<int64_t> &kernel_shape = {},
+                 const std::vector<int64_t> &pads = {0, 0, 0, 0},
+                 const std::vector<int64_t> &strides = {1, 1});
+
+Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
+                           const std::vector<Node *> &inputs,
+                           const std::vector<Node *> &outputs, int64_t axis);
+
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
index 4cd7f928f6e22b..763c5a46abe287 100644
--- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
+++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
 OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
 OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
 OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
+
+// clang-format on
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 60b99a964a57fe..d4fa1b2c89abf3 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
     "If set true, the queue.pop will only get data from queue but not "
     "remove the data from queue for speed testing");
 
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index 96ad9ade8e3ad5..d1e60c4505d6b0 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -22,6 +22,10 @@ set(api_source_file
${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) set(api_source_file_tmp ${api_source_file}.tmp) +if (NOT PYTHON_EXECUTABLE) + find_package(PythonInterp REQUIRED) +endif() + add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h new file mode 100644 index 00000000000000..8c21094a4af202 --- /dev/null +++ b/paddle/pten/api/lib/kernel_declare.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/kernel_registry.h" + +// TODO(chenweihang) After the kernel is split into a single file, +// the kernel declare statement is automatically generated according to the +// file name of the kernel, and this header file will be removed + +PT_DECLARE_KERNEL(full_like, CPU); +PT_DECLARE_KERNEL(dot, CPU); +PT_DECLARE_KERNEL(flatten, CPU); +PT_DECLARE_KERNEL(sign, CPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(full_like, CUDA); +PT_DECLARE_KERNEL(dot, CUDA); +PT_DECLARE_KERNEL(flatten, CUDA); +PT_DECLARE_KERNEL(sign, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(flatten, XPU); +#endif diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index e17b19d9f689e1..bfde9b14b0020d 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -25,10 +25,14 @@ limitations under the License. */ #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" -PT_DECLARE_MODULE(UtilsCPU); +PT_DECLARE_KERNEL(copy, CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(UtilsCUDA); +PT_DECLARE_KERNEL(copy, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(copy, XPU); #endif namespace paddle { diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index cd6fa80906cfbd..be624177dfb14c 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -24,6 +25,8 @@ #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_utils.h" +#include "paddle/fluid/platform/enforce.h" + namespace pten { #define BACKEND(arg__) pten::Backend::arg__ @@ -193,64 +196,35 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/** PT_REGISTER_KERNEL + * + * The most frequently used kernel registration macro, used for kernel + * registration with only data type as template parameter, and the function + * pointer of the corresponding data type is automatically instantiated + * during registration. + */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - _PT_REGISTER_KERNEL(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL must be called in global namespace."); \ + _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) + #ifndef _WIN32 -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) #else -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) -#endif - -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) - /** * `template decltype(fn) fn` can work on gcc and clang, * but msvc will failed, error like: @@ -261,8 +235,30 @@ struct KernelRegistrar { * * https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua * - * So we solve the explict instantiation of kernel by CMake + * And msvc can work without template instantiation */ +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) +#endif + +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) 
\ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, cpp_dtype, __VA_ARGS__) #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ template decltype(meta_kernel_fn) meta_kernel_fn @@ -309,22 +305,15 @@ struct KernelRegistrar { template decltype(meta_kernel_fn) meta_kernel_fn; \ PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) -#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_REGISTRAR_INIT( \ + kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + kernel_name, \ + backend, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ __VA_ARGS__) // clang-format off @@ -333,7 +322,6 @@ struct KernelRegistrar { and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ kernel_name, \ - func_id, \ backend, \ layout, \ args_def_fn, \ @@ -342,7 +330,6 @@ struct KernelRegistrar { ...) \ PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -354,7 +341,6 @@ struct KernelRegistrar { // clang-format on #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -363,17 +349,17 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); + PT_KERNEL(meta_kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -382,8 +368,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -392,7 +378,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -400,7 +385,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -409,8 +393,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -419,7 +403,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -427,7 +410,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -436,8 +418,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -446,7 +428,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -454,7 +435,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -463,8 +443,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -473,7 +453,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -481,7 +460,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -490,8 +468,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -500,7 +478,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -508,7 +485,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -517,8 +493,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -527,7 +503,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -535,7 +510,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -544,8 +518,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -554,7 +528,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -562,7 +535,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -571,8 +543,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -581,7 +553,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -589,7 +560,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -598,8 +568,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -608,7 +578,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -616,7 +585,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -625,8 +593,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -635,7 +603,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -643,7 +610,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -652,8 +618,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -662,7 +628,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -670,7 +635,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -679,8 +643,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -689,7 +653,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -697,7 +660,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -706,8 +668,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -716,7 +678,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -724,7 +685,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -733,8 +693,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -743,7 +703,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -751,90 +710,59 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -#define PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, backend, layout, dtype, kernel_fn) \ - _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, PT_ID, backend, layout, dtype, kernel_fn) - -#define _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, func_id, backend, layout, dtype, kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \ - template decltype(kernel_fn) kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - DATATYPE(dtype), \ - ::pten::KernelArgsParseFunctor::Parse, \ - args_def_fn, \ - PT_KERNEL(kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*) - -// use to declare symbol -#define PT_REGISTER_MODULE(name) \ - int RegisterSymbolsFor##name() { return 0; } - -#define PT_DECLARE_MODULE(name) \ - extern int RegisterSymbolsFor##name(); \ - UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name() - -// only used in cpp tests - -#define PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
-  static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
-  PT_KERNEL_REGISTRAR_INIT( \
-      kernel_name, \
-      func_id, \
-      backend, \
-      layout, \
-      &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \
-      meta_kernel_fn, \
-      cpp_dtype, \
-      __VA_ARGS__); \
-  void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
-
-#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-    kernel_name, backend, layout, meta_kernel_fn) \
-  _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-      kernel_name, PT_ID, backend, layout, meta_kernel_fn)
-
-#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-    kernel_name, func_id, backend, layout, meta_kernel_fn) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
-  decltype(meta_kernel_fn) meta_kernel_fn; \
-  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
-  static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
-                                                      func_id)( \
-      kernel_name, \
-      BACKEND(backend), \
-      DATALAYOUT(layout), \
-      ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
-      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
-      PT_KERNEL(meta_kernel_fn)); \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
+/** PT_REGISTER_SINGLE_KERNEL
+ *
+ * Used to register a single kernel: pass in the kernel's complete function
+ * pointer; this registration macro does not perform automatic template
+ * instantiation.
+ */
+#define PT_REGISTER_SINGLE_KERNEL( \
+    kernel_name, backend, layout, dtype, kernel_fn) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
+      pt_register_single_kernel_ns_check_##kernel_name, \
+      "PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \
+  static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
+  static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \
+      #kernel_name, \
+      BACKEND(backend), \
+      DATALAYOUT(layout), \
+      DATATYPE(dtype), \
+      ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
+      &__PT_SINGLE_KERNEL_args_def_FN_##kernel_name, \
+      PT_KERNEL(kernel_fn)); \
+  int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
+  void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
+
+/** PT_REGISTER_KERNEL_ALL_DTYPE
+ *
+ * Used to register a kernel that supports all data types, such as copy and
+ * reshape, which are not sensitive to data types.
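+ *
+ * Usage sketch (mirroring the copy kernel registration later in this
+ * change):
+ *
+ *   PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}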
+ */
+#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
+      pt_register_kernel_all_dtype_ns_check_##kernel_name, \
+      "PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \
+  static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \
+      ::pten::Kernel*); \
+  static const ::pten::KernelRegistrar \
+      __reg_pt_kernel_all_dtype_##kernel_name( \
+          #kernel_name, \
+          BACKEND(backend), \
+          DATALAYOUT(layout), \
+          ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
+          &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \
+          PT_KERNEL(kernel_fn)); \
+  int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
+  void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel)
+
+/** PT_DECLARE_KERNEL
+ *
+ * Used to export the symbols of the file where the kernel is located,
+ * so that they are not removed by the linker.
+ */
+#define PT_DECLARE_KERNEL(kernel_name, backend) \
+  extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \
+  UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \
+      TouchKernelSymbolFor_##kernel_name##_##backend()
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index 4f09fc489f8f67..4175203410f8da 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_MODULE(CreationCPU);
-
-PT_REGISTER_KERNEL("full_like",
+PT_REGISTER_KERNEL(full_like,
                    CPU,
                    ANY,
                    pten::FillAnyLike,
@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
                    bool,
                    paddle::platform::float16) {}
 
-PT_REGISTER_KERNEL("full",
+PT_REGISTER_KERNEL(full,
                    CPU,
                    ANY,
                    pten::FillConstant,
diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
index 32411560b55168..7ffac0537b60c0 100644
--- a/paddle/pten/kernels/cpu/linalg.cc
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_MODULE(LinalgCPU);
-
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL("dot",
+PT_REGISTER_KERNEL(dot,
                    CPU,
                    ANY,
                    pten::Dot,
@@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot",
                    complex128) {}
 
 PT_REGISTER_KERNEL(
-    "matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) {
-}
+    matmul_v2, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {}
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index e0e9cefbf671bf..61c6cb57a9f780 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-// TODO(chenweihang): replace by better impl
-PT_REGISTER_MODULE(ManipulationCPU);
-
 // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
 // architecture, kernel_name should be "flatten".
-PT_REGISTER_KERNEL("flatten", +PT_REGISTER_KERNEL(flatten, CPU, ANY, pten::Flatten, @@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CPU, ANY, pten::FlattenWithXShape, @@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid", int8_t, int, int64_t) {} -PT_REGISTER_KERNEL("cast", + +PT_REGISTER_KERNEL(cast, CPU, ANY, pten::Cast, @@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast", kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CPU, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CPU, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CPU, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CPU, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CPU, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CPU, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CPU, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CPU, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CPU, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index ddfb2f5f458540..2d556d96c2fcf7 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul) } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCPU); - using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; - -PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {} -PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {} +PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CPU, ANY, pten::Scale, @@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} - -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CPU, ANY, pten::ElementwiseAdd, @@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CPU, ANY, 
pten::ElementwiseSub, @@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CPU, ANY, pten::ElementwiseDiv, @@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CPU, ANY, pten::ElementwiseMul, @@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply", bool, complex64, complex128) {} - -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CPU, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc index b462ef70c2f06e..500b4664d63888 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/utils.cc @@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu index 8bc23fb6af056f..dd29fd5fbb84d2 100644 --- a/paddle/pten/kernels/cuda/creation.cu +++ b/paddle/pten/kernels/cuda/creation.cu @@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCUDA); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CUDA, ANY, pten::FillAnyLike, @@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CUDA, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu index fe2ac6f184ff73..b08ed8f71ee6b2 100644 --- a/paddle/pten/kernels/cuda/linalg.cu +++ b/paddle/pten/kernels/cuda/linalg.cu @@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CUDA, ANY, pten::Dot, @@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot", complex64, complex128) {} -PT_REGISTER_KERNEL("matmul_v2", +PT_REGISTER_KERNEL(matmul_v2, CUDA, ANY, pten::Matmul, diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu index acaf1ac2cc62b2..e668d1b04d7238 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/cuda/manipulation.cu @@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCUDA); - using float16 = paddle::platform::float16; -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten", + +PT_REGISTER_KERNEL(flatten, CUDA, ANY, pten::Flatten, @@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CUDA, ANY, pten::FlattenWithXShape, @@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid", int64_t) {} #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ - PT_REGISTER_KERNEL("cast", \ + PT_REGISTER_KERNEL(cast, \ CUDA, \ ANY, \ pten::Cast, \ @@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CUDA, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CUDA, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", - CUDA, - ANY, - pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CUDA, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CUDA, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CUDA, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CUDA, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CUDA, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CUDA, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CUDA, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index 388d42942c10a4..66aaf14dcd0f62 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {} -PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {} +PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CUDA, ANY, pten::Scale, @@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CUDA, ANY, pten::ElementwiseAdd, @@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CUDA, ANY, pten::ElementwiseSub, @@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CUDA, ANY, pten::ElementwiseDiv, @@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CUDA, ANY, pten::ElementwiseMul, @@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply", float16, complex64, complex128) {} 
-PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CUDA, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu index 24da650d1f3eb9..49027e956b2d7d 100644 --- a/paddle/pten/kernels/cuda/utils.cu +++ b/paddle/pten/kernels/cuda/utils.cu @@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx, } } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCUDA); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc index 5f1c0d42eb5a8f..f361933cad45a5 100644 --- a/paddle/pten/kernels/xpu/manipulation.cc +++ b/paddle/pten/kernels/xpu/manipulation.cc @@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationXPU); - -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten_contiguous_range", +PT_REGISTER_KERNEL(flatten, XPU, ANY, pten::Flatten, @@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range", int, int64_t) {} -PT_REGISTER_KERNEL("flatten_contiguous_range.mid", +PT_REGISTER_KERNEL(flatten_mid, XPU, ANY, pten::FlattenWithXShape, @@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid", int, int64_t) {} -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2", - XPU, - ANY, - pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {} diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc index 329dc2baf87b58..5c98217f4ec2c4 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/utils.cc @@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsXPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {} diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index b6179f11b1019e..227dcc6e9568d7 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -21,12 +21,6 @@ limitations under the License. 
*/ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -PT_DECLARE_MODULE(ManipulationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(ManipulationCUDA); -#endif - namespace paddle { namespace tests { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ba809dceee191b..6ffdd75f72b4ec 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -230,6 +230,8 @@ from .tensor.math import lerp # noqa: F401 from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 +from .tensor.math import gcd # noqa: F401 +from .tensor.math import lcm # noqa: F401 from .tensor.math import diff # noqa: F401 from .tensor.math import angle # noqa: F401 @@ -483,6 +485,8 @@ 'atan2', 'rad2deg', 'deg2rad', + 'gcd', + 'lcm', 'expand', 'broadcast_to', 'ones_like', diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ffd24add50a4d0..dc313c33ee3e28 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -27,11 +27,13 @@ import paddle import paddle.fluid as fluid from paddle import framework +from paddle.fluid import core import paddle.distributed as dist from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.internal_storage import ParamStorage -from ...meta_parallel.sharding.sharding_utils import Type +from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad # CUDA alignment 256 bytes alignment = {"gpu": 256, } @@ -99,16 +101,41 @@ def __init__(self, self.broadcast_fp16 = broadcast_fp16 self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + group, + paddle.get_device()) + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + self.offload = offload # Using for offload + self.offload_device = "cpu" + + self._master_params = {} # Update optimizer parameters and adjust parameter storage and use according to rank. self.update_opt_status() def _generate_master_params(self, trainable_params): - for param in trainable_params: - if param.dtype == Type.fp16.value: - self._optim._master_weights[param.name] = paddle.cast( - param, Type.fp32.value) + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.VarBase( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + self._optim._master_weights = self._master_params + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.name] = paddle.cast( + param, Type.fp32.value) def update_opt_status(self): """Update optimizer status and parameter storage information, and special functions to be developed. 
@@ -243,22 +270,43 @@ def step(self): A wrapper for Optimizer's step function to finish the update operation of the optimizer. """ - # Synchronize optimizer parameters for the current rank - if len(self.dtype_rank_params.keys( - )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp32.value][self.rank] - elif len(self.dtype_rank_params.keys( - )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + if self.offload: + self._optim._parameter_list = [ + param for name, param in self._master_params.items() + ] else: - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + self.dtype_rank_params[ + # Synchronize optimizer parameters for the current rank + if len(self.dtype_rank_params.keys( + )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ Type.fp32.value][self.rank] + elif len(self.dtype_rank_params.keys( + )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + else: + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + self.dtype_rank_params[ + Type.fp32.value][self.rank] # Run the optimizer of the current rank step - self._optim.step() + if self.offload: + with device_guard(self.rank, self.offload_device): + self._optim.step() + + for param in self._optim._parameter_list: + self._master_params[param.name].set_value(param) + + dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda(dev_id) + .cast(dtype=param.dtype)) + self._master_params[param.name].clear_gradient(False) + else: + self._optim.step() # Synchronize all the updated shards in between the ranks self._broadcast_params() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 37b85751149f71..fd49c2a7d65869 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -112,6 +112,18 @@ def __init__( self._has_grad_storage = [] self._grad_storage_list = [] + # offload + # TODO(haohongxiang): Now it's not supported for multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = self._sharding_optimizers[0].offload + self._offload_device = "cpu" + # Set backward pass hooks self._bw_hooks = [] @@ -156,7 +168,8 @@ def clear_gradients(self): # Release grad storages for dtype in self._grad_storages.keys(): if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.zero_() + if not self._offload: + self._grad_storages[dtype][self._rank].buffer.zero_() # Release params for param in self._trainable_params: @@ -167,17 +180,24 @@ def grad_scale(self): """ Before the gradient accumulation, scale the gradient. 
""" - # Scale grad storages - for dtype in self._grad_storages.keys(): - if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.scale_( - scale=self._world_size_scaling) - - # Scale params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + if self._offload: + for param in self._trainable_params: + if param.name in self._sharding_optimizers[ + 0]._master_params.keys(): + self._sharding_optimizers[0]._master_params[ + param.name].grad.scale_(scale=self._world_size_scaling) + else: + # Scale grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version(True) def _init_internal_storage(self, needs_fresh): """ @@ -195,8 +215,14 @@ def to(self, device=None, dtype=None, blocking=True): """ Synchronously or asynchronously convert the data type of the layer, the device is not supported now. """ + assert isinstance(device, str), "Device must be type str" assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + def _fresh_trainable(self): """ Whether to update training parameters. """ @@ -283,12 +309,17 @@ def reduce(*_): self._grad_reduced[index] = False if not self._accumulate_grads: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): if dst_rank != self._rank: param.clear_gradient(False) + elif self._offload: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from(param.grad.cpu( + ).cast(dtype=Type.fp32.value)) + param.clear_gradient(False) # Synchronize the reduce parameter gradient self._tasks_flow.append( @@ -339,6 +370,15 @@ def cleanup(): grad_storage.buffer.value().get_tensor()._clear( ) + elif self._offload: + grad_storage.to(device=self._offload_device) + for param in grad_storage._params: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from( + param.grad.cast( + dtype=Type.fp32.value)) + grad_storage.buffer.value().get_tensor()._clear( + ) # Reduce the bucket grad_storage.sent = True @@ -478,7 +518,7 @@ def _build_grad_storages(self): # Rebuild fp16/fp32 grad storages for dtype in self._grad_storages.keys(): for dst_rank, grad_storage in self._grad_storages[dtype].items(): - if dst_rank != self._rank: + if self._offload or dst_rank != self._rank: grad_storage.manumal_relase() grad_storage.rebuild() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index d4c443e385f6fe..651bed82396d1d 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -17,10 +17,17 @@ from collections import abc from enum import Enum from 
math import inf +import numpy as np +from types import MethodType import paddle import paddle.distributed as dist +from paddle import _C_ops from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph import base as imperative_base class Taskflow: @@ -41,6 +48,88 @@ class Type(Enum): fp32 = paddle.float32 +class ShardingClipGrad: + def __init__(self, clip, group, device): + self._clip = clip + self._group = group + self._device = device + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + + sum_square_fp16 = [] + sum_square_fp32 = [] + + for p, g in params_grads: + if g is None or getattr(p, 'need_clip', True) is False: + continue + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + sum_square_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + global_norm_var = global_norm_fp16 + global_norm_fp32 + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + @contextlib.contextmanager def device_guard(dev_id, device="cpu"): origin_device = paddle.device.get_device() @@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"): yield finally: paddle.set_device(origin_device) + + +@dygraph_only +def ShardingScaler(scaler, sharding_group): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + + for group in optimizer._param_groups: + for param in group['params']: + if param._grad_ivar() is not 
None: + param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) + else: + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 + ) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 + ) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=sharding_group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py index ff41ca217e43bb..f44b57ede468b2 100644 --- a/python/paddle/distributed/fleet/utils/internal_storage.py +++ b/python/paddle/distributed/fleet/utils/internal_storage.py @@ -50,6 +50,29 @@ def __init__(self, size, dtype, device, convert_cpu=False): else: self.buffer = paddle.zeros(size, dtype=dtype) + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" + assert (dtype == Type.fp32.value or + Type.fp16.value), "Conversion type is not supported now" + + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + + if self._device != device: + tmp_buffer = self.buffer.cuda( + dev_id) if device == "gpu" else self.buffer.cpu() + for param in self._params: + param.clear_gradient(False) + param._gradient_set_empty(False) + self.buffer.value().get_tensor()._clear() + self.buffer = tmp_buffer + + if dtype is not None: + self.buffer = self.buffer.cast(dtype=dtype) + class ParamStorage(InternalStorage): """ @@ -60,6 +83,16 @@ def __init__(self, size, dtype, device): super().__init__(size, dtype, device, convert_cpu=True) self.param2align = None + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + + super().to(device, dtype) + + if keep_alignment: + self._array_params() + @fluid.dygraph.no_grad def add_rank_params(self, trainable_params, param2align): """ @@ -78,7 +111,7 @@ def add_rank_params(self, trainable_params, param2align): p_shape = self._add_param_as_view(param, param2align[param.name]) cpu_param_shape.append(p_shape) - # buffer covert from cpu to cuda + # buffer convert from cpu to cuda dev_id = int(paddle.get_device().split(":")[1]) self.buffer = self.buffer.cuda(dev_id) self._fill = 0 @@ -109,7 +142,8 @@ def _add_param_as_view(self, param, align): param.stop_gradient = origin_state # 
Copy the current param value - dev_id = int(paddle.get_device().split(":")[1]) + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) with device_guard(dev_id, "cpu"): tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill, var_end)) @@ -134,6 +168,18 @@ def _convert_buffer(self, param, p_shape, align): self._fill = offset + @fluid.dygraph.no_grad + def _array_params(self): + """ + Given the parameters which have been registered previously, rebuild the whole InternalStorage. + """ + assert len(self._params) > 0 + assert self.param2align is not None + + self._fill = 0 + for p in self._params: + self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify + class GradStorage(InternalStorage): """ @@ -171,6 +217,18 @@ def can_add_grad_view(self, param, align): param.shape) + align <= self._max_size and id( param) not in self._param_ids + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + if self._release: + self.rebuild() + + super().to(device, dtype) + + if keep_alignment: + self._array_grads() + @fluid.dygraph.no_grad def add_grad(self, param, align): """ @@ -206,17 +264,25 @@ def rebuild(self): """ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. """ - assert len(self._params) > 0 if self._release: - self.buffer = paddle.zeros( - [self._max_size], dtype=self._params[0].dtype) + self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) self._release = False + @fluid.dygraph.no_grad + def _array_grads(self): + """ + Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. + """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + @fluid.dygraph.no_grad def _add_grad_as_view(self, param, align): assert np.prod( @@ -229,8 +295,17 @@ def _add_grad_as_view(self, param, align): assert offset <= np.prod(self.buffer.shape) # Copy the current grad value to InternalStorage - assert self._device == "gpu" - tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) - param._copy_gradient_from(tmp_var) - tmp_var.value().get_tensor()._clear() + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + if self._device == "cpu": + with device_guard(dev_id, self._device): + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + + elif self._device == "gpu": + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + self._fill = offset diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e30d3e4c20a92a..cf198eab1e8e02 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -305,7 +305,8 @@ def sample(self, shape, seed=0): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype, min=0., max=1., + seed=seed) * (tensor.zeros( output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py 
b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 125d9fa88d4aed..5d29dc522b3ef6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -484,7 +484,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): model_filename=model_filename, params_filename=params_filename)) - self._gather_scales(infer_program, scope) + self._gather_scales(infer_program, scope, fetch_targets) self._set_skip_quant_attr(infer_program) @@ -520,10 +520,10 @@ def _is_target_layer(self, layer): return flag - def _gather_scales(self, program, scope): + def _gather_scales(self, program, scope, fetch_targets): """ Get all scales from fake ops, save them into the corresponding ops - and delete all moving_average_abs_max_scale ops. + and delete all moving_average_abs_max_scale ops. """ def _gather_input_scale(): @@ -580,6 +580,11 @@ def _gather_output_scale(): for next_op in next_ops: next_op._rename_input(out_var_name, in_var_name) + # If next_op is `fetch` and out_var_name in fetch_targets, + # fetch_targets must update to in_var_name when rename input. + for i in range(len(fetch_targets)): + if fetch_targets[i].name == out_var_name: + fetch_targets[i] = block.var(in_var_name) _gather_input_scale() _gather_output_scale() diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 3e033f70aca38a..1ddb9c8e5fa9f4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -410,6 +410,23 @@ def quantize(self): for op_type in self._dynamic_quantize_op_type): self._collect_dynamic_quantize_op_threshold( self._dynamic_quantize_op_type) + + # Move sub blocks persistable var to global block + global_block = self._program.global_block() + for _op in global_block.ops: + if _op.type == "while": + _block_id = _op.attr("sub_block").id + _block = self._program.block(_block_id) + persistables = [] + for _name, _var in _block.vars.items(): + if _var.persistable: + global_block._clone_variable(_var) + persistables.append(_name) + for _name in persistables: + _block._remove_var(_name) + persistables.extend(_op.input('X')) + _op.desc.set_input("X", persistables) + return self._program def save_quantized_model(self, @@ -451,10 +468,6 @@ def _load_model_data(self): model_filename=self._model_filename, params_filename=self._params_filename) - if self._program.num_blocks > 1: - _logger.error("The post training quantization requires that the " - "program only has one block.") - if self._optimize_model: self._optimize_fp32_model() @@ -505,23 +518,26 @@ def collect_var_name(var_name_list, persistable_var_names, op_type): self._quantized_act_var_name.add(var_name) persistable_var_names = _all_persistable_var_names(self._program) - for op in self._program.global_block().ops: - op_type = op.type - if self._is_full_quantize and \ - op_type not in self._quantizable_op_type: - _logger.warning(op_type + " is not supported for quantization.") - # For quantized ops, sample inputs and outputs - if op_type in self._quantizable_op_type: - collect_var_name( - _get_op_input_var_names(op), persistable_var_names, op_type) - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) - # For other op, only sample output scale - elif op_type in self._out_scale_op_list: - 
collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + op_type = op.type + if self._is_full_quantize and \ + op_type not in self._quantizable_op_type: + _logger.warning(op_type + + " is not supported for quantization.") + # For quantized ops, sample inputs and outputs + if op_type in self._quantizable_op_type: + collect_var_name( + _get_op_input_var_names(op), persistable_var_names, + op_type) + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) + # For other op, only sample output scale + elif op_type in self._out_scale_op_list: + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) def _set_activation_persistable(self): ''' @@ -696,16 +712,17 @@ def _save_input_threhold(self): ''' assert self._algo == "min_max", \ "The algo should be min_max to save input threshold." - for op in self._program.global_block().ops: - if op.type in self._quantizable_op_type: - for var_name in _get_op_input_var_names(op): - assert var_name in self._quantized_var_min - assert var_name in self._quantized_var_max - op._set_attr(var_name + ".min", - self._quantized_var_min[var_name]) - op._set_attr(var_name + ".max", - self._quantized_var_max[var_name]) - op._set_attr("with_quant_attr", True) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in self._quantizable_op_type: + for var_name in _get_op_input_var_names(op): + assert var_name in self._quantized_var_min + assert var_name in self._quantized_var_max + op._set_attr(var_name + ".min", + self._quantized_var_min[var_name]) + op._set_attr(var_name + ".max", + self._quantized_var_max[var_name]) + op._set_attr("with_quant_attr", True) def _collect_activation_abs_min_max(self): ''' @@ -795,7 +812,12 @@ def _update_program(self): activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - transform_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + # Insert fake_quant/fake_dequantize op must in test graph, so + # set per graph's _for_test is True. 
+ sub_graph._for_test = True + transform_pass.apply(sub_graph) # use AddQuantDequantPass to insert fake_quant_dequant op minor_quantizable_op_types = [] @@ -806,7 +828,10 @@ def _update_program(self): scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types) - add_quant_dequant_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + add_quant_dequant_pass.apply(sub_graph) # save threshold to scale var node if self._algo in ["KL", "hist"]: @@ -836,7 +861,11 @@ def _update_program(self): activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - freeze_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + freeze_pass.apply(sub_graph) + self._program = graph.to_program() def _save_output_threshold(self): @@ -888,13 +917,15 @@ def analysis_and_save_info(op_node, out_var_name): save_info(op_node, out_var_name, self._quantized_var_max, "out_max", "post_min_max") - for op in self._program.global_block().ops: - if op.type in (self._quantizable_op_type + self._out_scale_op_list): - out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type - for var_name in out_var_names: - analysis_and_save_info(op, var_name) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in ( + self._quantizable_op_type + self._out_scale_op_list): + out_var_names = _get_op_output_var_names(op) + assert len(out_var_names) == 1, "Post training " + \ + "quantization only support one output for " + op.type + for var_name in out_var_names: + analysis_and_save_info(op, var_name) def _collect_dynamic_quantize_op_threshold(self, target_ops_type): """ diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 94d7a2ed153488..494ea969797197 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -139,6 +139,7 @@ endfunction() if(WIN32) list(REMOVE_ITEM TEST_OPS test_light_nas) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist) + list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model) @@ -336,6 +337,7 @@ if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) + set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py new file mode 100644 index 00000000000000..3c3dfd08fccfa3 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -0,0 +1,313 @@ +# copyright (c) 2021 
paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. +import unittest +import os +import time +import sys +import random +import math +import functools +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.dataset.common import download +from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization + +paddle.enable_static() + +random.seed(0) +np.random.seed(0) + + +class TestPostTrainingQuantization(unittest.TestCase): + def setUp(self): + self.download_path = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.download_path) + self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + self.int8_model_path = os.path.join(os.getcwd(), + "post_training_" + self.timestamp) + try: + os.system("mkdir -p " + self.int8_model_path) + except Exception as e: + print("Failed to create {} due to {}".format(self.int8_model_path, + str(e))) + sys.exit(-1) + + def tearDown(self): + try: + os.system("rm -rf {}".format(self.int8_model_path)) + except Exception as e: + print("Failed to delete {} due to {}".format(self.int8_model_path, + str(e))) + + def cache_unzipping(self, target_folder, zip_path): + cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder) + os.system(cmd) + + def download_model(self, data_url, data_md5, folder_name): + download(data_url, self.download_path, data_md5) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + print('Data is downloaded at {0}'.format(zip_path)) + + data_cache_folder = os.path.join(self.cache_folder, folder_name) + self.cache_unzipping(self.cache_folder, zip_path) + return data_cache_folder + + def run_program(self, model_path, batch_size, infer_iterations): + print("test model path:" + model_path) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + [infer_program, feed_dict, fetch_targets] = \ + fluid.io.load_inference_model(model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', executor=exe) + val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size) + + img_shape = [1, 28, 28] + test_info = [] + cnt = 0 + periods = [] + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(img_shape) for x in data]).astype("float32") + input_label = np.array([x[1] for x in data]).astype("int64") + + t1 = time.time() + out = exe.run(infer_program, + feed={feed_dict[0]: image}, + fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + + out_label = np.argmax(np.array(out[0]), axis=1) + top1_num = sum(input_label == out_label) + test_info.append(top1_num) + cnt += len(data) + + if (batch_id + 1) == infer_iterations: + break + + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) + + def generate_quantized_model(self, + model_path, + algo="KL", + quantizable_op_type=["conv2d"], + is_full_quantize=False, + 
is_use_cache_file=False, + is_optimize_model=False, + batch_size=10, + batch_nums=10): + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.global_scope() + val_reader = paddle.dataset.mnist.train() + + ptq = PostTrainingQuantization( + executor=exe, + model_dir=model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', + sample_generator=val_reader, + batch_size=batch_size, + batch_nums=batch_nums, + algo=algo, + quantizable_op_type=quantizable_op_type, + is_full_quantize=is_full_quantize, + optimize_model=is_optimize_model, + is_use_cache_file=is_use_cache_file) + ptq.quantize() + ptq.save_quantized_model( + self.int8_model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams') + + def run_test(self, + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size=10, + infer_iterations=10, + quant_iterations=5): + + origin_model_path = self.download_model(data_url, data_md5, model_name) + #origin_model_path = os.path.join(origin_model_path, model_name) + + print("Start FP32 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + origin_model_path, batch_size, infer_iterations) + + print("Start INT8 post training quantization for {0} on {1} images ...". + format(model_name, quant_iterations * batch_size)) + self.generate_quantized_model( + origin_model_path, algo, quantizable_op_type, is_full_quantize, + is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + + print("Start INT8 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (int8_throughput, int8_latency, int8_acc1) = self.run_program( + self.int8_model_path, batch_size, infer_iterations) + + print("---Post training quantization of {} method---".format(algo)) + print( + "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.". + format(model_name, batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n". 
+ format(model_name, batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() + + delta_value = fp32_acc1 - int8_acc1 + self.assertLess(delta_value, diff_threshold) + + +class TestPostTrainingKLForWhile(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "KL" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTraininghistForWhile(TestPostTrainingQuantization): + def test_post_training_hist(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "hist" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingmseForWhile(TestPostTrainingQuantization): + def test_post_training_mse(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "mse" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingavgForWhile(TestPostTrainingQuantization): + def test_post_training_avg(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "avg" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization): + def test_post_training_min_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "min_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + 
quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization): + def test_post_training_abs_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "abs_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index d8cb3e0918dc8d..10a9358612960c 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -273,6 +273,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( @@ -718,6 +720,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e601c1cb4c3005..d10564e21ea47b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1999,6 +1999,14 @@ def _run_using_fleet_executor(self, fetch_list=fetch_list, feed_var_name=feed_var_name, fetch_var_name=fetch_var_name) + main_block = cached_program.block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) self._add_program_cache(cache_key, cached_program) if cached_ctx is None: fleet_opt = program._pipeline_opt["fleet_opt"] @@ -2007,6 +2015,18 @@ def _run_using_fleet_executor(self, self._add_ctx_cache(cache_key, cached_ctx) if feed: self._feed_data(cached_program, feed, feed_var_name, cached_scope) + + from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): + lr_sheduler = program.lr_sheduler + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" + lr_value = lr_sheduler() + lr_var = program.global_block().vars[lr_sheduler._var_name] + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + tensor = core.get_variable_tensor(cached_scope, + lr_sheduler._var_name) + tensor.set(data, self.place) + cached_ctx.run() if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dfc887292e7cff..83ccd1051bb669 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1254,7 +1254,10 @@ def __iter__(self): def __next__(self): try: if self._return_list: - return self._reader.read_next_list() + data = 
self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() + return data else: return self._reader.read_next() except StopIteration: diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 05008a3bc12f7e..2b4002ab9c9d49 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -30,6 +30,7 @@ seed = 2021 epoch = 2 batch_size = 32 +linear_size = 10000 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -45,12 +46,12 @@ class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): + def __init__(self, linear_size=10000, param_attr=None, bias_attr=None): super(MLP, self).__init__() - self._linear1 = Linear(10000, 10000) - self._linear2 = Linear(10000, 10000) - self._linear3 = Linear(10000, 10) + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) def forward(self, inputs): y = self._linear1(inputs) @@ -59,10 +60,10 @@ def forward(self, inputs): return y -def reader_decorator(): +def reader_decorator(linear_size=10000): def __reader__(): for _ in range(100): - img = np.random.rand(10000).astype('float32') + img = np.random.rand(linear_size).astype('float32') label = np.ones(1).astype('int64') yield img, label @@ -120,6 +121,9 @@ def train_mlp(model, use_multiprocess=True) train_loader.set_sample_list_generator(train_reader) + if sharding_stage == 2: + model.to(device="gpu") + for eop in range(epoch): model.train() @@ -153,9 +157,6 @@ def train_mlp(model, if all_test and batch_id == 2: return model.parameters() - if sharding_stage == 2: - model.to(device="gpu") - return model.parameters() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py new file mode 100644 index 00000000000000..8adcda9d24e1ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -0,0 +1,115 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
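Note: the offload test that follows exercises `ShardingScaler`, which customizes an existing `GradScaler` instance by rebinding its `_unscale` method at runtime rather than subclassing. A minimal sketch of that patching idiom (hypothetical class and function names):

from types import MethodType


class Scaler:
    def _unscale(self, optimizer):
        print("default unscale")


def sharding_unscale(self, optimizer):
    # A real implementation would also all_reduce the found-inf flag across
    # the sharding group, as ShardingScaler's unscale_method does above.
    print("unscale with cross-rank inf/nan check")


scaler = Scaler()
# Rebind _unscale on this one instance; other Scaler objects keep the default.
scaler._unscale = MethodType(sharding_unscale, scaler)
scaler._unscale(optimizer=None)  # -> unscale with cross-rank inf/nan check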
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 8000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler, group) + + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + model.grad_scale() + scaler.step(optimizer) + scaler.update() + model.clear_gradients() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + for j in range(len(mlp_offload_params)): + if mlp_params[i].name == mlp_offload_params[j].name: + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 33754fac127a44..c05ad30da2797f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -322,14 +322,14 @@ def assert_op_list(self, op_list_after_fusion): "Expected operator list after fusion is {}, but now it's {}".format( op_list_after_fusion, after_op_list), ) - def run_and_statis( - self, - quant=False, - max_examples=100, - reproduce=None, - min_success_num=25, - max_duration=180, - passes=None, ): + def run_and_statis(self, + quant=False, + 
max_examples=100, + reproduce=None, + min_success_num=25, + max_duration=180, + passes=None, + use_gpu_run_baseline=False): if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev": max_examples *= 10 min_success_num *= 10 @@ -354,7 +354,10 @@ def program_generator(draw): return self.sample_program_config(draw) def run_test(prog_config): - return self.run_test(quant=quant, prog_configs=[prog_config]) + return self.run_test( + quant=quant, + prog_configs=[prog_config], + use_gpu_run_baseline=use_gpu_run_baseline) generator = st.composite(program_generator) loop_func = given(generator())(run_test) @@ -371,8 +374,8 @@ def run_test(prog_config): logging.info("Number of Ran Programs: {}".format(self.num_ran_programs)) logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests)) successful_ran_programs = int(self.num_ran_programs - - self.num_ignore_tests / - self.num_predictor_kinds) + self.num_ignore_tests / max( + self.num_predictor_kinds, 1)) logging.info( "Number of successfully ran programs approximately equal to {}". format(successful_ran_programs)) @@ -391,7 +394,10 @@ def run_test(prog_config): format(max_duration)) assert False - def run_test(self, quant=False, prog_configs=None): + def run_test(self, + quant=False, + prog_configs=None, + use_gpu_run_baseline=False): status = True for prog_config in prog_configs: @@ -413,7 +419,9 @@ def run_test(self, quant=False, prog_configs=None): results: List[Dict[str, np.ndarray]] = [] # baseline: cpu no ir_optim run - base_config = self.create_inference_config(ir_optim=False) + + base_config = self.create_inference_config( + ir_optim=False, use_gpu=use_gpu_run_baseline) logging.info('RUN program_config: ' + str(prog_config)) results.append( self.run_test_config(model, params, prog_config, base_config, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py index 8cb6af1dcf0441..96c2a175208faa 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -109,7 +109,7 @@ def teller2(program_config, predictor_config): def test(self): self.run_and_statis( quant=False, - max_examples=100, + max_examples=300, passes=["adaptive_pool2d_convert_global_pass"], min_success_num=40) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py new file mode 100644 index 00000000000000..2ccb9de5d5470f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
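Two of the `auto_scan_test.py` hunks above deserve a gloss: `use_gpu_run_baseline` threads through `run_and_statis` and `run_test` into `create_inference_config`, so a test whose fused op runs on GPU (the new fc_elementwise_layernorm test below passes `use_gpu_run_baseline=True`) compares against a GPU baseline instead of the default CPU one; and `max(self.num_predictor_kinds, 1)` stops the statistics line from dividing by zero before any predictor config has been sampled. A minimal sketch of the guarded statistic, with hypothetical counter values rather than the real harness state:

    def successful_ran_programs(num_ran_programs, num_ignore_tests,
                                num_predictor_kinds):
        # Ignored runs are tallied once per predictor kind, so divide them
        # back out; clamping the divisor to 1 avoids ZeroDivisionError while
        # num_predictor_kinds is still 0.
        return int(num_ran_programs -
                   num_ignore_tests / max(num_predictor_kinds, 1))

    assert successful_ran_programs(10, 4, 2) == 8
    assert successful_ran_programs(10, 0, 0) == 10  # the old expression raised here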
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestFCElementwiseLayerNormFusePass(PassAutoScanTest): + """ + x_var w(persistable) bias_var(persistable) + \ | / + fc + | + fc_out_var bias_var(persistable) + \ / + elementwise_add bias_var(persistable) scale_var(persistable) + \ | / + layer_norm + / | \ + Y mean_var variance_var + """ + + def sample_predictor_configs(self, program_config): + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5) + + def sample_program_config(self, draw): + # 1. Generate shape of input:X of fc + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + x_shape = [2, 1] + x_rank = len(x_shape) + # 2. Generate attr:in_num_col_dims of fc + in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1)) + # 3. Generate legal shape of input:W/bias of fc + w_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + w_shape[0] = int(np.prod(x_shape[in_num_col_dims:])) + w_shape = [1, 2] + fc_bias_shape = [w_shape[1], ] + if draw(st.booleans()): + fc_bias_shape.insert(0, 1) + fc_bias_shape = [2, ] + fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:] + # 4. Generate legal attr:axis/shape of elementwise_add + add_bias_shape = fc_out_shape[:] + axis = draw(st.integers(min_value=-1, max_value=0)) + # 5. 
Generate legal shape of layer_norm + begin_norm_axis = draw( + st.integers( + min_value=1, max_value=len(fc_out_shape) - 1)) + layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))] + epsilon = 1e-5 + + fc_op = OpConfig( + "fc", + inputs={"Input": ["fc_x"], + "W": ["fc_w"], + "Bias": ["fc_bias"]}, + outputs={"Out": ["fc_out"]}, + in_num_col_dims=in_num_col_dims, + padding_weights=False, + activation_type="", + use_quantizer=False, + use_mkldnn=False, ) + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["fc_out"], + "Y": ["add_bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + layer_norm_op = OpConfig( + "layer_norm", + inputs={ + "X": ["add_out"], + "Scale": ["scale"], + "Bias": ["layer_norm_bias"] + }, + outputs={ + "Y": ["layer_norm_out"], + "Mean": ["layer_norm_mean"], + "Variance": ["layer_norm_var"] + }, + begin_norm_axis=begin_norm_axis, + epsilon=epsilon) + + ops = [fc_op, add_op, layer_norm_op] + program_config = ProgramConfig( + ops=ops, + weights={ + "fc_w": TensorConfig(shape=w_shape), + "fc_bias": TensorConfig(shape=fc_bias_shape), + "add_bias": TensorConfig(shape=add_bias_shape), + "scale": TensorConfig(shape=layer_norm_shape), + "layer_norm_bias": TensorConfig(shape=layer_norm_shape), + }, + inputs={"fc_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Y"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["fc_elementwise_layernorm_fuse_pass"], + use_gpu_run_baseline=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 83d4b7091cb327..64c3042b63cf8e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,72 +12,147 @@ # See the License for the specific language governing permissions and # limitations under the License. +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set import unittest -import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import PassVersionChecker +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st -class TransposeFlattenConcatFusePassTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1) - flatt2 = fluid.layers.flatten(trans2) - concat_out = fluid.layers.concat([flatt1, flatt2]) - # There is no parameters for above structure. 
- # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] +class TestTransposeFlattenConcatFusePass(PassAutoScanTest): + """ + x_1_var x_2_var + | | + transpose2 transpose2 + | | + flatten2 flatten2 + \ / + flatten2_out_var flatten2_out_var + \ / + concat + """ - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + def sample_predictor_configs(self, program_config): + # TRT + # after tensorrt_subgraph_pass ,The pass needs to be deleted on TRT - PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5) + def is_program_valid(self, prog_config): + concat_axis = prog_config.ops[-1].attrs["axis"] + ops_num = len(prog_config.ops) - 1 + if ops_num % 2 != 0: + return False + input_num = ops_num // 2 + flatten_shape = 0 + x_trans_axis = prog_config.ops[0].attrs["axis"] + x_flatten_axis = prog_config.ops[1].attrs["axis"] + for i in range(input_num): + input_name = "transpose2_x" + str(i) + input_shape = prog_config.inputs[input_name].shape + trans_axis = prog_config.ops[i * 2].attrs["axis"] + if x_trans_axis != trans_axis: + return False + # calculate shape after transpose + input_shape = [input_shape[j] for j in trans_axis] + # calculate shape after flateen + flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"] + if x_flatten_axis != flatten_axis: + return False + flatten_shape1 = flatten_shape2 = 1 + for j in range(len(input_shape)): + if j < flatten_axis: + flatten_shape1 *= input_shape[j] + else: + flatten_shape2 *= input_shape[j] + if concat_axis == 0: + if i == 0: + flatten_shape = flatten_shape2 + elif flatten_shape != flatten_shape2: + return False + else: + if i == 0: + flatten_shape = flatten_shape1 + elif flatten_shape != flatten_shape1: + return False + return True -class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1, axis=2) - flatt2 = fluid.layers.flatten(trans2, axis=2) - concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. 
- out = fluid.layers.batch_norm(concat_out, is_test=True) + def sample_program_config(self, draw): + times = draw(st.integers(min_value=1, max_value=6)) + concat_axis = draw(st.integers(min_value=0, max_value=1)) + ops = [] + concat_input = [] + inputs = {} + x_shape_rank = draw(st.integers(min_value=2, max_value=5)) + # Generate axis of transpose + trans_axis = [j for j in range(x_shape_rank)] + for j in range(x_shape_rank - 1): + if draw(st.booleans()): + trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j] + # Generate axis of flatten + flatten_axis = draw( + st.integers( + min_value=0, max_value=x_shape_rank - 1)) + for i in range(times): + # Generate x_shape of transpose + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), + min_size=x_shape_rank, + max_size=x_shape_rank)) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] + str_i = str(i) + transpose_op = OpConfig( + "transpose2", + inputs={"X": ["transpose2_x" + str_i], }, + axis=trans_axis, + outputs={ + "Out": ["trans_out" + str_i], + "XShape": ["trans_shape" + str_i] + }, ) + ops.append(transpose_op) + flatten_op = OpConfig( + "flatten2", + inputs={"X": ["trans_out" + str_i], }, + axis=flatten_axis, + outputs={ + "Out": ["flatten2_out" + str_i], + "XShape": ["xshape" + str_i] + }, ) + concat_input.append("flatten2_out" + str_i) + ops.append(flatten_op) + inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape) - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + concat_op = OpConfig( + "concat", + inputs={ + "X": concat_input, + "AxisTensor": [], + }, + outputs={"Out": ["concat_out"]}, + axis=concat_axis, ) - self.assertTrue( - PassVersionChecker.IsCompatible( - 'transpose_flatten_concat_fuse_pass')) + ops.append(concat_op) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["transpose_flatten_concat_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index f1c12c90490c25..6cf2c5f6e2ca4b 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -336,6 +336,29 @@ def init_static_data(self, batch_size, dims): name='values', shape=[dims], dtype='float32') +class UniformTestSample(unittest.TestCase): + def setUp(self): + self.init_param() + + def init_param(self): + self.low = 3.0 + self.high = 4.0 + + def test_uniform_sample(self): + paddle.disable_static() + uniform = Uniform(low=self.low, high=self.high) + s = uniform.sample([100]) + self.assertTrue((s >= self.low).all()) + self.assertTrue((s < self.high).all()) + paddle.enable_static() + + +class UniformTestSample2(UniformTestSample): + def init_param(self): + self.low = -5.0 + self.high = 2.0 + + class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): self.loc = np.array(loc) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index c5cf8c5d5ed690..f76dcb5687c2ab 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus): def test_dygraph_sharding_optimizer_stage2(self): self.run_mnist_2gpu('dygraph_sharding_stage2.py') + def test_dygraph_sharding_optimizer_stage2_offload(self): + self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index 09f9fa6ce105df..fbc5db341e5e9b 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -47,6 +47,18 @@ def run_fleet_executor(self, place, x_data, y_data): name='y', shape=y_data.shape, dtype=y_data.dtype) z = x + y a = 2 * x + 3 * y + loss = paddle.mean(a) + base_lr = 0.1 + passes = [30, 60, 80, 90] + steps_per_pass = 10 + bd = [steps_per_pass * p for p in passes] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + lr_val = paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr) + opt = paddle.optimizer.AdamW( + learning_rate=lr_val, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + opt.minimize(loss) # TODO: section_program will be removed in the future empty_program._pipeline_opt = { "fleet_opt": self.fake_fleet_opt(), diff --git a/python/paddle/fluid/tests/unittests/test_gcd.py b/python/paddle/fluid/tests/unittests/test_gcd.py new file mode 100644 index 00000000000000..820216dc56cd60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gcd.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
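The `test_fleet_executor.py` hunk above attaches a piecewise-decay AdamW step so the fleet executor exercises an optimizer rather than only elementwise ops. `PiecewiseDecay` takes one more value than boundaries (values[i] applies up to boundaries[i]); a small check of the schedule the test builds, using the same numbers as the hunk (shown for illustration only):

    import math
    import paddle

    base_lr = 0.1
    passes = [30, 60, 80, 90]
    steps_per_pass = 10
    bd = [steps_per_pass * p for p in passes]                # [300, 600, 800, 900]
    lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]  # 5 rates for 4 boundaries

    assert all(math.isclose(v, e)
               for v, e in zip(lr, [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]))
    sched = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)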
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestGcdAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + y = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.gcd(x, y) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + y = paddle.to_tensor(self.y_np) + result = paddle.gcd(x, y) + self.assertEqual( + np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestGcdAPI2(TestGcdAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestGcdAPI3(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI4(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI5(TestGcdAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/fluid/tests/unittests/test_lcm.py b/python/paddle/fluid/tests/unittests/test_lcm.py new file mode 100644 index 00000000000000..123c3e3d444e1b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lcm.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
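The gcd tests above use NumPy as the oracle, and the `TestGcdAPI3/4/5` subclasses pin down the edge cases the docstring further below promises: one zero operand, both operands zero, and a negative operand. Spelled out in plain NumPy, matching those assertions (illustrative only):

    import numpy as np

    # gcd is computed on absolute values; gcd(0, y) = |y| and gcd(0, 0) = 0.
    assert np.gcd(0, 20) == 20
    assert np.gcd(0, 0) == 0
    assert np.gcd(12, -20) == 4
    assert (np.gcd(np.arange(6), 20) == np.array([20, 1, 2, 1, 4, 5])).all()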
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestLcmAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.lcm(x1, x2) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x1 = paddle.to_tensor(self.x_np) + x2 = paddle.to_tensor(self.y_np) + result = paddle.lcm(x1, x2) + self.assertEqual( + np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestLcmAPI2(TestLcmAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestLcmAPI3(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI4(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI5(TestLcmAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c5bc99546c878a..36dfd717a12a0f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -197,6 +197,8 @@ from .math import lerp_ # noqa: F401 from .math import rad2deg # noqa: F401 from .math import deg2rad # noqa: F401 +from .math import gcd # noqa: F401 +from .math import lcm # noqa: F401 from .math import diff # noqa: F401 from .math import angle # noqa: F401 @@ -414,6 +416,10 @@ 'triangular_solve', 'as_complex', 'as_real', + 'rad2deg', + 'deg2rad', + 'gcd', + 'lcm', 'diff', 'lerp', 'lerp_', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index fdd81fa8c0f9eb..fefaecaf604a07 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None): lerp(x, y, weight) = x + weight * (y - x). Args: - x (Tensor): An N-D Tensor, the data type is float32, float64. - y (Tensor): An N-D Tensor, the data type is float32, float64. - weight (float|Tensor): the weight for the interpolation formula. + x (Tensor): An N-D Tensor with starting points, the data type is float32, float64. + y (Tensor): An N-D Tensor with ending points, the data type is float32, float64. + weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: @@ -2788,6 +2788,139 @@ def deg2rad(x, name=None): type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale}) return out +def gcd(x, y, name=None): + """ + Computes the element-wise greatest common divisor (GCD) of input |x| and |y|. + Both x and y must have integer types. + + Note: + gcd(0,0)=0, gcd(0, y)=|y| + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.gcd(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.gcd(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20, 1 , 2 , 1 , 4 , 5]) + + x4 = paddle.to_tensor(0) + paddle.gcd(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20]) + + paddle.gcd(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.gcd(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + """ + shape = paddle.broadcast_shape(x.shape, y.shape) + x = paddle.broadcast_to(x, shape) + y = paddle.broadcast_to(y, shape) + x = paddle.abs(x) + y = paddle.abs(y) + + def _gcd_cond_fn(x, y): + return paddle.any(y != 0) + + def _gcd_body_fn(x, y): + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. + y_not_equal_0 = (y != 0) + y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype)) + x, y = (paddle.where(y_not_equal_0, y, x), + paddle.where(y_not_equal_0, paddle.mod(x, y_safe),paddle.zeros(y.shape, y.dtype))) + return (paddle.where(x < y, y, x), paddle.where(x < y, x, y)) + + if in_dygraph_mode(): + while _gcd_cond_fn(x, y): + x, y = _gcd_body_fn(x, y) + + return x + else: + check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y]) + return out + +def lcm(x, y, name=None): + """ + Computes the element-wise least common multiple (LCM) of input |x| and |y|. + Both x and y must have integer types. + + Note: + lcm(0,0)=0, lcm(0, y)=0 + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.lcm(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.lcm(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0, 20, 20, 60, 20, 20]) + + x4 = paddle.to_tensor(0) + paddle.lcm(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + paddle.lcm(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.lcm(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + """ + d = paddle.gcd(x, y) + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. + d_equal_0 = paddle.equal(d, 0) + d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d) + out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype), paddle.abs(x * y) // d_safe) + return out + def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): r""" Computes the n-th forward difference along the given axis. diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 5506ee95bd7c9e..ed3bb1dc5f1f01 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -345,6 +345,7 @@ def source_include(header_file_path): #include "glog/logging.h" #include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/kernel_registry.h" @@ -353,22 +354,6 @@ def source_include(header_file_path): """ -def module_declare(): - return """ -PT_DECLARE_MODULE(CreationCPU); -PT_DECLARE_MODULE(LinalgCPU); -PT_DECLARE_MODULE(ManipulationCPU); -PT_DECLARE_MODULE(MathCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(CreationCUDA); -PT_DECLARE_MODULE(LinalgCUDA); -PT_DECLARE_MODULE(ManipulationCUDA); -PT_DECLARE_MODULE(MathCUDA); -#endif -""" - - def api_register(): return """ PT_REGISTER_API(Creation); @@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): include_header_file = "paddle/pten/api/include/api.h" source_file.write(source_include(include_header_file)) - source_file.write(module_declare()) source_file.write(namespace[0]) for api in apis: diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 79a742c314bd07..46b415e24d8ca2 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -202,7 +202,7 @@ 'test_fleet_runtime', 'test_rnn_cudnn_params_packing', 'test_mkldnn_placement_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'program_desc_test', 'test_simplify_with_basic_ops_pass', 'test_dygraph_mode_of_unittest', @@ -1417,7 +1417,7 @@ 'test_fc_mkldnn_op', 'test_fc_lstm_fuse_pass', 'test_fc_gru_fuse_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'test_fc_bf16_mkldnn_op', 'test_executor_feed_non_tensor', 'test_executor_check_feed',
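Closing note on the `paddle.tensor.math` additions above: `gcd` runs the Euclidean algorithm in vectorized form, masking finished lanes with `paddle.where`, so the same loop body serves both eager execution in dygraph mode and `paddle.static.nn.while_loop` in static graphs; `lcm` then reuses it via |x * y| // gcd(x, y) with the same guard against a zero divisor. A simplified NumPy re-statement of both (it omits the final ordering swap the Paddle body performs; illustrative, not the shipped implementation):

    import numpy as np

    def gcd_ref(x, y):
        # Vectorized Euclid: each step maps (x, y) -> (y, x mod y) on lanes
        # where y != 0 and freezes lanes that have already finished.
        x, y = np.broadcast_arrays(np.abs(x), np.abs(y))
        x, y = x.copy(), y.copy()
        while np.any(y != 0):
            live = y != 0
            y_safe = np.where(live, y, 1)  # dodge mod-by-zero; value unused
            x, y = np.where(live, y, x), np.where(live, x % y_safe, 0)
        return x

    def lcm_ref(x, y):
        d = gcd_ref(x, y)
        d_safe = np.where(d == 0, 1, d)    # lcm(0, 0) is defined as 0
        return np.where(d == 0, 0,
                        np.abs(np.asarray(x) * np.asarray(y)) // d_safe)

    assert (gcd_ref(np.arange(6), 20) == [20, 1, 2, 1, 4, 5]).all()
    assert (lcm_ref(np.arange(6), 20) == [0, 20, 20, 60, 20, 20]).all()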