From 06d6edeb7ceb4ce66d33004f276993d11af15a97 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Fri, 28 Jul 2023 13:34:36 +0000
Subject: [PATCH 01/22] new ir remove fetch list

---
 .../new_executor/standalone_executor.cc       | 33 ++++++++--
 .../new_executor/standalone_executor.h        |  2 +
 .../ir/phi_kernel_adaptor/phi_kernel_util.cc  | 22 +++++--
 .../ir/phi_kernel_adaptor/phi_kernel_util.h   | 66 ++++++++-----------
 4 files changed, 74 insertions(+), 49 deletions(-)
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index c80f9b36ff98ba..5d2845f9ca6f32 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -65,6 +65,19 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     if (FLAGS_enable_new_ir_in_executor) {
       VLOG(6) << "begin to translate" << std::endl;
       auto base_program = paddle::TranslateLegacyProgramToProgram(*program);
+
+      auto block = base_program->block();
+      for (auto it = block->begin(); it != block->end(); ++it) {
+        if ((*it)->name() == "pd.fetch") {
+          fetch_var_names_.push_back((*it)
+                                         ->attributes()
+                                         .at("name")
+                                         .dyn_cast<ir::StrAttribute>()
+                                         .AsString() +
+                                     "@fetch");
+        }
+      }
+
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
       interpretercores_.emplace_back(std::make_shared<InterpreterCore>(
@@ -130,11 +143,23 @@ paddle::framework::FetchList StandaloneExecutor::Run(
   }
 
   // return Fetch Tensors
-  auto* fetch_var = scope_->FindVar(interpreter::kFetchVarName);
-  if (fetch_var) {
-    return std::move(*fetch_var->GetMutable<framework::FetchList>());
+
+  if (FLAGS_enable_new_ir_in_executor) {
+    framework::FetchList fetch_res;
+
+    for (auto& var_name : fetch_var_names_) {
+      auto* var = scope_->FindVar(var_name);
+      fetch_res.push_back(var->Get<phi::DenseTensor>());
+    }
+
+    return fetch_res;
   } else {
-    return {};
+    auto* fetch_var = scope_->FindVar(interpreter::kFetchVarName);
+    if (fetch_var) {
+      return std::move(*fetch_var->GetMutable<framework::FetchList>());
+    } else {
+      return {};
+    }
   }
 }
 
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h
index 0302128d9263da..1da628fe27bb79 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.h
+++ b/paddle/fluid/framework/new_executor/standalone_executor.h
@@ -50,6 +50,8 @@ class StandaloneExecutor {
   std::vector<std::shared_ptr<InterpreterCore>> interpretercores_;
 
   Scope* scope_;
+
+  std::vector<std::string> fetch_var_names_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index 0388ee9791a35b..a04c6b46230174 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -187,13 +187,21 @@ void HandleForSpecialOp(
 
   if (op_name == "pd.fetch") {
     // fetch is a very special op, with no output
-    auto var = const_cast<paddle::framework::Scope*>(inner_scope->root())
-                   ->Var("fetch");
-    VLOG(6) << "Create var: fetch in scope " << inner_scope->root();
-    auto fetch_list = var->GetMutable<paddle::framework::FetchList>();
-    int index =
-        op->attributes().at("col").dyn_cast<ir::Int32Attribute>().data();
-    fetch_list->resize(index + 1);
+    auto fetch_src_name =
+        op->attributes().at("name").dyn_cast<ir::StrAttribute>().AsString();
+
+    auto fetch_var_name = fetch_src_name + "@fetch";
+    auto* var = const_cast<paddle::framework::Scope*>(inner_scope->root())
+                    ->Var(fetch_var_name);
+    var->GetMutable<phi::DenseTensor>();
+    auto value = op->result(0);
+
+    value_2_var_name->emplace(value, fetch_var_name);
+
+    auto id = var_name_2_id->size();
+    var_name_2_id->emplace(fetch_var_name, id);
+    variable_list->push_back(var);
+    variable_2_var_name->emplace(var, fetch_var_name);
   }
 
   if (op_name == "pd.feed") {
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
index f3021ad4765321..27b09a8b6c3b52 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h
@@ -285,47 +285,37 @@ void BuildPhiContext(ir::Operation* op,
   }
 
   // TODO(phlrain): use var type instead of op name
-  if (op->attributes().count("op_name") &&
-      (op->attributes().at("op_name").dyn_cast<ir::StrAttribute>().AsString() ==
-       "pd.fetch")) {
-    // process fetch op
-    auto fetch_var = inner_scope->FindVar("fetch");
-    auto* fetch_list = fetch_var->GetMutable<paddle::framework::FetchList>();
-    int index =
-        op->attributes().at("col").dyn_cast<ir::Int32Attribute>().data();
-    auto* out_tensor = &(PADDLE_GET(phi::DenseTensor, fetch_list->at(index)));
-    ctx->EmplaceBackOutput(out_tensor);
-  } else {
-    for (size_t i = 0; i < op->num_results(); ++i) {
-      ir::Value out_ptr = op->result(i);
-      auto name = name_map.at(out_ptr);
-      VLOG(6) << "ctx->EmplaceBackOutput: " << name;
-      auto out_type = out_ptr.type();
-      if (!out_type) {
-        phi::DenseTensor* ptr = nullptr;
-        OutType out_ptr(ptr);
-        ctx->EmplaceBackOutput(out_ptr);
-      } else if (out_type.isa<paddle::dialect::AllocatedDenseTensorType>()) {
-        ctx->EmplaceBackOutput(OutType(const_cast<phi::DenseTensor*>(
-            &(inner_scope->FindVar(name)->Get<phi::DenseTensor>()))));
-      } else if (out_type.isa<paddle::dialect::AllocatedSelectedRowsType>()) {
-        ctx->EmplaceBackOutput(OutType(const_cast<phi::SelectedRows*>(
-            &(inner_scope->FindVar(name)->Get<phi::SelectedRows>()))));
-      } else if (out_type.isa<ir::VectorType>()) {
-        OutListType outputs;
-        auto& variable_array =
-            scope->FindVar(name)->Get<paddle::framework::VariableRefArray>();
-        for (size_t i = 0; i < variable_array.size(); ++i) {
-          outputs.emplace_back(OutType(const_cast<phi::DenseTensor*>(
-              &(variable_array[i]->Get<phi::DenseTensor>()))));
-        }
-        ctx->EmplaceBackOutputs(outputs);
-      } else {
-        PADDLE_THROW(
-            phi::errors::Unimplemented("only support DenseTensor and vector "));
+
+  for (size_t i = 0; i < op->num_results(); ++i) {
+    ir::Value out_ptr = op->result(i);
+    auto name = name_map.at(out_ptr);
+    VLOG(6) << "ctx->EmplaceBackOutput: " << name;
+    auto out_type = out_ptr.type();
+    if (!out_type) {
+      phi::DenseTensor* ptr = nullptr;
+      OutType out_ptr(ptr);
+      ctx->EmplaceBackOutput(out_ptr);
+    } else if (out_type.isa<paddle::dialect::AllocatedDenseTensorType>()) {
+      ctx->EmplaceBackOutput(OutType(const_cast<phi::DenseTensor*>(
+          &(inner_scope->FindVar(name)->Get<phi::DenseTensor>()))));
+    } else if (out_type.isa<paddle::dialect::AllocatedSelectedRowsType>()) {
+      ctx->EmplaceBackOutput(OutType(const_cast<phi::SelectedRows*>(
+          &(inner_scope->FindVar(name)->Get<phi::SelectedRows>()))));
+    } else if (out_type.isa<ir::VectorType>()) {
+      OutListType outputs;
+      auto& variable_array =
+          scope->FindVar(name)->Get<paddle::framework::VariableRefArray>();
+      for (size_t i = 0; i < variable_array.size(); ++i) {
+        outputs.emplace_back(OutType(const_cast<phi::DenseTensor*>(
+            &(variable_array[i]->Get<phi::DenseTensor>()))));
       }
+      ctx->EmplaceBackOutputs(outputs);
+    } else {
+      PADDLE_THROW(
+          phi::errors::Unimplemented("only support DenseTensor and vector "));
     }
   }
+
   VLOG(6) << "Done build phi context";
 }
 

From 774196e0e89721aecf41c6b7e4ceec6c2cc82597 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 29 Jul 2023 13:50:25 +0000
Subject: [PATCH 02/22] fix pattern rewrite bug

---
 .../framework/new_executor/interpretercore.cc | 12 +++--
 .../framework/new_executor/interpretercore.h  |  3 ++
 .../new_executor/new_ir_interpreter.cc        | 50 ++++++++++++-------
 .../new_executor/new_ir_interpreter.h         |  3 ++
 .../new_executor/standalone_executor.cc       | 30 ++++++++---
 .../ir/transforms/constant_folding_pass.cc    | 22 +++++++-
 6 files changed, 89 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 837814b5f9dee6..04e1457f33dcbf 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -47,13 +47,15 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
       place, block, scope, execution_config);
 }
 
-InterpreterCore::InterpreterCore(const platform::Place& place,
-                                 std::unique_ptr<::ir::Program> ir_prog,
-                                 framework::Scope* scope,
-                                 const ExecutionConfig& execution_config) {
+InterpreterCore::InterpreterCore(
+    const platform::Place& place,
+    const std::vector<std::string>& fetch_var_names,
+    std::unique_ptr<::ir::Program> ir_prog,
+    framework::Scope* scope,
+    const ExecutionConfig& execution_config) {
   VLOG(4) << "InterpreterCore(): " << this << " on " << place;
   impl_ = std::make_unique<NewIRInterpreter>(
-      place, std::move(ir_prog), scope, execution_config);
+      place, fetch_var_names, std::move(ir_prog), scope, execution_config);
 }
 
 InterpreterCore::~InterpreterCore() {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index b9c633272e677a..66f998bb557f6e 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -37,6 +37,7 @@ class InterpreterCore {
                   const ExecutionConfig& execution_config = ExecutionConfig());
   // This constructor is for New IR.
   InterpreterCore(const platform::Place& place,
+                  const std::vector<std::string>& fetch_var_names,
                   std::unique_ptr<::ir::Program> ir_prog,
                   Scope* scope,
                   const ExecutionConfig& execution_config = ExecutionConfig());
@@ -80,6 +81,8 @@ class InterpreterCore {
   DISABLE_COPY_AND_ASSIGN(InterpreterCore);
 
   std::unique_ptr<InterpreterBaseImpl> impl_;
+
+  std::vector<std::string> fetch_var_names_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 3cdc815a562ae4..da7a959c4a8aaf 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -42,16 +42,19 @@
 namespace paddle {
 namespace framework {
 
-NewIRInterpreter::NewIRInterpreter(const platform::Place& place,
-                                   std::unique_ptr<::ir::Program> ir_prog,
-                                   framework::Scope* scope,
-                                   const ExecutionConfig& execution_config)
+NewIRInterpreter::NewIRInterpreter(
+    const platform::Place& place,
+    const std::vector<std::string>& fetch_var_names,
+    std::unique_ptr<::ir::Program> ir_prog,
+    framework::Scope* scope,
+    const ExecutionConfig& execution_config)
     : place_(place),
       stream_analyzer_(place),
       execution_config_(execution_config),
       var_scope_(scope),
       scope_(scope),
-      ir_program_(std::move(ir_prog)) {
+      ir_program_(std::move(ir_prog)),
+      fetch_var_names_(fetch_var_names) {
   VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_;
   static_build_ = FLAGS_new_executor_static_build &&
                   !FLAGS_new_executor_use_cuda_graph &&
@@ -228,20 +231,33 @@ FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,
 
   // return Fetch Tensors
   Scope* inner_scope = InnerScope();
-  auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
-  if (fetch_var && need_fetch) {
-    auto fetch_list = std::move(*fetch_var->GetMutable<framework::FetchList>());
-#ifdef PADDLE_WITH_CUDA
-    if (platform::IsCUDAGraphCapturing()) {
-      PADDLE_ENFORCE_EQ(fetch_list.empty(),
-                        true,
-                        platform::errors::InvalidArgument(
-                            "Cannot fetch data when using CUDA Graph."));
+  if (FLAGS_enable_new_ir_in_executor) {
+    framework::FetchList fetch_res;
+
+    if (need_fetch) {
+      for (auto& var_name : fetch_var_names_) {
+        auto* var = inner_scope->FindVar(var_name);
+        fetch_res.push_back(var->Get<phi::DenseTensor>());
+      }
     }
-#endif
-    return fetch_list;
+    return fetch_res;
   } else {
-    return {};
+    auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
+    if (fetch_var && need_fetch) {
+      auto fetch_list =
+          std::move(*fetch_var->GetMutable<framework::FetchList>());
+#ifdef PADDLE_WITH_CUDA
+      if (platform::IsCUDAGraphCapturing()) {
+        PADDLE_ENFORCE_EQ(fetch_list.empty(),
+                          true,
+                          platform::errors::InvalidArgument(
+                              "Cannot fetch data when using CUDA Graph."));
+      }
+#endif
+      return fetch_list;
+    } else {
+      return {};
+    }
   }
 }
 
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index 14c8d1778c288e..744a130a1aa048 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -34,6 +34,7 @@ class NewIRInterpreter : public InterpreterBaseImpl {
 
  public:
   NewIRInterpreter(const platform::Place& place,
+                   const std::vector<std::string>& fetch_var_names,
                    std::unique_ptr<::ir::Program> ir_prog,
                    Scope* scope,
                    const ExecutionConfig& execution_config = ExecutionConfig());
@@ -217,6 +218,8 @@ class NewIRInterpreter : public InterpreterBaseImpl {
   std::vector<Variable*> variable_list_;
 
   interpreter::IrDependencyBuilder ir_dependency_builder_;
+
+  std::vector<std::string> fetch_var_names_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 5d2845f9ca6f32..0e6292f0b1bf4e 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -69,19 +69,33 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
       auto block = base_program->block();
       for (auto it = block->begin(); it != block->end(); ++it) {
         if ((*it)->name() == "pd.fetch") {
-          fetch_var_names_.push_back((*it)
-                                         ->attributes()
-                                         .at("name")
-                                         .dyn_cast<ir::StrAttribute>()
-                                         .AsString() +
-                                     "@fetch");
+          size_t index = (*it)
+                             ->attributes()
+                             .at("col")
+                             .dyn_cast<ir::Int32Attribute>()
+                             .data();
+
+          if (fetch_var_names_.size() < index + 1) {
+            fetch_var_names_.resize(index + 1);
+          }
+
+          fetch_var_names_[index] = (*it)
+                                        ->attributes()
+                                        .at("name")
+                                        .dyn_cast<ir::StrAttribute>()
+                                        .AsString() +
+                                    "@fetch";
         }
       }
 
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
-      interpretercores_.emplace_back(std::make_shared<InterpreterCore>(
-          place_, std::move(kernel_program), scope_, execution_config));
+      interpretercores_.emplace_back(
+          std::make_shared<InterpreterCore>(place_,
+                                            fetch_var_names_,
+                                            std::move(kernel_program),
+                                            scope_,
+                                            execution_config));
     } else {
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc
index 3fcdee6748b206..5f107af71e519a 100644
--- a/paddle/fluid/ir/transforms/constant_folding_pass.cc
+++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc
@@ -71,15 +71,35 @@ class ConstantFoldingPattern : public ir::RewritePattern {
     ir::Program* program = op->GetParentProgram();
     auto temp_program = BuildProgramFromOperation(op);
 
+    std::vector<std::string> fetch_var_names;
+    auto block = temp_program->block();
+    for (auto it = block->begin(); it != block->end(); ++it) {
+      if ((*it)->name() == "pd.fetch") {
+        size_t index =
+            (*it)->attributes().at("col").dyn_cast<ir::Int32Attribute>().data();
+
+        if (fetch_var_names.size() < index + 1) {
+          fetch_var_names.resize(index + 1);
+        }
+
+        fetch_var_names[index] = (*it)
+                                     ->attributes()
+                                     .at("name")
+                                     .dyn_cast<ir::StrAttribute>()
+                                     .AsString() +
+                                 "@fetch";
+      }
+    }
+
     // Execute program
     paddle::framework::interpreter::ExecutionConfig exe_config;
     exe_config.create_local_scope = false;
     paddle::framework::InterpreterCore core(
         phi::CPUPlace{},
+        fetch_var_names,
         paddle::dialect::PdOpLowerToKernelPass(temp_program.get()),
         &scope_,
         exe_config);
-
     paddle::framework::FetchList fetch_list = core.Run({});
 
     // TODO(liuyuanle): Support multiple output.

From 7e60294e45d5f1f90701d180b61e33949bfd2ae4 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sun, 30 Jul 2023 13:36:35 +0000
Subject: [PATCH 03/22] try to remove constant fold

---
 test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
index ebb6144753e2aa..da7b46acf8c2ab 100644
--- a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
+++ b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
@@ -1087,7 +1087,7 @@ TEST(pattern_rewrite, Patterns) {
 
   ir::PassManager pm(ctx);
   pm.AddPass(std::make_unique<TestPass>());
-  pm.AddPass(ir::CreateConstantFoldingPass());
+  // pm.AddPass(ir::CreateConstantFoldingPass());
   pm.AddPass(ir::CreateDeadCodeEliminationPass());
   pm.EnablePassTiming();
   pm.EnableIRPrinting();

From 4b38badb7bc9f777c622d99e0e561a9ce8ff5519 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Mon, 31 Jul 2023 09:43:51 +0000
Subject: [PATCH 04/22] revert code

---
 test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
index da7b46acf8c2ab..ebb6144753e2aa 100644
--- a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
+++ b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc
@@ -1087,7 +1087,7 @@ TEST(pattern_rewrite, Patterns) {
 
   ir::PassManager pm(ctx);
   pm.AddPass(std::make_unique<TestPass>());
-  // pm.AddPass(ir::CreateConstantFoldingPass());
+  pm.AddPass(ir::CreateConstantFoldingPass());
   pm.AddPass(ir::CreateDeadCodeEliminationPass());
   pm.EnablePassTiming();
   pm.EnableIRPrinting();

From 354b1f9adec2240c57e5fb887581fe16e0c47c59 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Mon, 31 Jul 2023 14:34:10 +0000
Subject: [PATCH 05/22] add pattern rewrite test flag

---
 test/cpp/ir/pattern_rewrite/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/cpp/ir/pattern_rewrite/CMakeLists.txt b/test/cpp/ir/pattern_rewrite/CMakeLists.txt
index fd527db555003e..2023cc0cf413f3 100644
--- a/test/cpp/ir/pattern_rewrite/CMakeLists.txt
+++ b/test/cpp/ir/pattern_rewrite/CMakeLists.txt
@@ -7,3 +7,7 @@ endif()
 
 cc_test_old(pattern_rewrite_test SRCS pattern_rewrite_test.cc DEPS
             ${PATTERN_REWRITE_TEST_DEPS})
+
+set_tests_properties(
+  pattern_rewrite_test PROPERTIES ENVIRONMENT
+                                  "FLAGS_enable_new_ir_in_executor=true")

From edf3ce286ba9e9f0ca036ec7a5dd76c3265862b9 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 1 Aug 2023 04:04:28 +0000
Subject: [PATCH 06/22] fix multi fetch

---
 .../framework/new_executor/standalone_executor.cc    | 12 ++++++++++++
 .../fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc   |  3 ++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 0e6292f0b1bf4e..a89db7b22e18de 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -90,12 +90,24 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
 
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
+
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
                                             fetch_var_names_,
                                             std::move(kernel_program),
                                             scope_,
                                             execution_config));
+
+      // NOTE(phlrain): why we add a prefix here: in the eager op test,
+      // different test cases share a scope (not the same standalone executor),
+      // so we add a prefix to avoid fetching the same variable across cases.
+      std::stringstream pre_ss;
+      pre_ss << interpretercores_.back()->Impl();
+
+      for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+        fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i];
+      }
+
     } else {
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index d5de1abcf7a2f6..81464874e2a932 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -190,7 +190,8 @@ void HandleForSpecialOp(
     auto fetch_src_name =
         op->attributes().at("name").dyn_cast<ir::StrAttribute>().AsString();
 
-    auto fetch_var_name = fetch_src_name + "@fetch";
+    auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + "@fetch";
+    std::cerr << "fetch var name " << fetch_var_name << std::endl;
     auto* var = const_cast<paddle::framework::Scope*>(inner_scope->root())
                     ->Var(fetch_var_name);
     var->GetMutable<phi::DenseTensor>();

From c7206c1ee6dfe3ff25d0d9734401df6fb649021b Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 1 Aug 2023 05:36:47 +0000
Subject: [PATCH 07/22] remove useless code

---
 paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index 81464874e2a932..3704f33632f9ff 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -191,7 +191,6 @@ void HandleForSpecialOp(
         op->attributes().at("name").dyn_cast<ir::StrAttribute>().AsString();
 
     auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + "@fetch";
-    std::cerr << "fetch var name " << fetch_var_name << std::endl;
     auto* var = const_cast<paddle::framework::Scope*>(inner_scope->root())
                     ->Var(fetch_var_name);
     var->GetMutable<phi::DenseTensor>();

From b80e3d4393fb15e9607e33599b99e6b6c08cf0f1 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 1 Aug 2023 06:33:41 +0000
Subject: [PATCH 08/22] new ir support legacy kernel instruction

---
 .../new_executor/instruction/CMakeLists.txt   |   1 +
 .../instruction/instruction_base.cc           | 184 +++++++++++++++++
 .../instruction/instruction_base.h            |  27 +++
 .../instruction/legacy_kernel_instruction.cc  | 184 +++++++++++++++++
 .../instruction/legacy_kernel_instruction.h   |  71 +++++++
 .../instruction/phi_kernel_instruction.cc     | 186 +-----------------
 .../instruction/phi_kernel_instruction.h      |   8 -
 .../new_executor/new_ir_interpreter.cc        |  34 +++-
 .../new_executor/standalone_executor.cc       |   4 +-
 .../fused_softmax_mask_upper_triangle_op.cu   |   1 +
 test/legacy_test/eager_op_test.py             |   2 +
 ...est_softmax_mask_fuse_upper_triangle_op.py |  47 ++---
 12 files changed, 528 insertions(+), 221 deletions(-)
 create mode 100644 paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
 create mode 100644 paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h

diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
index 8d9a93757d3099..88064749eaf027 100644
--- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
@@ -1,4 +1,5 @@
 cc_library(
   instruction_base
   SRCS instruction_base.cc phi_kernel_instruction.cc
+       legacy_kernel_instruction.cc
   DEPS phi framework_proto)
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
index 6c09d7aa2a13fd..11f9e4071fe8fc 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
@@ -16,6 +16,10 @@
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/ir/core/builtin_attribute.h"
+
 namespace paddle {
 namespace framework {
 
@@ -93,5 +97,185 @@ void InstructionBase::SetOutputs(
   output_index_ = outputs;
 }
 
+std::vector<int> InstructionBase::GetValueIds(
+    ir::Value value,
+    Scope* inner_scope,
+    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+    const std::map<std::string, int>& var_name_2_id,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
+        variable_2_var_name) {
+  std::vector<int> ids;
+  std::string var_name = value_2_var_name.at(value);
+  ids.push_back(var_name_2_id.at(var_name));
+  // NOTE(zhangbo): Value maybe a VariableRefArray
+  auto var = inner_scope->FindVar(var_name);
+  if (var->IsType<paddle::framework::VariableRefArray>()) {
+    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
+    for (size_t i = 0; i < var_array.size(); ++i) {
+      ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i])));
+    }
+  }
+  return ids;
+}
+
+void InstructionBase::InitInputsOutputsIds(
+    ::ir::Operation* op,
+    Scope* inner_scope,
+    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+    const std::map<std::string, int>& var_name_2_id,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
+        variable_2_var_name,
+    const std::string& op_name) {
+  std::unordered_map<ir::Value, std::vector<int>> inputs;
+  for (size_t i = 0; i < op->num_operands(); i++) {
+    ir::Value value = op->operand(i);
+    if (value) {
+      PADDLE_ENFORCE_NE(
+          value_2_var_name.find(value),
+          value_2_var_name.end(),
+          phi::errors::PreconditionNotMet(
+              "input should in name map, [%d] 'th input of [%s] op",
+              i,
+              op_name));
+      std::vector<int> inputs_id = GetValueIds(value,
+                                               inner_scope,
+                                               value_2_var_name,
+                                               var_name_2_id,
+                                               variable_2_var_name);
+      inputs.emplace(value, inputs_id);
+    }
+  }
+  SetInputs(inputs);
+  VLOG(8) << "finish process inputs_index";
+  std::unordered_map<ir::Value, std::vector<int>> outputs;
+  for (size_t i = 0; i < op->num_results(); i++) {
+    ir::Value value = op->result(i);
+    if (value) {
+      PADDLE_ENFORCE_NE(
+          value_2_var_name.find(value),
+          value_2_var_name.end(),
+          phi::errors::PreconditionNotMet(
+              "input should in name map, [%d] 'th input of [%s] op",
+              i,
+              op_name));
+      std::vector<int> outputs_id = GetValueIds(value,
+                                                inner_scope,
+                                                value_2_var_name,
+                                                var_name_2_id,
+                                                variable_2_var_name);
+      outputs.emplace(value, outputs_id);
+    }
+  }
+  SetOutputs(outputs);
+  VLOG(8) << "finish process outputs_index";
+}
+
+platform::DeviceContext* InstructionBase::ParseDeviceContext(
+    ir::Operation* op,
+    platform::DeviceContext* origin_dev_ctx,
+    const platform::Place& place,
+    const std::string& execution_stream,
+    const int stream_priority) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  interpreter::ContextManager& ctx_manager =
+      interpreter::ContextManager::Instance();
+
+  platform::DeviceContext* dev_ctx = nullptr;
+
+  // only gpu need update. xpu not need, because xpu memcpy op kernel is
+  // synchronous.
+  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
+    VLOG(6) << "Parse DeviceContext for " << op_name
+            << ", execution stream = " << execution_stream;
+    if (execution_stream != kDefaultStream) {
+      dev_ctx = ctx_manager
+                    .Get(std::string(kCustomStream) + "-" + execution_stream,
+                         place,
+                         stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+    if (op_name == interpreter::kMemcpyD2H) {
+      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    } else if (op_name == interpreter::kMemcpyH2D) {
+      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
+    // with use_calc_stream==false by returning a device context taken from the
+    // global NCCLCommContext instance. Because when use_calc_stream==false, in
+    // OP kernel, the NCCL communication will be launched to the stream directly
+    // getting from the global NCCLCommContext instance rather than the
+    // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
+    // c_allreduce_op.h). Now it is just a temporary solution for ONLY
+    // c_allreduce_sum which is used in ResNet50 distributed training.
+    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
+                                                .dyn_cast<::ir::BoolAttribute>()
+                                                .data() == false) {
+      int ring_id =
+          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
+      return platform::NCCLCommContext::Instance()
+          .Get(ring_id, place)
+          ->dev_context();
+    }
+#endif
+  }
+
+  if (origin_dev_ctx != nullptr) {
+    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
+  }
+  return origin_dev_ctx;
+}
+
+OpFuncType InstructionBase::AnalyseOpFuncType(ir::Operation* op,
+                                              const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    return OpFuncType::kCpuSync;
+  }
+
+  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
+                    true,
+                    phi::errors::Fatal("Unsupported current place %s", place));
+
+  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
+  // computing. They execute serially in device thread and block CUDA kernel
+  // launching in other GPU OPs. To improve performance, set them as kGpuSync
+  // and so that they would be dispatched to host thread.
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  if (op_name == kCoalesceTensor &&
+      (!platform::is_xpu_place(place) ||
+       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
+      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
+      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
+    return OpFuncType::kGpuSync;
+  }
+
+  // for memcpy explicitly called by user
+  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
+    return OpFuncType::kGpuSync;
+  }
+
+  if (op_name == "shape") {
+    return OpFuncType::kGpuSync;
+  }
+  return OpFuncType::kGpuAsync;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index 7452990a1d9076..11d1f3e2f3eae8 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -137,7 +137,34 @@ class InstructionBase {
 
   virtual const std::string& Name() const = 0;
 
+ protected:
+  OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place);
+
+  platform::DeviceContext* ParseDeviceContext(
+      ir::Operation* op,
+      platform::DeviceContext* origin_dev_ctx,
+      const platform::Place& place,
+      const std::string& execution_stream,
+      const int stream_priority);
+
+  void InitInputsOutputsIds(
+      ::ir::Operation* op,
+      Scope* inner_scope,
+      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+      const std::map<std::string, int>& var_name_2_id,
+      const std::unordered_map<const paddle::framework::Variable*, std::string>&
+          variable_2_var_name,
+      const std::string& op_name);
+
  private:
+  std::vector<int> GetValueIds(
+      ir::Value value,
+      Scope* inner_scope,
+      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+      const std::map<std::string, int>& var_name_2_id,
+      const std::unordered_map<const paddle::framework::Variable*, std::string>&
+          variable_2_var_name);
+
   size_t id_;
 
   bool is_artificial_;  // Instruction is artificial means that it is only used
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
new file mode 100644
index 00000000000000..81690f45f463f1
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -0,0 +1,184 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
+
+#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/ir/dialect/pd_dialect.h"
+#include "paddle/fluid/ir/interface/infermeta.h"
+#include "paddle/fluid/ir/interface/op_yaml_info.h"
+#include "paddle/fluid/ir/interface/op_yaml_info_parser.h"
+#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
+
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/core/type_defs.h"
+
+#include "paddle/ir/core/builtin_attribute.h"
+#include "paddle/ir/core/operation.h"
+#include "paddle/ir/core/value.h"
+
+namespace paddle {
+namespace framework {
+
+// Builds the instruction for `op`: reads its op_name / kernel_name /
+// kernel_key attributes, prepares the infer-meta, runtime and execution
+// contexts, selects the kernel, and records the device context, the
+// input/output variable ids and the no-need-buffer operands.
+LegacyKernelInstruction::LegacyKernelInstruction(
+    size_t id,
+    const platform::Place& place,
+    ir::Operation* op,
+    Scope* scope,
+    Scope* local_scope,
+    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+    const std::map<std::string, int>& var_name_2_id,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
+        variable_2_var_name)
+    : InstructionBase(id, place) {
+  const auto& op_attributes = op->attributes();  // reference: no map copy
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name);
+
+  legacy_op_name_ = op_name;
+  VLOG(6) << "construct legacy kernel instruction for: " << legacy_op_name_;
+
+  // TODO: support paddle::dialect::DistAttribute
+  //   if (op_attributes.count("dist_attr") != 0) {
+  //     if (op_attributes.count("execution_stream") != 0) {
+  //         SetExecutionStream(op_attributes.at("execution_stream")
+  //                             .dyn_cast<::ir::StrAttribute>()
+  //                             .data());
+  //     }
+  //     if (op_attributes.count("stream_priority") != 0) {
+  //         SetStreamPriority(op_attributes.at("stream_priority")
+  //                             .dyn_cast<::ir::Int32Attribute>()
+  //                             .data());
+  //     }
+  //     if (op_attributes.count("scheduling_priority") != 0) {
+  //         SetSchedulingPriority(op_attributes.at("scheduling_priority")
+  //                                 .dyn_cast<::ir::Int64Attribute>()
+  //                                 .data());
+  //     }
+  //   } else {
+  //     if (interpreter::IsCommunicationOp(op)) {
+  //       // NOTE(Ruibiao): Dispatching computation before communication
+  //       // improves multi-stream overlap when the time cost of
+  //       // communication is less than that of the calculation (e.g.,
+  //       // ResNet50_bs128_pure_fp16 N4C32 training).
+  //       op_func_node.scheduling_priority_ = 1;
+  //     }
+  //   }
+  VLOG(6) << "finish process dist attributes";
+
+  SetKernelType(AnalyseOpFuncType(op, place));
+  VLOG(6) << "finish process analyse kernel type";
+
+  infer_meta_interface_ =
+      op_info.GetInterfaceImpl<paddle::dialect::InferMetaInterface>();
+  VLOG(6) << "finish process infer_meta_interface_";
+
+  auto yaml_interface =
+      op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
+  PADDLE_ENFORCE_NOT_NULL(
+      yaml_interface,
+      phi::errors::PreconditionNotMet(
+          "can not find OpYamlInfoInterface from [%s]", legacy_op_name_));
+  paddle::dialect::OpYamlInfoParser yaml_info_parser(
+      yaml_interface->get_op_info_());
+  VLOG(6) << "finish process yaml_info_parser";
+
+  ::ir::BuildPhiContext<
+      phi::InferMetaContext,
+      phi::MetaTensor,
+      phi::MetaTensor,
+      paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
+      paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
+      false>(op,
+             value_2_var_name,
+             scope,
+             local_scope,
+             yaml_info_parser,
+             &infer_meta_context_);
+  VLOG(6) << "finish process infer meta context";
+
+  auto kernel_name =
+      op_attributes.at("kernel_name").dyn_cast<ir::StrAttribute>().AsString();
+  auto kernel_key = op_attributes.at("kernel_key")
+                        .dyn_cast<paddle::dialect::KernelAttribute>()
+                        .data();
+  auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_name, kernel_key);
+  // NOTE(review): raw new with no visible delete -- confirm who frees this.
+  phi_kernel_ = new phi::Kernel(kernel_result.kernel);
+  PADDLE_ENFORCE_EQ(
+      phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name);
+  VLOG(6) << "finish process select kernel";
+
+  operator_base_ =
+      ir::BuildOperatorBase(op, value_2_var_name, yaml_info_parser);
+  paddle::framework::VariableValueMap in_map;
+  paddle::framework::VariableValueMap out_map;
+  auto dev_ctx = phi::DeviceContextPool::Instance().Get(
+      phi::TransToPhiPlace(kernel_key.backend()));
+
+  runtime_context_ = std::make_shared<paddle::framework::RuntimeContext>(
+      paddle::framework::RuntimeContext(in_map, out_map));
+  ir::BuildRuntimeContext(op,
+                          value_2_var_name,
+                          scope,
+                          local_scope,
+                          yaml_info_parser,
+                          runtime_context_.get());
+  kernel_context_ = std::make_shared<paddle::framework::ExecutionContext>(
+      paddle::framework::ExecutionContext(
+          *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())));
+
+  VLOG(6) << "finish process kernel context";
+  // dev_ctx already is the pool context for the kernel's backend place.
+  SetDeviceContext(ParseDeviceContext(
+      op, dev_ctx, place, GetExecutionStream(), GetStreamPriority()));
+  VLOG(6) << "finish process device context";
+
+  Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
+  InitInputsOutputsIds(op,
+                       inner_scope,
+                       value_2_var_name,
+                       var_name_2_id,
+                       variable_2_var_name,
+                       legacy_op_name_);
+  VLOG(6) << "finish process inputs outputs index";
+
+  auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
+  std::unordered_set<::ir::Value> no_need_buffer_values;
+  for (const auto operand_index : no_need_buffer_ids) {
+    no_need_buffer_values.insert(op->operand(operand_index));
+  }
+  SetNoNeedBuffer(no_need_buffer_values);
+  VLOG(6) << "finish process no need buffer";
+}
+
+void LegacyKernelInstruction::Run() {  // infer-meta first, then the kernel
+  infer_meta_interface_->infer_meta_(&infer_meta_context_);
+  VLOG(6) << "Run op " << legacy_op_name_ << " infer meta.";
+  (*phi_kernel_)(kernel_context_.get());
+  VLOG(6) << "Run op " << legacy_op_name_ << " kernel.";
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
new file mode 100644
index 00000000000000..a8a150fbb6c776
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
+
+namespace ir {
+class Operation;
+}  // namespace ir
+
+namespace paddle {
+namespace framework {
+class Scope;
+class Value;
+// Instruction that invokes its kernel through a framework::ExecutionContext.
+class LegacyKernelInstruction : public InstructionBase {
+ public:
+  LegacyKernelInstruction(
+      size_t id,
+      const platform::Place& place,
+      ::ir::Operation* op,
+      Scope* scope,
+      Scope* local_scope,
+      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+      const std::map<std::string, int>& var_name_2_id,
+      const std::unordered_map<const paddle::framework::Variable*, std::string>&
+          variable_2_var_name);
+  // Accessors for state set up by the constructor.
+  phi::Kernel* PhiKernel() const { return phi_kernel_; }
+
+  const phi::InferMetaContext& InferMetaContext() const {
+    return infer_meta_context_;
+  }
+
+  paddle::dialect::InferMetaInterface::Concept* InferMetaInterface() const {
+    return infer_meta_interface_;
+  }
+  // Runs the op's infer-meta function, then invokes the kernel.
+  void Run() override;
+  // Returns the op name this instruction was built for.
+  const std::string& Name() const override { return legacy_op_name_; }
+
+ private:
+  std::string legacy_op_name_;  // value of the op's "op_name" attribute
+
+  paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{
+      nullptr};  // not owned
+
+  phi::InferMetaContext infer_meta_context_;  // argument of infer_meta_
+  // Operator, runtime and execution contexts used to call the kernel.
+  std::shared_ptr<framework::RuntimeContext> runtime_context_;
+  std::shared_ptr<paddle::framework::OperatorBase> operator_base_;
+  std::shared_ptr<paddle::framework::ExecutionContext> kernel_context_;
+  // NOTE(review): set from `new phi::Kernel` in the ctor; no delete is visible.
+  phi::Kernel* phi_kernel_{nullptr};  // not owned (confirm, see NOTE)
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
index 39e791aca3f8ac..4e73418d5abd6b 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -35,112 +35,6 @@
 namespace paddle {
 namespace framework {
 
-platform::DeviceContext* ParseDeviceContext(
-    ir::Operation* op,
-    platform::DeviceContext* origin_dev_ctx,
-    const platform::Place& place,
-    const std::string& execution_stream,
-    const int stream_priority) {
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  interpreter::ContextManager& ctx_manager =
-      interpreter::ContextManager::Instance();
-
-  platform::DeviceContext* dev_ctx = nullptr;
-
-  // only gpu need update. xpu not need, because xpu memcpy op kernel is
-  // synchronous.
-  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
-    VLOG(6) << "Parse DeviceContext for " << op_name
-            << ", execution stream = " << execution_stream;
-    if (execution_stream != kDefaultStream) {
-      dev_ctx = ctx_manager
-                    .Get(std::string(kCustomStream) + "-" + execution_stream,
-                         place,
-                         stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-    if (op_name == interpreter::kMemcpyD2H) {
-      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    } else if (op_name == interpreter::kMemcpyH2D) {
-      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
-    // with use_cal_stream==false by returning a device context getting from the
-    // global NCCLCommContext instance. Because when use_calc_stream==false, in
-    // OP kernel, the NCCL communication will be launched to the stream directly
-    // getting from the global NCCLCommContext instance rather than the
-    // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
-    // c_allreduce_op.h). Now it is just a temporary solution for ONLY
-    // c_allreduce_sum which is used in ResNet50 distributed training.
-    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
-                                                .dyn_cast<::ir::BoolAttribute>()
-                                                .data() == false) {
-      int ring_id =
-          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
-      return platform::NCCLCommContext::Instance()
-          .Get(ring_id, place)
-          ->dev_context();
-    }
-#endif
-  }
-
-  if (origin_dev_ctx != nullptr) {
-    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
-  }
-  return origin_dev_ctx;
-}
-
-OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) {
-  if (platform::is_cpu_place(place)) {
-    return OpFuncType::kCpuSync;
-  }
-
-  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
-                    true,
-                    phi::errors::Fatal("Unsupported current place %s", place));
-
-  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
-  // computing. They execute serially in device thread and block CUDA kernel
-  // launching in other GPU OPs. To improve performance, set them as kGpuSync
-  // and so that they would be dispatched to host thread.
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  if (op_name == kCoalesceTensor &&
-      (!platform::is_xpu_place(place) ||
-       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
-      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
-      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
-    return OpFuncType::kGpuSync;
-  }
-
-  // for memcpy explicitly called by user
-  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
-    return OpFuncType::kGpuSync;
-  }
-
-  if (op_name == "shape") {
-    return OpFuncType::kGpuSync;
-  }
-  return OpFuncType::kGpuAsync;
-}
-
 PhiKernelInstruction::PhiKernelInstruction(
     size_t id,
     const platform::Place& place,
@@ -256,8 +150,12 @@ PhiKernelInstruction::PhiKernelInstruction(
   VLOG(6) << "finish process device context";
 
   Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
-  InitInputsOutputsIds(
-      op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
+  InitInputsOutputsIds(op,
+                       inner_scope,
+                       value_2_var_name,
+                       var_name_2_id,
+                       variable_2_var_name,
+                       phi_op_name_);
   VLOG(6) << "finish process inputs outputs index";
 
   auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
@@ -269,78 +167,6 @@ PhiKernelInstruction::PhiKernelInstruction(
   VLOG(6) << "finish process no need buffer";
 }
 
-std::vector<int> GetValueIds(
-    ir::Value value,
-    Scope* inner_scope,
-    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
-    const std::map<std::string, int>& var_name_2_id,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name) {
-  std::vector<int> ids;
-  std::string var_name = value_2_var_name.at(value);
-  ids.push_back(var_name_2_id.at(var_name));
-  // NOTE(zhangbo): Value maybe a VariableRefArray
-  auto var = inner_scope->FindVar(var_name);
-  if (var->IsType<paddle::framework::VariableRefArray>()) {
-    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
-    for (size_t i = 0; i < var_array.size(); ++i) {
-      ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i])));
-    }
-  }
-  return ids;
-}
-
-void PhiKernelInstruction::InitInputsOutputsIds(
-    ::ir::Operation* op,
-    Scope* inner_scope,
-    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
-    const std::map<std::string, int>& var_name_2_id,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name) {
-  std::unordered_map<ir::Value, std::vector<int>> inputs;
-  for (size_t i = 0; i < op->num_operands(); i++) {
-    ir::Value value = op->operand(i);
-    if (value) {
-      PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
-          phi::errors::PreconditionNotMet(
-              "input should in name map, [%d] 'th input of [%s] op",
-              i,
-              phi_op_name_));
-      std::vector<int> inputs_id = GetValueIds(value,
-                                               inner_scope,
-                                               value_2_var_name,
-                                               var_name_2_id,
-                                               variable_2_var_name);
-      inputs.emplace(value, inputs_id);
-    }
-  }
-  SetInputs(inputs);
-  VLOG(8) << "finish process inputs_index";
-  std::unordered_map<ir::Value, std::vector<int>> outputs;
-  for (size_t i = 0; i < op->num_results(); i++) {
-    ir::Value value = op->result(i);
-    if (value) {
-      PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
-          phi::errors::PreconditionNotMet(
-              "input should in name map, [%d] 'th input of [%s] op",
-              i,
-              phi_op_name_));
-      std::vector<int> outputs_id = GetValueIds(value,
-                                                inner_scope,
-                                                value_2_var_name,
-                                                var_name_2_id,
-                                                variable_2_var_name);
-      outputs.emplace(value, outputs_id);
-    }
-  }
-  SetOutputs(outputs);
-  VLOG(8) << "finish process outputs_index";
-}
-
 void PhiKernelInstruction::Run() {
   infer_meta_interface_->infer_meta_(&(infer_meta_context_));
   VLOG(6) << "Run op " << phi_op_name_ << " infer meta.";
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
index b30fa8bff751b5..fcd35a3b762904 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
@@ -55,14 +55,6 @@ class PhiKernelInstruction : public InstructionBase {
   const std::string& Name() const override { return phi_op_name_; }
 
  private:
-  void InitInputsOutputsIds(
-      ::ir::Operation* op,
-      Scope* inner_scope,
-      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
-      const std::map<std::string, int>& var_name_2_id,
-      const std::unordered_map<const paddle::framework::Variable*, std::string>&
-          variable_2_var_name);
-
   std::string phi_op_name_;
 
   paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index b9060cb16e0d88..78208f30b8e239 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -36,6 +36,7 @@
 #include "paddle/fluid/platform/flags.h"
 #include "paddle/phi/backends/device_manager.h"
 
+#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
 #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"
 #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
 
@@ -1645,15 +1646,30 @@ void NewIRInterpreter::BuildInstruction() {
         VLOG(6) << "skip process " << op_name;
         continue;
       }
-      vec_instruction_base_.emplace_back(
-          std::make_unique<PhiKernelInstruction>(op_idx++,
-                                                 place_,
-                                                 (*it),
-                                                 scope_,
-                                                 local_scope_,
-                                                 value_2_var_name_,
-                                                 var_name_2_id_,
-                                                 variable_2_var_name_));
+
+      if (op_name == "pd.fused_softmax_mask_upper_triangle" ||
+          op_name == "pd.fused_softmax_mask_upper_triangle_grad") {
+        std::cerr << "emplace lagcy kernel " << op_name << std::endl;
+        vec_instruction_base_.emplace_back(
+            std::make_unique<LegacyKernelInstruction>(op_idx++,
+                                                      place_,
+                                                      (*it),
+                                                      scope_,
+                                                      local_scope_,
+                                                      value_2_var_name_,
+                                                      var_name_2_id_,
+                                                      variable_2_var_name_));
+      } else {
+        vec_instruction_base_.emplace_back(
+            std::make_unique<PhiKernelInstruction>(op_idx++,
+                                                   place_,
+                                                   (*it),
+                                                   scope_,
+                                                   local_scope_,
+                                                   value_2_var_name_,
+                                                   var_name_2_id_,
+                                                   variable_2_var_name_));
+      }
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Now only support pd_kernel dialect."));
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index c80f9b36ff98ba..1f15aa6e12f648 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -65,8 +65,10 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     if (FLAGS_enable_new_ir_in_executor) {
       VLOG(6) << "begin to translate" << std::endl;
       auto base_program = paddle::TranslateLegacyProgramToProgram(*program);
+      base_program->Print(std::cout);
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
+      kernel_program->Print(std::cout);
       interpretercores_.emplace_back(std::make_shared<InterpreterCore>(
           place_, std::move(kernel_program), scope_, execution_config));
     } else {
@@ -126,7 +128,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
       interpretercores_[job_idx]->ShareBuildResultsFrom(
           interpretercores_[type_to_first_id[job_type]]);
     }
-    interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false);
+    interpretercores_[job_idx]->BetaRun(feed_names, /*need_fetch = */ false);
   }
 
   // return Fetch Tensors
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index 779ee234071af0..05a82db2826385 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -475,6 +475,7 @@ template <typename T, typename DeviceContext>
 class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
+    std::cerr << "comute grad " << std::endl;
     auto* grad_x =
         context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto* grad_y =
diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index f34ff8e137da6f..175cc7346f39f9 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -2682,6 +2682,7 @@ def check_grad_with_place(
             max_relative_error = (
                 0.001 if max_relative_error < 0.001 else max_relative_error
             )
+        print("grad", analytic_grads)
         self._assert_is_close(
             numeric_grads,
             analytic_grads,
@@ -3039,6 +3040,7 @@ def _get_gradient(
                 compiled_prog = fluid.CompiledProgram(prog, build_strategy)
                 prog = compiled_prog
             executor = fluid.Executor(place)
+            print(prog)
             res = list(
                 map(
                     np.array,
diff --git a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
index 82dbaaf0e78c46..217a96281e02c2 100644
--- a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
+++ b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
@@ -18,7 +18,6 @@
 from eager_op_test import OpTest
 
 import paddle
-from paddle import fluid, incubate
 from paddle.fluid import core
 
 paddle.enable_static()
@@ -48,37 +47,38 @@ def setUp(self):
         rst = _get_softmax_upper(x)
         self.outputs = {'Out': rst}
 
-    def test_check_output(self):
-        self.check_output_with_place(core.CUDAPlace(0))
+    # def test_check_output(self):
+    #     self.check_output_with_place(core.CUDAPlace(0))
 
     def test_check_grad(self):
         self.check_grad_with_place(core.CUDAPlace(0), ["X"], "Out")
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
-)
-class TestSoftmaxMaskFuseOp1(OpTest):
-    def setUp(self):
-        self.op_type = "fused_softmax_mask_upper_triangle"
-        x = np.random.random((1, 4, 32, 32))
-        self.inputs = {'X': x}
-        rst = _get_softmax_upper(x)
-        self.outputs = {'Out': rst}
+# @unittest.skipIf(
+#     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+# )
+# class TestSoftmaxMaskFuseOp1(OpTest):
+#     def setUp(self):
+#         self.op_type = "fused_softmax_mask_upper_triangle"
+#         x = np.random.random((1, 4, 32, 32))
+#         self.inputs = {'X': x}
+#         rst = _get_softmax_upper(x)
+#         self.outputs = {'Out': rst}
 
-    def test_check_output(self):
-        try:
-            self.check_output_with_place(core.CPUPlace())
-        except (NotImplementedError, RuntimeError):
-            pass
+#     def test_check_output(self):
+#         try:
+#             self.check_output_with_place(core.CPUPlace())
+#         except (NotImplementedError, RuntimeError):
+#             pass
 
-    def test_check_grad(self):
-        try:
-            self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
-        except (NotImplementedError, RuntimeError):
-            pass
+#     def test_check_grad(self):
+#         try:
+#             self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
+#         except (NotImplementedError, RuntimeError):
+#             pass
 
 
+'''
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
@@ -117,6 +117,7 @@ def test_dygraph(self):
                 rst = incubate.softmax_mask_fuse_upper_triangle(input_x)
                 np.testing.assert_allclose(rst, rst_np, rtol=1e-05)
 
+'''
 
 if __name__ == '__main__':
     unittest.main()

From 755eace533310f942a808c5fedf5fad7c5a4c41b Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 1 Aug 2023 09:27:55 +0000
Subject: [PATCH 09/22] new ir support legacy kernel instruction

---
 .../new_executor/instruction/instruction_base.h          | 9 ++++++---
 .../fluid/framework/new_executor/new_ir_interpreter.cc   | 5 +++++
 .../operators/fused_softmax_mask_upper_triangle_op.cu    | 2 ++
 paddle/phi/kernels/impl/fetch_impl.h                     | 2 ++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index 11d1f3e2f3eae8..0af596ea50a694 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -21,6 +21,8 @@
 
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/ir/core/operation.h"
+#include "paddle/ir/core/value.h"
 
 namespace ir {
 class Value;
@@ -138,10 +140,11 @@ class InstructionBase {
   virtual const std::string& Name() const = 0;
 
  protected:
-  OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place);
+  OpFuncType AnalyseOpFuncType(::ir::Operation* op,
+                               const platform::Place& place);
 
   platform::DeviceContext* ParseDeviceContext(
-      ir::Operation* op,
+      ::ir::Operation* op,
       platform::DeviceContext* origin_dev_ctx,
       const platform::Place& place,
       const std::string& execution_stream,
@@ -158,7 +161,7 @@ class InstructionBase {
 
  private:
   std::vector<int> GetValueIds(
-      ir::Value value,
+      ::ir::Value value,
       Scope* inner_scope,
       const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
       const std::map<std::string, int>& var_name_2_id,
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 7e914cd4332212..67c1c4da0b848d 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1875,6 +1875,10 @@ void NewIRInterpreter::CalculateLastLiveOps() {
         ins.begin(), ins.end()};
     ins_and_outs.insert(outs.begin(), outs.end());
 
+    if (instr->Name() != "pd.fetch") {
+      ins_and_outs.insert(outs.begin(), outs.end());
+    }
+
     for (auto& item : ins_and_outs) {
       for (auto var_id : item.second) {
         // skip no_need_buffer input vars
@@ -2117,6 +2121,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) {
 
     VLOG(5) << "begin to run op " << instr_node->Name();
     if (!instr_node->IsArtificial()) {
+      std::cerr << "op name " << instr_node->Name() << std::endl;
       instr_node->Run();
       VLOG(4) << "done instruction node run";
       CheckGC(instr_node);
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index 05a82db2826385..c098a11537a07a 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -476,6 +476,7 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     std::cerr << "comute grad " << std::endl;
+
     auto* grad_x =
         context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto* grad_y =
@@ -486,6 +487,7 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
     auto* grad_y_data = grad_y->data<T>();
     auto* softmax_rst_data = softmax_rst->data<T>();
 
+    std::cerr << "grad x" << grad_x->dims() << std::endl;
     auto y_dim = grad_y->dims();
     auto batches = y_dim[0];
     auto attn_heads = y_dim[1];
diff --git a/paddle/phi/kernels/impl/fetch_impl.h b/paddle/phi/kernels/impl/fetch_impl.h
index d90a813e4a16b3..3769f58c424c28 100644
--- a/paddle/phi/kernels/impl/fetch_impl.h
+++ b/paddle/phi/kernels/impl/fetch_impl.h
@@ -21,7 +21,9 @@ namespace phi {
 
 template <typename T, typename Context>
 void FetchKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
+  std::cerr << "fetch out " << out->dims() << std::endl;
   phi::Copy(ctx, x, phi::CPUPlace(), true, out);
+  std::cerr << "fetch out " << out->data<T>()[0] << std::endl;
 }
 
 }  // namespace phi

From adf66ea51ce4a6e888bf6bd08685b7692ddf7ffb Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 1 Aug 2023 10:23:35 +0000
Subject: [PATCH 10/22] add scope prefix

---
 .../new_executor/interpreter_base_impl.h         |  3 +++
 .../framework/new_executor/interpretercore.cc    |  9 +++++++++
 .../framework/new_executor/interpretercore.h     |  3 +++
 .../framework/new_executor/new_ir_interpreter.cc | 16 +++++++++++++---
 .../framework/new_executor/new_ir_interpreter.h  |  4 ++++
 .../new_executor/program_interpreter.cc          |  9 +++++++++
 .../framework/new_executor/program_interpreter.h |  3 +++
 .../new_executor/standalone_executor.cc          |  5 ++---
 .../fluid/ir/transforms/constant_folding_pass.cc |  4 +++-
 9 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
index 586aef975a643c..ab75b56b96def8 100644
--- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h
+++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
@@ -108,6 +108,9 @@ class InterpreterBaseImpl {
   virtual const interpreter::StreamAnalyzer& GetStreamAnalyzer() const = 0;
 
   virtual bool IsSharedResultsBuild() const = 0;
+
+  virtual void SetScopePrefix(const std::string& prefix) = 0;
+  virtual const std::string& GetScopePrefix() const = 0;
 };
 
 inline void SetDeviceId(const platform::Place& place) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 04e1457f33dcbf..edf2ca6666d46c 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -125,5 +125,14 @@ const platform::Place& InterpreterCore::GetPlace() const {
 void InterpreterCore::SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
   impl_->SetOutputHooks(hookfuncs);
 }
+
+void InterpreterCore::SetScopePrefix(const std::string& prefix) {
+  impl_->SetScopePrefix(prefix);
+}
+
+const std::string& InterpreterCore::GetScopePrefix() const {
+  return impl_->GetScopePrefix();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 66f998bb557f6e..db26215b2ffc06 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -77,6 +77,9 @@ class InterpreterCore {
 
   void SetOutputHooks(const std::vector<HookFunc>& hookfuncs);
 
+  void SetScopePrefix(const std::string& prefix);
+  const std::string& GetScopePrefix() const;
+
  private:
   DISABLE_COPY_AND_ASSIGN(InterpreterCore);
 
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index ec5b7730b51754..c01392da706985 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -106,6 +106,10 @@ NewIRInterpreter::NewIRInterpreter(
   };
 
   PrepareForCUDAGraphCapture();
+
+  std::stringstream ss;
+  ss << this;
+  scope_prefix_ = ss.str();
 }
 
 NewIRInterpreter::~NewIRInterpreter() {
@@ -200,11 +204,10 @@ FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,
 
   if (!is_build_) {
     LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
-    std::stringstream ss;
-    ss << this;
+    std::cerr << "scope prefix " << scope_prefix_ << std::endl;
     ::ir::BuildScope(*ir_program_->block(),
                      InnerScope(),
-                     ss.str(),
+                     scope_prefix_,
                      &value_2_var_name_,
                      &variable_2_var_name_,
                      &var_name_2_id_,
@@ -2161,5 +2164,12 @@ void NewIRInterpreter::PreAnalysis() {
   VLOG(4) << "Done AnalyseExecuteOrderForTrace";
 }
 
+void NewIRInterpreter::SetScopePrefix(const std::string& prefix) {
+  scope_prefix_ = prefix;
+}
+const std::string& NewIRInterpreter::GetScopePrefix() const {
+  return scope_prefix_;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index ef00957e7d8ca1..e62264e1ebd7fc 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -90,6 +90,9 @@ class NewIRInterpreter : public InterpreterBaseImpl {
 
   int GetIdByName(const std::string& name) const;
 
+  void SetScopePrefix(const std::string& prefix) override;
+  const std::string& GetScopePrefix() const override;
+
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
@@ -254,6 +257,7 @@ class NewIRInterpreter : public InterpreterBaseImpl {
   interpreter::NewIrStreamAnalyzer ir_stream_analyzer_;
 
   std::vector<std::string> fetch_var_names_;
+  std::string scope_prefix_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 08ddd3444fd1c9..38c8ead21d73d5 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -1507,5 +1507,14 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() {
   trace_execute_order_ = trace_order;
 }
 
+void ProgramInterpreter::SetScopePrefix(const std::string& prefix) {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Program Interpreter not support SetScopePrefix"));
+}
+const std::string& ProgramInterpreter::GetScopePrefix() const {
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Program Interpreter not support GetScopePrefix"));
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index a942425609c189..890ed9fde6412f 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -85,6 +85,9 @@ class ProgramInterpreter : public InterpreterBaseImpl {
     hookfuncs_ = hookfuncs;
   }
 
+  void SetScopePrefix(const std::string& prefix) override;
+  const std::string& GetScopePrefix() const override;
+
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index a89db7b22e18de..3ef42ff0727021 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -101,11 +101,10 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
       // NOTE(phlrain): why we add prefix here. In earger op test,
       // different test case use same scope (not same standalone executor),
       // we must add prefix to prevent fetch same variable in different case
-      std::stringstream pre_ss;
-      pre_ss << interpretercores_.back()->Impl();
+      auto prefix = interpretercores_.back()->Impl()->GetScopePrefix();
 
       for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-        fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i];
+        fetch_var_names_[i] = prefix + "_" + fetch_var_names_[i];
       }
 
     } else {
diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc
index 5f107af71e519a..edba2300ca111d 100644
--- a/paddle/fluid/ir/transforms/constant_folding_pass.cc
+++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc
@@ -82,7 +82,8 @@ class ConstantFoldingPattern : public ir::RewritePattern {
           fetch_var_names.resize(index + 1);
         }
 
-        fetch_var_names[index] = (*it)
+        fetch_var_names[index] = "ConstantFoldPrefix_" +
+                                 (*it)
                                      ->attributes()
                                      .at("name")
                                      .dyn_cast<ir::StrAttribute>()
@@ -100,6 +101,7 @@ class ConstantFoldingPattern : public ir::RewritePattern {
         paddle::dialect::PdOpLowerToKernelPass(temp_program.get()),
         &scope_,
         exe_config);
+    core.SetScopePrefix("ConstantFoldPrefix");
     paddle::framework::FetchList fetch_list = core.Run({});
 
     // TODO(liuyuanle): Support multiple output.

From e8f64bcd5356c40993914cc84da49bb906cfbcc4 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Wed, 2 Aug 2023 02:40:23 +0000
Subject: [PATCH 11/22] stop skipping pd.fetch in BuildInstruction; add debug output

---
 .../framework/new_executor/instruction/instruction_base.cc    | 1 +
 paddle/fluid/framework/new_executor/new_ir_interpreter.cc     | 4 ++--
 paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc         | 3 +++
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
index 11f9e4071fe8fc..84da44e2b83f5a 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
@@ -151,6 +151,7 @@ void InstructionBase::InitInputsOutputsIds(
   for (size_t i = 0; i < op->num_results(); i++) {
     ir::Value value = op->result(i);
     if (value) {
+      std::cerr << "value  " << value.impl() << std::endl;
       PADDLE_ENFORCE_NE(
           value_2_var_name.find(value),
           value_2_var_name.end(),
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 67c1c4da0b848d..b670fba7fa09f3 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1601,8 +1601,7 @@ void NewIRInterpreter::BuildInstruction() {
                          .dyn_cast<::ir::StrAttribute>()
                          .AsString();
       if (op_name == "builtin.combine" || op_name == "builtin.slice" ||
-          op_name == "pd.feed" || op_name == "pd.fetch" ||
-          op_name == "builtin.set_parameter" ||
+          op_name == "pd.feed" || op_name == "builtin.set_parameter" ||
           op_name == "builtin.get_parameter") {
         VLOG(6) << "skip process " << op_name;
         continue;
@@ -2086,6 +2085,7 @@ void NewIRInterpreter::TraceInstructionList(
 
   for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) {
     auto instr_id = trace_execute_order_[idx];
+    std::cerr << "instr id  " << instr_id << std::endl;
     InstructionBase* instr_node = vec_instruction_base_.at(instr_id).get();
 
     VLOG(6) << "Run InstructionBase " << instr_id;
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index 5dd27a04ad7cf0..0ba72c7ddcd701 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -80,6 +80,9 @@ paddle::framework::Variable* CreateVar(
     VLOG(6) << "Create var: " << name << " in scope " << inner_scope;
     var = inner_scope->Var(name);
   }
+
+  std::cerr << "creater variable for value " << std::endl;
+  std::cerr << value.impl() << "\t" << name << std::endl;
   value_2_var_name->emplace(value, name);
   variable_2_var_name->emplace(var, name);
   auto id = var_name_2_id->size();

From e97beede65081988ddbc127b1378192f68e95ef1 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Wed, 2 Aug 2023 07:26:25 +0000
Subject: [PATCH 12/22] remove debug output; re-enable softmax_mask_fuse_upper_triangle tests

---
 .../instruction/instruction_base.cc           |  1 -
 .../instruction/instruction_base.h            |  1 +
 .../new_executor/new_ir_interpreter.cc        |  3 --
 paddle/phi/kernels/impl/fetch_impl.h          |  2 -
 test/legacy_test/eager_op_test.py             |  1 -
 ...est_softmax_mask_fuse_upper_triangle_op.py | 47 +++++++++----------
 6 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
index 612d5dfb85a72a..5de5d25e6efd29 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
@@ -150,7 +150,6 @@ void InstructionBase::InitInputsOutputsIds(
   for (size_t i = 0; i < op->num_results(); i++) {
     ir::Value value = op->result(i);
     if (value) {
-      std::cerr << "check value " << value.impl() << std::endl;
       PADDLE_ENFORCE_NE(
           value_2_var_name.find(value),
           value_2_var_name.end(),
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index eaa76e2a338db6..870b842241253f 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -203,6 +203,7 @@ class InstructionBase {
 
   std::unordered_set<::ir::Value> no_need_buffer_values_;
 
+ protected:
   std::string phi_op_name_;
 };
 
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 02cd5ea40a20da..3916ffec2fa33d 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1891,7 +1891,6 @@ void NewIRInterpreter::CalculateLastLiveOps() {
         instr->Outputs();
     std::unordered_multimap<::ir::Value, std::vector<int>> ins_and_outs{
         ins.begin(), ins.end()};
-    ins_and_outs.insert(outs.begin(), outs.end());
 
     if (instr->Name() != "pd.fetch") {
       ins_and_outs.insert(outs.begin(), outs.end());
@@ -2104,7 +2103,6 @@ void NewIRInterpreter::TraceInstructionList(
 
   for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) {
     auto instr_id = trace_execute_order_[idx];
-    std::cerr << "instr id  " << instr_id << std::endl;
     InstructionBase* instr_node = vec_instruction_base_.at(instr_id).get();
 
     VLOG(6) << "Run InstructionBase " << instr_id;
@@ -2140,7 +2138,6 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) {
 
     VLOG(5) << "begin to run op " << instr_node->Name();
     if (!instr_node->IsArtificial()) {
-      std::cerr << "op name " << instr_node->Name() << std::endl;
       instr_node->Run();
       VLOG(4) << "done instruction node run";
       CheckGC(instr_node);
diff --git a/paddle/phi/kernels/impl/fetch_impl.h b/paddle/phi/kernels/impl/fetch_impl.h
index 3769f58c424c28..d90a813e4a16b3 100644
--- a/paddle/phi/kernels/impl/fetch_impl.h
+++ b/paddle/phi/kernels/impl/fetch_impl.h
@@ -21,9 +21,7 @@ namespace phi {
 
 template <typename T, typename Context>
 void FetchKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
-  std::cerr << "fetch out " << out->dims() << std::endl;
   phi::Copy(ctx, x, phi::CPUPlace(), true, out);
-  std::cerr << "fetch out " << out->data<T>()[0] << std::endl;
 }
 
 }  // namespace phi
diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index e4830facaa3b1a..485416a3f04af3 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -2681,7 +2681,6 @@ def check_grad_with_place(
             max_relative_error = (
                 0.001 if max_relative_error < 0.001 else max_relative_error
             )
-        print("grad", analytic_grads)
         self._assert_is_close(
             numeric_grads,
             analytic_grads,
diff --git a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
index 217a96281e02c2..82dbaaf0e78c46 100644
--- a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
+++ b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py
@@ -18,6 +18,7 @@
 from eager_op_test import OpTest
 
 import paddle
+from paddle import fluid, incubate
 from paddle.fluid import core
 
 paddle.enable_static()
@@ -47,38 +48,37 @@ def setUp(self):
         rst = _get_softmax_upper(x)
         self.outputs = {'Out': rst}
 
-    # def test_check_output(self):
-    #     self.check_output_with_place(core.CUDAPlace(0))
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0))
 
     def test_check_grad(self):
         self.check_grad_with_place(core.CUDAPlace(0), ["X"], "Out")
 
 
-# @unittest.skipIf(
-#     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
-# )
-# class TestSoftmaxMaskFuseOp1(OpTest):
-#     def setUp(self):
-#         self.op_type = "fused_softmax_mask_upper_triangle"
-#         x = np.random.random((1, 4, 32, 32))
-#         self.inputs = {'X': x}
-#         rst = _get_softmax_upper(x)
-#         self.outputs = {'Out': rst}
+@unittest.skipIf(
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+)
+class TestSoftmaxMaskFuseOp1(OpTest):
+    def setUp(self):
+        self.op_type = "fused_softmax_mask_upper_triangle"
+        x = np.random.random((1, 4, 32, 32))
+        self.inputs = {'X': x}
+        rst = _get_softmax_upper(x)
+        self.outputs = {'Out': rst}
 
-#     def test_check_output(self):
-#         try:
-#             self.check_output_with_place(core.CPUPlace())
-#         except (NotImplementedError, RuntimeError):
-#             pass
+    def test_check_output(self):
+        try:
+            self.check_output_with_place(core.CPUPlace())
+        except (NotImplementedError, RuntimeError):
+            pass
 
-#     def test_check_grad(self):
-#         try:
-#             self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
-#         except (NotImplementedError, RuntimeError):
-#             pass
+    def test_check_grad(self):
+        try:
+            self.check_grad_with_place(core.CPUPlace(), ["X"], "Out")
+        except (NotImplementedError, RuntimeError):
+            pass
 
 
-'''
 @unittest.skipIf(
     not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
@@ -117,7 +117,6 @@ def test_dygraph(self):
                 rst = incubate.softmax_mask_fuse_upper_triangle(input_x)
                 np.testing.assert_allclose(rst, rst_np, rtol=1e-05)
 
-'''
 
 if __name__ == '__main__':
     unittest.main()

From 3eeec33bdb778fbbe0fa931c77d67fe6db957397 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 5 Aug 2023 11:23:30 +0000
Subject: [PATCH 13/22] update

---
 .../instruction/instruction_base.cc           | 130 +-----------------
 .../instruction/instruction_base.h            |   8 ++
 .../instruction/instruction_util.h            |  61 ++------
 .../instruction/legacy_kernel_instruction.cc  |   5 +-
 .../instruction/phi_kernel_instruction.cc     |   1 +
 .../new_executor/interpreter_base_impl.h      |   3 -
 .../framework/new_executor/interpretercore.cc |   8 --
 .../framework/new_executor/interpretercore.h  |   3 -
 .../new_executor/new_ir_interpreter.cc        |   8 +-
 .../new_executor/new_ir_interpreter.h         |   3 -
 .../new_executor/program_interpreter.cc       |   9 --
 .../new_executor/program_interpreter.h        |   3 -
 .../ir/transforms/constant_folding_pass.cc    |   1 -
 13 files changed, 23 insertions(+), 220 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
index 5de5d25e6efd29..5fd12551ff176c 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
@@ -97,27 +99,6 @@ void InstructionBase::SetOutputs(
   output_index_ = outputs;
 }
 
-std::vector<int> InstructionBase::GetValueIds(
-    ir::Value value,
-    Scope* inner_scope,
-    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
-    const std::map<std::string, int>& var_name_2_id,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name) {
-  std::vector<int> ids;
-  std::string var_name = value_2_var_name.at(value);
-  ids.push_back(var_name_2_id.at(var_name));
-  // NOTE(zhangbo): Value maybe a VariableRefArray
-  auto var = inner_scope->FindVar(var_name);
-  if (var->IsType<paddle::framework::VariableRefArray>()) {
-    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
-    for (size_t i = 0; i < var_array.size(); ++i) {
-      ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i])));
-    }
-  }
-  return ids;
-}
-
 void InstructionBase::InitInputsOutputsIds(
     ::ir::Operation* op,
     Scope* inner_scope,
@@ -169,112 +150,5 @@ void InstructionBase::InitInputsOutputsIds(
   VLOG(8) << "finish process outputs_index";
 }
 
-platform::DeviceContext* InstructionBase::ParseDeviceContext(
-    ir::Operation* op,
-    platform::DeviceContext* origin_dev_ctx,
-    const platform::Place& place,
-    const std::string& execution_stream,
-    const int stream_priority) {
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  interpreter::ContextManager& ctx_manager =
-      interpreter::ContextManager::Instance();
-
-  platform::DeviceContext* dev_ctx = nullptr;
-
-  // only gpu need update. xpu not need, because xpu memcpy op kernel is
-  // synchronous.
-  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
-    VLOG(6) << "Parse DeviceContext for " << op_name
-            << ", execution stream = " << execution_stream;
-    if (execution_stream != kDefaultStream) {
-      dev_ctx = ctx_manager
-                    .Get(std::string(kCustomStream) + "-" + execution_stream,
-                         place,
-                         stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-    if (op_name == interpreter::kMemcpyD2H) {
-      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    } else if (op_name == interpreter::kMemcpyH2D) {
-      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
-    // with use_cal_stream==false by returning a device context getting from the
-    // global NCCLCommContext instance. Because when use_calc_stream==false, in
-    // OP kernel, the NCCL communication will be launched to the stream directly
-    // getting from the global NCCLCommContext instance rather than the
-    // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
-    // c_allreduce_op.h). Now it is just a temporary solution for ONLY
-    // c_allreduce_sum which is used in ResNet50 distributed training.
-    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
-                                                .dyn_cast<::ir::BoolAttribute>()
-                                                .data() == false) {
-      int ring_id =
-          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
-      return platform::NCCLCommContext::Instance()
-          .Get(ring_id, place)
-          ->dev_context();
-    }
-#endif
-  }
-
-  if (origin_dev_ctx != nullptr) {
-    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
-  }
-  return origin_dev_ctx;
-}
-
-OpFuncType InstructionBase::AnalyseOpFuncType(ir::Operation* op,
-                                              const platform::Place& place) {
-  if (platform::is_cpu_place(place)) {
-    return OpFuncType::kCpuSync;
-  }
-
-  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
-                    true,
-                    phi::errors::Fatal("Unsupported current place %s", place));
-
-  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
-  // computing. They execute serially in device thread and block CUDA kernel
-  // launching in other GPU OPs. To improve performance, set them as kGpuSync
-  // and so that they would be dispatched to host thread.
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  if (op_name == kCoalesceTensor &&
-      (!platform::is_xpu_place(place) ||
-       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
-      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
-      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
-    return OpFuncType::kGpuSync;
-  }
-
-  // for memcpy explicitly called by user
-  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
-    return OpFuncType::kGpuSync;
-  }
-
-  if (op_name == "shape") {
-    return OpFuncType::kGpuSync;
-  }
-  return OpFuncType::kGpuAsync;
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index cf98694ba7dad3..5ce2358a7df799 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -139,6 +139,14 @@ class InstructionBase {
 
   virtual const std::string& Name() const = 0;
 
+  void InitInputsOutputsIds(
+      ::ir::Operation* op,
+      Scope* inner_scope,
+      const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+      const std::map<std::string, int>& var_name_2_id,
+      const std::unordered_map<const paddle::framework::Variable*, std::string>&
+          variable_2_var_name);
+
  protected:
   size_t id_;
 
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
index 1905fcd66c9d8a..b1a431fe20f25e 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
@@ -20,10 +20,17 @@
 #include <vector>
 
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/event.h"
+#include "paddle/ir/core/builtin_attribute.h"
 #include "paddle/ir/core/operation.h"
 #include "paddle/ir/core/value.h"
 
+#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/platform/collective_helper.h"
+
 namespace paddle {
 namespace framework {
 
@@ -48,57 +55,6 @@ std::vector<int> GetValueIds(
   return ids;
 }
 
-void PhiKernelInstruction::InitInputsOutputsIds(
-    ::ir::Operation* op,
-    Scope* inner_scope,
-    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
-    const std::map<std::string, int>& var_name_2_id,
-    const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name) {
-  std::unordered_map<ir::Value, std::vector<int>> inputs;
-  for (size_t i = 0; i < op->num_operands(); i++) {
-    ir::Value value = op->operand_source(i);
-    if (value) {
-      PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
-          phi::errors::PreconditionNotMet(
-              "input should in name map, [%d] 'th input of [%s] op",
-              i,
-              phi_op_name_));
-      std::vector<int> inputs_id = GetValueIds(value,
-                                               inner_scope,
-                                               value_2_var_name,
-                                               var_name_2_id,
-                                               variable_2_var_name);
-      inputs.emplace(value, inputs_id);
-    }
-  }
-  SetInputs(inputs);
-  VLOG(8) << "finish process inputs_index";
-  std::unordered_map<ir::Value, std::vector<int>> outputs;
-  for (size_t i = 0; i < op->num_results(); i++) {
-    ir::Value value = op->result(i);
-    if (value) {
-      PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
-          phi::errors::PreconditionNotMet(
-              "input should in name map, [%d] 'th input of [%s] op",
-              i,
-              phi_op_name_));
-      std::vector<int> outputs_id = GetValueIds(value,
-                                                inner_scope,
-                                                value_2_var_name,
-                                                var_name_2_id,
-                                                variable_2_var_name);
-      outputs.emplace(value, outputs_id);
-    }
-  }
-  SetOutputs(outputs);
-  VLOG(8) << "finish process outputs_index";
-}
-
 platform::DeviceContext* ParseDeviceContext(
     ir::Operation* op,
     platform::DeviceContext* origin_dev_ctx,
@@ -170,7 +126,8 @@ platform::DeviceContext* ParseDeviceContext(
   return origin_dev_ctx;
 }
 
-OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) {
+OpFuncType AnalyseOpFuncType(::ir::Operation* op,
+                             const platform::Place& place) {
   if (platform::is_cpu_place(place)) {
     return OpFuncType::kCpuSync;
   }
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index b3a2b6236e3ee1..b3d013443055d7 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
 
+#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
 #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
 #include "paddle/fluid/framework/scope.h"
@@ -28,10 +29,6 @@
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/core/type_defs.h"
 
-#include "paddle/ir/core/builtin_attribute.h"
-#include "paddle/ir/core/operation.h"
-#include "paddle/ir/core/value.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
index db6499aeaa3538..d5b7b5affc5d4b 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc
@@ -32,6 +32,7 @@
 #include "paddle/ir/core/operation.h"
 #include "paddle/ir/core/value.h"
 
+#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
index ab75b56b96def8..586aef975a643c 100644
--- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h
+++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h
@@ -108,9 +108,6 @@ class InterpreterBaseImpl {
   virtual const interpreter::StreamAnalyzer& GetStreamAnalyzer() const = 0;
 
   virtual bool IsSharedResultsBuild() const = 0;
-
-  virtual void SetScopePrefix(const std::string& prefix) = 0;
-  virtual const std::string& GetScopePrefix() const = 0;
 };
 
 inline void SetDeviceId(const platform::Place& place) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index edf2ca6666d46c..9ee34fcc39c115 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -126,13 +126,5 @@ void InterpreterCore::SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
   impl_->SetOutputHooks(hookfuncs);
 }
 
-void InterpreterCore::SetScopePrefix(const std::string& prefix) {
-  impl_->SetScopePrefix(prefix);
-}
-
-const std::string& InterpreterCore::GetScopePrefix() const {
-  return impl_->GetScopePrefix();
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index db26215b2ffc06..66f998bb557f6e 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -77,9 +77,6 @@ class InterpreterCore {
 
   void SetOutputHooks(const std::vector<HookFunc>& hookfuncs);
 
-  void SetScopePrefix(const std::string& prefix);
-  const std::string& GetScopePrefix() const;
-
  private:
   DISABLE_COPY_AND_ASSIGN(InterpreterCore);
 
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 748cd65fceb250..002b9345ecb584 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -113,10 +113,6 @@ NewIRInterpreter::NewIRInterpreter(
   };
 
   PrepareForCUDAGraphCapture();
-
-  std::stringstream ss;
-  ss << this;
-  scope_prefix_ = ss.str();
 }
 
 NewIRInterpreter::~NewIRInterpreter() {
@@ -1633,7 +1629,7 @@ void NewIRInterpreter::BuildInstruction() {
         vec_instruction_base_.emplace_back(
             std::make_unique<LegacyKernelInstruction>(op_idx++,
                                                       place_,
-                                                      (*it),
+                                                      op,
                                                       scope_,
                                                       local_scope_,
                                                       value_2_var_name_,
@@ -1643,7 +1639,7 @@ void NewIRInterpreter::BuildInstruction() {
         vec_instruction_base_.emplace_back(
             std::make_unique<PhiKernelInstruction>(op_idx++,
                                                    place_,
-                                                   (*it),
+                                                   op,
                                                    scope_,
                                                    local_scope_,
                                                    value_2_var_name_,
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index ce46c986b30761..1388a0276e4655 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -90,9 +90,6 @@ class NewIRInterpreter : public InterpreterBaseImpl {
 
   int GetIdByName(const std::string& name) const;
 
-  void SetScopePrefix(const std::string& prefix) override;
-  const std::string& GetScopePrefix() const override;
-
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index c2ed5e7a42889b..9156c46dc6dc27 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -1505,14 +1505,5 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() {
   trace_execute_order_ = trace_order;
 }
 
-void ProgramInterpreter::SetScopePrefix(const std::string& prefix) {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "Program Interpreter not support SetScopePrefix"));
-}
-const std::string& ProgramInterpreter::GetScopePrefix() const {
-  PADDLE_THROW(phi::errors::Unimplemented(
-      "Program Interpreter not support GetScopePrefix"));
-}
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 890ed9fde6412f..a942425609c189 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -85,9 +85,6 @@ class ProgramInterpreter : public InterpreterBaseImpl {
     hookfuncs_ = hookfuncs;
   }
 
-  void SetScopePrefix(const std::string& prefix) override;
-  const std::string& GetScopePrefix() const override;
-
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc
index eb6e19ac95f0db..0465a189d6f2e2 100644
--- a/paddle/fluid/ir/transforms/constant_folding_pass.cc
+++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc
@@ -100,7 +100,6 @@ class ConstantFoldingPattern : public ir::RewritePattern {
         paddle::dialect::PdOpLowerToKernelPass(temp_program.get()),
         &scope_,
         exe_config);
-    core.SetScopePrefix("ConstantFoldPrefix");
     paddle::framework::FetchList fetch_list = core.Run({});
 
     // TODO(liuyuanle): Support multiple output.

From cdf32c7b276cc80ca7133e0ecbe0f8298c6b9453 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sat, 5 Aug 2023 14:59:41 +0000
Subject: [PATCH 14/22] update

---
 .../new_executor/instruction/CMakeLists.txt   |   2 +-
 .../instruction/instruction_util.cc           | 175 ++++++++++++++++++
 .../instruction/instruction_util.h            | 133 +------------
 .../new_executor/new_ir_interpreter.cc        |   2 +-
 .../new_executor/standalone_executor.cc       |  10 +
 .../ir/transforms/pd_op_to_kernel_pass.cc     |   9 +-
 6 files changed, 197 insertions(+), 134 deletions(-)
 create mode 100644 paddle/fluid/framework/new_executor/instruction/instruction_util.cc

diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
index 88064749eaf027..93356e6b217a0f 100644
--- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(
   instruction_base
   SRCS instruction_base.cc phi_kernel_instruction.cc
-       legacy_kernel_instruction.cc
+       legacy_kernel_instruction.cc instruction_util.cc
   DEPS phi framework_proto)
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
new file mode 100644
index 00000000000000..d8ddc30633be07
--- /dev/null
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc
@@ -0,0 +1,175 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <map>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
+
+#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/event.h"
+#include "paddle/ir/core/builtin_attribute.h"
+#include "paddle/ir/core/operation.h"
+#include "paddle/ir/core/value.h"
+
+#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
+#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
+#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
+#include "paddle/fluid/platform/collective_helper.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<int> GetValueIds(
+    ir::Value value,
+    Scope* inner_scope,
+    const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
+    const std::map<std::string, int>& var_name_2_id,
+    const std::unordered_map<const paddle::framework::Variable*, std::string>&
+        variable_2_var_name) {
+  std::vector<int> ids;
+  std::string var_name = value_2_var_name.at(value);
+  ids.push_back(var_name_2_id.at(var_name));
+  // NOTE(zhangbo): Value may be a VariableRefArray
+  auto var = inner_scope->FindVar(var_name);
+  if (var->IsType<paddle::framework::VariableRefArray>()) {
+    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
+    for (auto item : var_array) {
+      ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
+    }
+  }
+  return ids;
+}
+
+platform::DeviceContext* ParseDeviceContext(
+    ir::Operation* op,
+    platform::DeviceContext* origin_dev_ctx,
+    const platform::Place& place,
+    const std::string& execution_stream,
+    const int stream_priority) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  interpreter::ContextManager& ctx_manager =
+      interpreter::ContextManager::Instance();
+
+  platform::DeviceContext* dev_ctx = nullptr;
+
+  // Only GPU needs an update; XPU does not, because the XPU memcpy op kernel
+  // is synchronous.
+  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
+    VLOG(6) << "Parse DeviceContext for " << op_name
+            << ", execution stream = " << execution_stream;
+    if (execution_stream != kDefaultStream) {
+      dev_ctx = ctx_manager
+                    .Get(std::string(kCustomStream) + "-" + execution_stream,
+                         place,
+                         stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+    if (op_name == interpreter::kMemcpyD2H) {
+      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    } else if (op_name == interpreter::kMemcpyH2D) {
+      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
+                    .get()
+                    .get();
+      interpreter::SetDeviceCommContext(op, dev_ctx);
+      return dev_ctx;
+    }
+
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
+    // with use_calc_stream==false by returning a device context taken from the
+    // global NCCLCommContext instance. Because when use_calc_stream==false, in
+    // OP kernel, the NCCL communication will be launched to the stream directly
+    // getting from the global NCCLCommContext instance rather than the
+    // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
+    // c_allreduce_op.h). Now it is just a temporary solution for ONLY
+    // c_allreduce_sum which is used in ResNet50 distributed training.
+    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
+                                                .dyn_cast<::ir::BoolAttribute>()
+                                                .data() == false) {
+      int ring_id =
+          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
+      return platform::NCCLCommContext::Instance()
+          .Get(ring_id, place)
+          ->dev_context();
+    }
+#endif
+  }
+
+  if (origin_dev_ctx != nullptr) {
+    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
+  }
+  return origin_dev_ctx;
+}
+
+OpFuncType AnalyseOpFuncType(::ir::Operation* op,
+                             const platform::Place& place) {
+  if (platform::is_cpu_place(place)) {
+    return OpFuncType::kCpuSync;
+  }
+
+  auto kernel_key = op->attributes()
+                        .at("kernel_key")
+                        .dyn_cast<dialect::KernelAttribute>()
+                        .data();
+  if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
+      phi::AllocationType::CPU) {
+    return OpFuncType::kCpuSync;
+  }
+
+  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
+                    true,
+                    phi::errors::Fatal("Unsupported current place %s", place));
+
+  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
+  // computing. They execute serially in device thread and block CUDA kernel
+  // launching in other GPU OPs. To improve performance, set them as kGpuSync
+  // so that they are dispatched to the host thread.
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
+  if (op_name == kCoalesceTensor &&
+      (!platform::is_xpu_place(place) ||
+       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
+      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
+      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
+    return OpFuncType::kGpuSync;
+  }
+
+  // for memcpy explicitly called by user
+  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
+    return OpFuncType::kGpuSync;
+  }
+
+  if (op_name == "shape") {
+    return OpFuncType::kGpuSync;
+  }
+  return OpFuncType::kGpuAsync;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
index b1a431fe20f25e..3d0aa3df9de963 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
@@ -25,12 +25,6 @@
 #include "paddle/ir/core/builtin_attribute.h"
 #include "paddle/ir/core/operation.h"
 #include "paddle/ir/core/value.h"
-
-#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
-#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
-#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
-#include "paddle/fluid/platform/collective_helper.h"
-
 namespace paddle {
 namespace framework {
 
@@ -40,136 +34,15 @@ std::vector<int> GetValueIds(
     const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
     const std::map<std::string, int>& var_name_2_id,
     const std::unordered_map<const paddle::framework::Variable*, std::string>&
-        variable_2_var_name) {
-  std::vector<int> ids;
-  std::string var_name = value_2_var_name.at(value);
-  ids.push_back(var_name_2_id.at(var_name));
-  // NOTE(zhangbo): Value maybe a VariableRefArray
-  auto var = inner_scope->FindVar(var_name);
-  if (var->IsType<paddle::framework::VariableRefArray>()) {
-    auto& var_array = var->Get<paddle::framework::VariableRefArray>();
-    for (auto item : var_array) {
-      ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
-    }
-  }
-  return ids;
-}
+        variable_2_var_name);
 
 platform::DeviceContext* ParseDeviceContext(
     ir::Operation* op,
     platform::DeviceContext* origin_dev_ctx,
     const platform::Place& place,
     const std::string& execution_stream,
-    const int stream_priority) {
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  interpreter::ContextManager& ctx_manager =
-      interpreter::ContextManager::Instance();
-
-  platform::DeviceContext* dev_ctx = nullptr;
-
-  // only gpu need update. xpu not need, because xpu memcpy op kernel is
-  // synchronous.
-  if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
-    VLOG(6) << "Parse DeviceContext for " << op_name
-            << ", execution stream = " << execution_stream;
-    if (execution_stream != kDefaultStream) {
-      dev_ctx = ctx_manager
-                    .Get(std::string(kCustomStream) + "-" + execution_stream,
-                         place,
-                         stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-    if (op_name == interpreter::kMemcpyD2H) {
-      dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    } else if (op_name == interpreter::kMemcpyH2D) {
-      dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
-                    .get()
-                    .get();
-      interpreter::SetDeviceCommContext(op, dev_ctx);
-      return dev_ctx;
-    }
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
-    // with use_cal_stream==false by returning a device context getting from the
-    // global NCCLCommContext instance. Because when use_calc_stream==false, in
-    // OP kernel, the NCCL communication will be launched to the stream directly
-    // getting from the global NCCLCommContext instance rather than the
-    // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
-    // c_allreduce_op.h). Now it is just a temporary solution for ONLY
-    // c_allreduce_sum which is used in ResNet50 distributed training.
-    if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
-                                                .dyn_cast<::ir::BoolAttribute>()
-                                                .data() == false) {
-      int ring_id =
-          op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
-      return platform::NCCLCommContext::Instance()
-          .Get(ring_id, place)
-          ->dev_context();
-    }
-#endif
-  }
-
-  if (origin_dev_ctx != nullptr) {
-    interpreter::SetDeviceCommContext(op, origin_dev_ctx);
-  }
-  return origin_dev_ctx;
-}
-
-OpFuncType AnalyseOpFuncType(::ir::Operation* op,
-                             const platform::Place& place) {
-  if (platform::is_cpu_place(place)) {
-    return OpFuncType::kCpuSync;
-  }
-
-  auto kernel_key = op->attributes()
-                        .at("kernel_key")
-                        .dyn_cast<dialect::KernelAttribute>()
-                        .data();
-  if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
-      phi::AllocationType::CPU) {
-    return OpFuncType::kCpuSync;
-  }
-
-  PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
-                    true,
-                    phi::errors::Fatal("Unsupported current place %s", place));
-
-  // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
-  // computing. They execute serially in device thread and block CUDA kernel
-  // launching in other GPU OPs. To improve performance, set them as kGpuSync
-  // and so that they would be dispatched to host thread.
-  auto op_attributes = op->attributes();
-  auto op_name =
-      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
-  if (op_name == kCoalesceTensor &&
-      (!platform::is_xpu_place(place) ||
-       op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
-      op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
-      op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
-    return OpFuncType::kGpuSync;
-  }
-
-  // for memcpy explicitly called by user
-  if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
-    return OpFuncType::kGpuSync;
-  }
-
-  if (op_name == "shape") {
-    return OpFuncType::kGpuSync;
-  }
-  return OpFuncType::kGpuAsync;
-}
+    const int stream_priority);
 
+OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place);
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index a41455af1d2258..c82e3cbc28b47b 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1619,7 +1619,7 @@ void NewIRInterpreter::BuildInstruction() {
       if (op_name == "builtin.combine" || op_name == "pd.feed" ||
           op_name == "builtin.set_parameter" ||
           op_name == "builtin.get_parameter" || op_name == "builtin.slice" ||
-          op_name == "pd.data" || op_name == "pd.shaddow_output") {
+          op_name == "pd.data" || op_name == "pd.shadow_output") {
         VLOG(6) << "skip process " << op_name;
         continue;
       }
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index ac60a546f983ec..87b48c4c81ffb6 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -90,12 +90,21 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
 
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
+      std::cerr << "print" << std::endl;
+      base_program->Print(std::cout);
+      kernel_program->Print(std::cout);
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
                                             fetch_var_names_,
                                             std::move(kernel_program),
                                             scope_,
                                             execution_config));
+      std::stringstream pre_ss;
+      pre_ss << interpretercores_.back()->Impl();
+
+      for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+        fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i];
+      }
     } else {
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
@@ -160,6 +169,7 @@ paddle::framework::FetchList StandaloneExecutor::Run(
   if (FLAGS_enable_new_ir_in_executor) {
     framework::FetchList fetch_res;
 
+    std::cerr << "before fetch " << std::endl;
     for (auto& var_name : fetch_var_names_) {
       auto* var = scope_->FindVar(var_name);
       fetch_res.push_back(var->Get<phi::DenseTensor>());
diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
index e53e1058a84281..ccbe7316b530ea 100644
--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
@@ -59,6 +59,10 @@ const std::unordered_set<std::string> UnchangeOutputOps = {
     "builtin.get_parameter",
     "pd.shadow_output"};
 
+const std::unordered_set<std::string> LegacyOpList = {
+    "pd.fused_softmax_mask_upper_triangle",
+    "pd.fused_softmax_mask_upper_triangle_grad"};
+
 bool NeedFallBackCpu(const ir::Operation* op,
                      const std::string& kernel_fn_name,
                      const phi::KernelKey& kernel_key) {
@@ -401,7 +405,8 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
           kernel_fn_str, kernel_key);
       auto args_def = phi_kernel.args_def();
       auto output_defs = args_def.output_defs();
-      if (!UnchangeOutputOps.count(op_item->name())) {
+      if (!UnchangeOutputOps.count(op_item->name()) &&
+          !LegacyOpList.count(op_item->name())) {
         PADDLE_ENFORCE_EQ(
             op_item->num_results(),
             output_defs.size(),
@@ -413,7 +418,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
       for (size_t i = 0; i < op_item->num_results(); ++i) {
         phi::Place out_place;
         if ((!UnchangeOutputOps.count(op_item->name())) &&
-            phi_kernel.IsValid()) {
+            (!LegacyOpList.count(op_item->name())) && phi_kernel.IsValid()) {
           out_place = phi::TransToPhiPlace(output_defs[i].backend);
         } else {
           out_place = phi::TransToPhiPlace(kernel_key.backend());

From 79a57bfea913aa9678416f2ecdd95dd0594ca05c Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sun, 6 Aug 2023 03:28:09 +0000
Subject: [PATCH 15/22] fix

---
 .../new_executor/standalone_executor.cc       |  13 +-
 .../ir/phi_kernel_adaptor/phi_kernel_util.cc  |   2 +-
 .../ir/transforms/constant_folding_pass.cc    |   1 +
 .../fused_softmax_mask_upper_triangle_op.cu   |   3 -
 test/legacy_test/eager_op_test.py             |   1 -
 test/legacy_test/test_channel_shuffle.py      | 471 +++++++++---------
 6 files changed, 237 insertions(+), 254 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index 87b48c4c81ffb6..0e1d2de6bed29c 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -90,21 +90,12 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
 
       auto kernel_program =
           paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
-      std::cerr << "print" << std::endl;
-      base_program->Print(std::cout);
-      kernel_program->Print(std::cout);
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
                                             fetch_var_names_,
                                             std::move(kernel_program),
                                             scope_,
                                             execution_config));
-      std::stringstream pre_ss;
-      pre_ss << interpretercores_.back()->Impl();
-
-      for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
-        fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i];
-      }
     } else {
       interpretercores_.emplace_back(
           std::make_shared<InterpreterCore>(place_,
@@ -162,14 +153,12 @@ paddle::framework::FetchList StandaloneExecutor::Run(
       interpretercores_[job_idx]->ShareBuildResultsFrom(
           interpretercores_[type_to_first_id[job_type]]);
     }
-    interpretercores_[job_idx]->BetaRun(feed_names, /*need_fetch = */ false);
+    interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false);
   }
 
   // return Fetch Tensors
   if (FLAGS_enable_new_ir_in_executor) {
     framework::FetchList fetch_res;
-
-    std::cerr << "before fetch " << std::endl;
     for (auto& var_name : fetch_var_names_) {
       auto* var = scope_->FindVar(var_name);
       fetch_res.push_back(var->Get<phi::DenseTensor>());
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index 3728cd48ea1f10..02eafff2b83331 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -230,7 +230,7 @@ void HandleForSpecialOp(
     auto fetch_src_name =
         op->attributes().at("name").dyn_cast<ir::StrAttribute>().AsString();
 
-    auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + "@fetch";
+    auto fetch_var_name = fetch_src_name + "@fetch";
     auto* var = const_cast<paddle::framework::Scope*>(inner_scope->root())
                     ->Var(fetch_var_name);
     var->GetMutable<phi::DenseTensor>();
diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc
index 0465a189d6f2e2..cebcf5d02f7701 100644
--- a/paddle/fluid/ir/transforms/constant_folding_pass.cc
+++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc
@@ -100,6 +100,7 @@ class ConstantFoldingPattern : public ir::RewritePattern {
         paddle::dialect::PdOpLowerToKernelPass(temp_program.get()),
         &scope_,
         exe_config);
+
     paddle::framework::FetchList fetch_list = core.Run({});
 
     // TODO(liuyuanle): Support multiple output.
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
index c098a11537a07a..779ee234071af0 100644
--- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
+++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu
@@ -475,8 +475,6 @@ template <typename T, typename DeviceContext>
 class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    std::cerr << "comute grad " << std::endl;
-
     auto* grad_x =
         context.Output<phi::DenseTensor>(framework::GradVarName("X"));
     auto* grad_y =
@@ -487,7 +485,6 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel<T> {
     auto* grad_y_data = grad_y->data<T>();
     auto* softmax_rst_data = softmax_rst->data<T>();
 
-    std::cerr << "grad x" << grad_x->dims() << std::endl;
     auto y_dim = grad_y->dims();
     auto batches = y_dim[0];
     auto attn_heads = y_dim[1];
diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py
index 485416a3f04af3..a68f81b2f25a41 100644
--- a/test/legacy_test/eager_op_test.py
+++ b/test/legacy_test/eager_op_test.py
@@ -3038,7 +3038,6 @@ def _get_gradient(
                 compiled_prog = fluid.CompiledProgram(prog, build_strategy)
                 prog = compiled_prog
             executor = fluid.Executor(place)
-            print(prog)
             res = list(
                 map(
                     np.array,
diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py
index f8b6ef1df9514b..90843648120324 100644
--- a/test/legacy_test/test_channel_shuffle.py
+++ b/test/legacy_test/test_channel_shuffle.py
@@ -15,12 +15,9 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest, convert_float_to_uint16
+from eager_op_test import OpTest
 
 import paddle
-import paddle.nn.functional as F
-from paddle import fluid
-from paddle.fluid import core
 
 
 def channel_shuffle_np(x, groups, data_format="NCHW"):
@@ -70,253 +67,253 @@ def init_dtype(self):
     def init_data_format(self):
         self.format = "NCHW"
 
-    def test_check_output(self):
-        self.check_output()
+    # def test_check_output(self):
+    #     self.check_output()
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-class TestChannelLast(TestChannelShuffleOp):
-    def init_data_format(self):
-        self.format = "NHWC"
-
-
-class TestChannelShuffleAPI(unittest.TestCase):
-    def setUp(self):
-        self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
-        self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
-        self.out_1_np = channel_shuffle_np(self.x_1_np, 3)
-        self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
-
-    def test_static_graph_functional(self):
-        for use_cuda in (
-            [False, True] if core.is_compiled_with_cuda() else [False]
-        ):
-            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-            paddle.enable_static()
-            x_1 = paddle.static.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64"
-            )
-            x_2 = paddle.static.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64"
-            )
-            out_1 = F.channel_shuffle(x_1, 3)
-            out_2 = F.channel_shuffle(x_2, 3, "NHWC")
-
-            exe = paddle.static.Executor(place=place)
-            res_1 = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_1_np},
-                fetch_list=out_1,
-                use_prune=True,
-            )
-
-            res_2 = exe.run(
-                fluid.default_main_program(),
-                feed={"x2": self.x_2_np},
-                fetch_list=out_2,
-                use_prune=True,
-            )
-
-            np.testing.assert_allclose(res_1[0], self.out_1_np)
-            np.testing.assert_allclose(res_2[0], self.out_2_np)
-
-    # same test between layer and functional in this op.
-    def test_static_graph_layer(self):
-        for use_cuda in (
-            [False, True] if core.is_compiled_with_cuda() else [False]
-        ):
-            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-            paddle.enable_static()
-            x_1 = paddle.static.data(
-                name="x", shape=[2, 9, 4, 4], dtype="float64"
-            )
-            x_2 = paddle.static.data(
-                name="x2", shape=[2, 4, 4, 9], dtype="float64"
-            )
-            # init instance
-            ps_1 = paddle.nn.ChannelShuffle(3)
-            ps_2 = paddle.nn.ChannelShuffle(3, "NHWC")
-            out_1 = ps_1(x_1)
-            out_2 = ps_2(x_2)
-            out_1_np = channel_shuffle_np(self.x_1_np, 3)
-            out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
-
-            exe = paddle.static.Executor(place=place)
-            res_1 = exe.run(
-                fluid.default_main_program(),
-                feed={"x": self.x_1_np},
-                fetch_list=out_1,
-                use_prune=True,
-            )
-
-            res_2 = exe.run(
-                fluid.default_main_program(),
-                feed={"x2": self.x_2_np},
-                fetch_list=out_2,
-                use_prune=True,
-            )
-
-            np.testing.assert_allclose(res_1[0], out_1_np)
-            np.testing.assert_allclose(res_2[0], out_2_np)
-
-    def run_dygraph(self, groups, data_format):
-        n, c, h, w = 2, 9, 4, 4
-
-        if data_format == "NCHW":
-            shape = [n, c, h, w]
-        if data_format == "NHWC":
-            shape = [n, h, w, c]
-
-        x = np.random.random(shape).astype("float64")
-
-        npresult = channel_shuffle_np(x, groups, data_format)
-
-        for use_cuda in (
-            [False, True] if core.is_compiled_with_cuda() else [False]
-        ):
-            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-            paddle.disable_static(place=place)
-
-            channel_shuffle = paddle.nn.ChannelShuffle(
-                groups, data_format=data_format
-            )
-            result = channel_shuffle(paddle.to_tensor(x))
-
-            np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05)
-
-            result_functional = F.channel_shuffle(
-                paddle.to_tensor(x), 3, data_format
-            )
-            np.testing.assert_allclose(
-                result_functional.numpy(), npresult, rtol=1e-05
-            )
-
-            channel_shuffle_str = f'groups={groups}'
-            if data_format != 'NCHW':
-                channel_shuffle_str += f', data_format={data_format}'
-            self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str)
-
-    def test_dygraph1(self):
-        self.run_dygraph(3, "NCHW")
-
-    def test_dygraph2(self):
-        self.run_dygraph(3, "NHWC")
-
-
-class TestChannelShuffleError(unittest.TestCase):
-    def test_error_functional(self):
-        def error_input():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([9, 4, 4]).astype("float64")
-                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3)
-
-        self.assertRaises(ValueError, error_input)
-
-        def error_groups_1():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33)
-
-        self.assertRaises(TypeError, error_groups_1)
-
-        def error_groups_2():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1)
-
-        self.assertRaises(ValueError, error_groups_2)
-
-        def error_data_format():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                channel_shuffle = F.channel_shuffle(
-                    paddle.to_tensor(x), 3, "WOW"
-                )
+# class TestChannelLast(TestChannelShuffleOp):
+#     def init_data_format(self):
+#         self.format = "NHWC"
+
+
+# class TestChannelShuffleAPI(unittest.TestCase):
+#     def setUp(self):
+#         self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
+#         self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
+#         self.out_1_np = channel_shuffle_np(self.x_1_np, 3)
+#         self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+#     def test_static_graph_functional(self):
+#         for use_cuda in (
+#             [False, True] if core.is_compiled_with_cuda() else [False]
+#         ):
+#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+#             paddle.enable_static()
+#             x_1 = paddle.static.data(
+#                 name="x", shape=[2, 9, 4, 4], dtype="float64"
+#             )
+#             x_2 = paddle.static.data(
+#                 name="x2", shape=[2, 4, 4, 9], dtype="float64"
+#             )
+#             out_1 = F.channel_shuffle(x_1, 3)
+#             out_2 = F.channel_shuffle(x_2, 3, "NHWC")
+
+#             exe = paddle.static.Executor(place=place)
+#             res_1 = exe.run(
+#                 fluid.default_main_program(),
+#                 feed={"x": self.x_1_np},
+#                 fetch_list=out_1,
+#                 use_prune=True,
+#             )
+
+#             res_2 = exe.run(
+#                 fluid.default_main_program(),
+#                 feed={"x2": self.x_2_np},
+#                 fetch_list=out_2,
+#                 use_prune=True,
+#             )
+
+#             np.testing.assert_allclose(res_1[0], self.out_1_np)
+#             np.testing.assert_allclose(res_2[0], self.out_2_np)
+
+#     # same test between layer and functional in this op.
+#     def test_static_graph_layer(self):
+#         for use_cuda in (
+#             [False, True] if core.is_compiled_with_cuda() else [False]
+#         ):
+#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+#             paddle.enable_static()
+#             x_1 = paddle.static.data(
+#                 name="x", shape=[2, 9, 4, 4], dtype="float64"
+#             )
+#             x_2 = paddle.static.data(
+#                 name="x2", shape=[2, 4, 4, 9], dtype="float64"
+#             )
+#             # init instance
+#             ps_1 = paddle.nn.ChannelShuffle(3)
+#             ps_2 = paddle.nn.ChannelShuffle(3, "NHWC")
+#             out_1 = ps_1(x_1)
+#             out_2 = ps_2(x_2)
+#             out_1_np = channel_shuffle_np(self.x_1_np, 3)
+#             out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+#             exe = paddle.static.Executor(place=place)
+#             res_1 = exe.run(
+#                 fluid.default_main_program(),
+#                 feed={"x": self.x_1_np},
+#                 fetch_list=out_1,
+#                 use_prune=True,
+#             )
+
+#             res_2 = exe.run(
+#                 fluid.default_main_program(),
+#                 feed={"x2": self.x_2_np},
+#                 fetch_list=out_2,
+#                 use_prune=True,
+#             )
+
+#             np.testing.assert_allclose(res_1[0], out_1_np)
+#             np.testing.assert_allclose(res_2[0], out_2_np)
+
+#     def run_dygraph(self, groups, data_format):
+#         n, c, h, w = 2, 9, 4, 4
+
+#         if data_format == "NCHW":
+#             shape = [n, c, h, w]
+#         if data_format == "NHWC":
+#             shape = [n, h, w, c]
+
+#         x = np.random.random(shape).astype("float64")
+
+#         npresult = channel_shuffle_np(x, groups, data_format)
+
+#         for use_cuda in (
+#             [False, True] if core.is_compiled_with_cuda() else [False]
+#         ):
+#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+#             paddle.disable_static(place=place)
+
+#             channel_shuffle = paddle.nn.ChannelShuffle(
+#                 groups, data_format=data_format
+#             )
+#             result = channel_shuffle(paddle.to_tensor(x))
+
+#             np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05)
+
+#             result_functional = F.channel_shuffle(
+#                 paddle.to_tensor(x), 3, data_format
+#             )
+#             np.testing.assert_allclose(
+#                 result_functional.numpy(), npresult, rtol=1e-05
+#             )
+
+#             channel_shuffle_str = f'groups={groups}'
+#             if data_format != 'NCHW':
+#                 channel_shuffle_str += f', data_format={data_format}'
+#             self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str)
+
+#     def test_dygraph1(self):
+#         self.run_dygraph(3, "NCHW")
+
+#     def test_dygraph2(self):
+#         self.run_dygraph(3, "NHWC")
+
+
+# class TestChannelShuffleError(unittest.TestCase):
+#     def test_error_functional(self):
+#         def error_input():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([9, 4, 4]).astype("float64")
+#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3)
+
+#         self.assertRaises(ValueError, error_input)
+
+#         def error_groups_1():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33)
+
+#         self.assertRaises(TypeError, error_groups_1)
+
+#         def error_groups_2():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1)
+
+#         self.assertRaises(ValueError, error_groups_2)
+
+#         def error_data_format():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 channel_shuffle = F.channel_shuffle(
+#                     paddle.to_tensor(x), 3, "WOW"
+#                 )
+
+#         self.assertRaises(ValueError, error_data_format)
+
+#     def test_error_layer(self):
+#         def error_input_layer():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([9, 4, 4]).astype("float64")
+#                 cs = paddle.nn.ChannelShuffle(3)
+#                 cs(paddle.to_tensor(x))
+
+#         self.assertRaises(ValueError, error_input_layer)
+
+#         def error_groups_layer_1():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 cs = paddle.nn.ChannelShuffle(3.33)
+
+#         self.assertRaises(TypeError, error_groups_layer_1)
 
-        self.assertRaises(ValueError, error_data_format)
+#         def error_groups_layer_2():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 cs = paddle.nn.ChannelShuffle(-1)
 
-    def test_error_layer(self):
-        def error_input_layer():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([9, 4, 4]).astype("float64")
-                cs = paddle.nn.ChannelShuffle(3)
-                cs(paddle.to_tensor(x))
+#         self.assertRaises(ValueError, error_groups_layer_2)
 
-        self.assertRaises(ValueError, error_input_layer)
+#         def error_data_format_layer():
+#             with paddle.fluid.dygraph.guard():
+#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
+#                 cs = paddle.nn.ChannelShuffle(3, "MEOW")
 
-        def error_groups_layer_1():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                cs = paddle.nn.ChannelShuffle(3.33)
+#         self.assertRaises(ValueError, error_data_format_layer)
 
-        self.assertRaises(TypeError, error_groups_layer_1)
 
-        def error_groups_layer_2():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                cs = paddle.nn.ChannelShuffle(-1)
+# class TestChannelShuffleFP16OP(TestChannelShuffleOp):
+#     def init_dtype(self):
+#         self.dtype = np.float16
 
-        self.assertRaises(ValueError, error_groups_layer_2)
 
-        def error_data_format_layer():
-            with paddle.fluid.dygraph.guard():
-                x = np.random.random([2, 9, 4, 4]).astype("float64")
-                cs = paddle.nn.ChannelShuffle(3, "MEOW")
+# @unittest.skipIf(
+#     not core.is_compiled_with_cuda()
+#     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+#     "core is not complied with CUDA and not support the bfloat16",
+# )
+# class TestChannelShuffleBF16OP(OpTest):
+#     def setUp(self):
+#         self.op_type = "channel_shuffle"
+#         self.init_data_format()
+#         n, c, h, w = 2, 9, 4, 4
+#         self.python_api = paddle.nn.functional.channel_shuffle
+#         self.dtype = np.uint16
+#         self.use_mkldnn = False
+
+#         if self.format == "NCHW":
+#             shape = [n, c, h, w]
+#         if self.format == "NHWC":
+#             shape = [n, h, w, c]
+
+#         groups = 3
+
+#         x = np.random.random(shape).astype('float32')
+#         out = channel_shuffle_np(x, groups, self.format)
+#         self.inputs = {'X': convert_float_to_uint16(x)}
+#         self.attrs = {'groups': groups, "data_format": self.format}
+#         self.outputs = {'Out': convert_float_to_uint16(out)}
 
-        self.assertRaises(ValueError, error_data_format_layer)
-
-
-class TestChannelShuffleFP16OP(TestChannelShuffleOp):
-    def init_dtype(self):
-        self.dtype = np.float16
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the bfloat16",
-)
-class TestChannelShuffleBF16OP(OpTest):
-    def setUp(self):
-        self.op_type = "channel_shuffle"
-        self.init_data_format()
-        n, c, h, w = 2, 9, 4, 4
-        self.python_api = paddle.nn.functional.channel_shuffle
-        self.dtype = np.uint16
-        self.use_mkldnn = False
-
-        if self.format == "NCHW":
-            shape = [n, c, h, w]
-        if self.format == "NHWC":
-            shape = [n, h, w, c]
-
-        groups = 3
-
-        x = np.random.random(shape).astype('float32')
-        out = channel_shuffle_np(x, groups, self.format)
-        self.inputs = {'X': convert_float_to_uint16(x)}
-        self.attrs = {'groups': groups, "data_format": self.format}
-        self.outputs = {'Out': convert_float_to_uint16(out)}
-
-    def init_data_format(self):
-        self.format = "NCHW"
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-        )
+#     def init_data_format(self):
+#         self.format = "NCHW"
+
+#     def test_check_output(self):
+#         place = core.CUDAPlace(0)
+#         self.check_output_with_place(place)
+
+#     def test_check_grad(self):
+#         place = core.CUDAPlace(0)
+#         self.check_grad_with_place(
+#             place,
+#             ['X'],
+#             'Out',
+#         )
 
 
 if __name__ == '__main__':

From 33947b0bbaa55e13e0d54eb4fcbed88a489ddc53 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sun, 6 Aug 2023 03:37:44 +0000
Subject: [PATCH 16/22] revert channel shuffle test

---
 test/legacy_test/test_channel_shuffle.py | 467 ++++++++++++-----------
 1 file changed, 235 insertions(+), 232 deletions(-)

diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py
index 90843648120324..8da783e6773a26 100644
--- a/test/legacy_test/test_channel_shuffle.py
+++ b/test/legacy_test/test_channel_shuffle.py
@@ -15,9 +15,12 @@
 import unittest
 
 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16
 
 import paddle
+import paddle.nn.functional as F
+from paddle import fluid
+from paddle.fluid import core
 
 
 def channel_shuffle_np(x, groups, data_format="NCHW"):
@@ -74,246 +77,246 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Out')
 
 
-# class TestChannelLast(TestChannelShuffleOp):
-#     def init_data_format(self):
-#         self.format = "NHWC"
-
-
-# class TestChannelShuffleAPI(unittest.TestCase):
-#     def setUp(self):
-#         self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
-#         self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
-#         self.out_1_np = channel_shuffle_np(self.x_1_np, 3)
-#         self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
-
-#     def test_static_graph_functional(self):
-#         for use_cuda in (
-#             [False, True] if core.is_compiled_with_cuda() else [False]
-#         ):
-#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-#             paddle.enable_static()
-#             x_1 = paddle.static.data(
-#                 name="x", shape=[2, 9, 4, 4], dtype="float64"
-#             )
-#             x_2 = paddle.static.data(
-#                 name="x2", shape=[2, 4, 4, 9], dtype="float64"
-#             )
-#             out_1 = F.channel_shuffle(x_1, 3)
-#             out_2 = F.channel_shuffle(x_2, 3, "NHWC")
-
-#             exe = paddle.static.Executor(place=place)
-#             res_1 = exe.run(
-#                 fluid.default_main_program(),
-#                 feed={"x": self.x_1_np},
-#                 fetch_list=out_1,
-#                 use_prune=True,
-#             )
-
-#             res_2 = exe.run(
-#                 fluid.default_main_program(),
-#                 feed={"x2": self.x_2_np},
-#                 fetch_list=out_2,
-#                 use_prune=True,
-#             )
-
-#             np.testing.assert_allclose(res_1[0], self.out_1_np)
-#             np.testing.assert_allclose(res_2[0], self.out_2_np)
-
-#     # same test between layer and functional in this op.
-#     def test_static_graph_layer(self):
-#         for use_cuda in (
-#             [False, True] if core.is_compiled_with_cuda() else [False]
-#         ):
-#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-#             paddle.enable_static()
-#             x_1 = paddle.static.data(
-#                 name="x", shape=[2, 9, 4, 4], dtype="float64"
-#             )
-#             x_2 = paddle.static.data(
-#                 name="x2", shape=[2, 4, 4, 9], dtype="float64"
-#             )
-#             # init instance
-#             ps_1 = paddle.nn.ChannelShuffle(3)
-#             ps_2 = paddle.nn.ChannelShuffle(3, "NHWC")
-#             out_1 = ps_1(x_1)
-#             out_2 = ps_2(x_2)
-#             out_1_np = channel_shuffle_np(self.x_1_np, 3)
-#             out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
-
-#             exe = paddle.static.Executor(place=place)
-#             res_1 = exe.run(
-#                 fluid.default_main_program(),
-#                 feed={"x": self.x_1_np},
-#                 fetch_list=out_1,
-#                 use_prune=True,
-#             )
-
-#             res_2 = exe.run(
-#                 fluid.default_main_program(),
-#                 feed={"x2": self.x_2_np},
-#                 fetch_list=out_2,
-#                 use_prune=True,
-#             )
-
-#             np.testing.assert_allclose(res_1[0], out_1_np)
-#             np.testing.assert_allclose(res_2[0], out_2_np)
-
-#     def run_dygraph(self, groups, data_format):
-#         n, c, h, w = 2, 9, 4, 4
-
-#         if data_format == "NCHW":
-#             shape = [n, c, h, w]
-#         if data_format == "NHWC":
-#             shape = [n, h, w, c]
-
-#         x = np.random.random(shape).astype("float64")
-
-#         npresult = channel_shuffle_np(x, groups, data_format)
-
-#         for use_cuda in (
-#             [False, True] if core.is_compiled_with_cuda() else [False]
-#         ):
-#             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
-
-#             paddle.disable_static(place=place)
-
-#             channel_shuffle = paddle.nn.ChannelShuffle(
-#                 groups, data_format=data_format
-#             )
-#             result = channel_shuffle(paddle.to_tensor(x))
-
-#             np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05)
-
-#             result_functional = F.channel_shuffle(
-#                 paddle.to_tensor(x), 3, data_format
-#             )
-#             np.testing.assert_allclose(
-#                 result_functional.numpy(), npresult, rtol=1e-05
-#             )
-
-#             channel_shuffle_str = f'groups={groups}'
-#             if data_format != 'NCHW':
-#                 channel_shuffle_str += f', data_format={data_format}'
-#             self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str)
-
-#     def test_dygraph1(self):
-#         self.run_dygraph(3, "NCHW")
-
-#     def test_dygraph2(self):
-#         self.run_dygraph(3, "NHWC")
-
-
-# class TestChannelShuffleError(unittest.TestCase):
-#     def test_error_functional(self):
-#         def error_input():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([9, 4, 4]).astype("float64")
-#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3)
-
-#         self.assertRaises(ValueError, error_input)
-
-#         def error_groups_1():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33)
-
-#         self.assertRaises(TypeError, error_groups_1)
-
-#         def error_groups_2():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1)
-
-#         self.assertRaises(ValueError, error_groups_2)
-
-#         def error_data_format():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 channel_shuffle = F.channel_shuffle(
-#                     paddle.to_tensor(x), 3, "WOW"
-#                 )
-
-#         self.assertRaises(ValueError, error_data_format)
-
-#     def test_error_layer(self):
-#         def error_input_layer():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([9, 4, 4]).astype("float64")
-#                 cs = paddle.nn.ChannelShuffle(3)
-#                 cs(paddle.to_tensor(x))
-
-#         self.assertRaises(ValueError, error_input_layer)
-
-#         def error_groups_layer_1():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 cs = paddle.nn.ChannelShuffle(3.33)
-
-#         self.assertRaises(TypeError, error_groups_layer_1)
+class TestChannelLast(TestChannelShuffleOp):
+    def init_data_format(self):
+        self.format = "NHWC"
+
+
+class TestChannelShuffleAPI(unittest.TestCase):
+    def setUp(self):
+        self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64")
+        self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64")
+        self.out_1_np = channel_shuffle_np(self.x_1_np, 3)
+        self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+    def test_static_graph_functional(self):
+        for use_cuda in (
+            [False, True] if core.is_compiled_with_cuda() else [False]
+        ):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.static.data(
+                name="x", shape=[2, 9, 4, 4], dtype="float64"
+            )
+            x_2 = paddle.static.data(
+                name="x2", shape=[2, 4, 4, 9], dtype="float64"
+            )
+            out_1 = F.channel_shuffle(x_1, 3)
+            out_2 = F.channel_shuffle(x_2, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_1_np},
+                fetch_list=out_1,
+                use_prune=True,
+            )
+
+            res_2 = exe.run(
+                fluid.default_main_program(),
+                feed={"x2": self.x_2_np},
+                fetch_list=out_2,
+                use_prune=True,
+            )
+
+            np.testing.assert_allclose(res_1[0], self.out_1_np)
+            np.testing.assert_allclose(res_2[0], self.out_2_np)
+
+    # same test between layer and functional in this op.
+    def test_static_graph_layer(self):
+        for use_cuda in (
+            [False, True] if core.is_compiled_with_cuda() else [False]
+        ):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.enable_static()
+            x_1 = paddle.static.data(
+                name="x", shape=[2, 9, 4, 4], dtype="float64"
+            )
+            x_2 = paddle.static.data(
+                name="x2", shape=[2, 4, 4, 9], dtype="float64"
+            )
+            # init instance
+            ps_1 = paddle.nn.ChannelShuffle(3)
+            ps_2 = paddle.nn.ChannelShuffle(3, "NHWC")
+            out_1 = ps_1(x_1)
+            out_2 = ps_2(x_2)
+            out_1_np = channel_shuffle_np(self.x_1_np, 3)
+            out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC")
+
+            exe = paddle.static.Executor(place=place)
+            res_1 = exe.run(
+                fluid.default_main_program(),
+                feed={"x": self.x_1_np},
+                fetch_list=out_1,
+                use_prune=True,
+            )
+
+            res_2 = exe.run(
+                fluid.default_main_program(),
+                feed={"x2": self.x_2_np},
+                fetch_list=out_2,
+                use_prune=True,
+            )
+
+            np.testing.assert_allclose(res_1[0], out_1_np)
+            np.testing.assert_allclose(res_2[0], out_2_np)
+
+    def run_dygraph(self, groups, data_format):
+        n, c, h, w = 2, 9, 4, 4
+
+        if data_format == "NCHW":
+            shape = [n, c, h, w]
+        if data_format == "NHWC":
+            shape = [n, h, w, c]
+
+        x = np.random.random(shape).astype("float64")
+
+        npresult = channel_shuffle_np(x, groups, data_format)
+
+        for use_cuda in (
+            [False, True] if core.is_compiled_with_cuda() else [False]
+        ):
+            place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
+
+            paddle.disable_static(place=place)
+
+            channel_shuffle = paddle.nn.ChannelShuffle(
+                groups, data_format=data_format
+            )
+            result = channel_shuffle(paddle.to_tensor(x))
+
+            np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05)
+
+            result_functional = F.channel_shuffle(
+                paddle.to_tensor(x), 3, data_format
+            )
+            np.testing.assert_allclose(
+                result_functional.numpy(), npresult, rtol=1e-05
+            )
+
+            channel_shuffle_str = f'groups={groups}'
+            if data_format != 'NCHW':
+                channel_shuffle_str += f', data_format={data_format}'
+            self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str)
+
+    def test_dygraph1(self):
+        self.run_dygraph(3, "NCHW")
+
+    def test_dygraph2(self):
+        self.run_dygraph(3, "NHWC")
+
+
+class TestChannelShuffleError(unittest.TestCase):
+    def test_error_functional(self):
+        def error_input():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([9, 4, 4]).astype("float64")
+                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3)
+
+        self.assertRaises(ValueError, error_input)
+
+        def error_groups_1():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33)
+
+        self.assertRaises(TypeError, error_groups_1)
+
+        def error_groups_2():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1)
+
+        self.assertRaises(ValueError, error_groups_2)
+
+        def error_data_format():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                channel_shuffle = F.channel_shuffle(
+                    paddle.to_tensor(x), 3, "WOW"
+                )
 
-#         def error_groups_layer_2():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 cs = paddle.nn.ChannelShuffle(-1)
+        self.assertRaises(ValueError, error_data_format)
 
-#         self.assertRaises(ValueError, error_groups_layer_2)
+    def test_error_layer(self):
+        def error_input_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([9, 4, 4]).astype("float64")
+                cs = paddle.nn.ChannelShuffle(3)
+                cs(paddle.to_tensor(x))
 
-#         def error_data_format_layer():
-#             with paddle.fluid.dygraph.guard():
-#                 x = np.random.random([2, 9, 4, 4]).astype("float64")
-#                 cs = paddle.nn.ChannelShuffle(3, "MEOW")
+        self.assertRaises(ValueError, error_input_layer)
 
-#         self.assertRaises(ValueError, error_data_format_layer)
+        def error_groups_layer_1():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                cs = paddle.nn.ChannelShuffle(3.33)
 
+        self.assertRaises(TypeError, error_groups_layer_1)
 
-# class TestChannelShuffleFP16OP(TestChannelShuffleOp):
-#     def init_dtype(self):
-#         self.dtype = np.float16
+        def error_groups_layer_2():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                cs = paddle.nn.ChannelShuffle(-1)
 
+        self.assertRaises(ValueError, error_groups_layer_2)
 
-# @unittest.skipIf(
-#     not core.is_compiled_with_cuda()
-#     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
-#     "core is not complied with CUDA and not support the bfloat16",
-# )
-# class TestChannelShuffleBF16OP(OpTest):
-#     def setUp(self):
-#         self.op_type = "channel_shuffle"
-#         self.init_data_format()
-#         n, c, h, w = 2, 9, 4, 4
-#         self.python_api = paddle.nn.functional.channel_shuffle
-#         self.dtype = np.uint16
-#         self.use_mkldnn = False
-
-#         if self.format == "NCHW":
-#             shape = [n, c, h, w]
-#         if self.format == "NHWC":
-#             shape = [n, h, w, c]
-
-#         groups = 3
-
-#         x = np.random.random(shape).astype('float32')
-#         out = channel_shuffle_np(x, groups, self.format)
-#         self.inputs = {'X': convert_float_to_uint16(x)}
-#         self.attrs = {'groups': groups, "data_format": self.format}
-#         self.outputs = {'Out': convert_float_to_uint16(out)}
+        def error_data_format_layer():
+            with paddle.fluid.dygraph.guard():
+                x = np.random.random([2, 9, 4, 4]).astype("float64")
+                cs = paddle.nn.ChannelShuffle(3, "MEOW")
 
-#     def init_data_format(self):
-#         self.format = "NCHW"
-
-#     def test_check_output(self):
-#         place = core.CUDAPlace(0)
-#         self.check_output_with_place(place)
-
-#     def test_check_grad(self):
-#         place = core.CUDAPlace(0)
-#         self.check_grad_with_place(
-#             place,
-#             ['X'],
-#             'Out',
-#         )
+        self.assertRaises(ValueError, error_data_format_layer)
+
+
+class TestChannelShuffleFP16OP(TestChannelShuffleOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestChannelShuffleBF16OP(OpTest):
+    def setUp(self):
+        self.op_type = "channel_shuffle"
+        self.init_data_format()
+        n, c, h, w = 2, 9, 4, 4
+        self.python_api = paddle.nn.functional.channel_shuffle
+        self.dtype = np.uint16
+        self.use_mkldnn = False
+
+        if self.format == "NCHW":
+            shape = [n, c, h, w]
+        if self.format == "NHWC":
+            shape = [n, h, w, c]
+
+        groups = 3
+
+        x = np.random.random(shape).astype('float32')
+        out = channel_shuffle_np(x, groups, self.format)
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.attrs = {'groups': groups, "data_format": self.format}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def init_data_format(self):
+        self.format = "NCHW"
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+        )
 
 
 if __name__ == '__main__':

From 14c4e9bf3d50e94926955d3ac4f93eb81751bb13 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sun, 6 Aug 2023 09:06:33 +0000
Subject: [PATCH 17/22] polish code

---
 .../instruction/instruction_base.cc           | 34 +++++++++++--------
 .../instruction/instruction_base.h            |  3 --
 .../instruction/phi_kernel_instruction.h      |  2 ++
 3 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
index 5fd12551ff176c..fb9dc40ed57c9d 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc
@@ -106,6 +106,9 @@ void InstructionBase::InitInputsOutputsIds(
     const std::map<std::string, int>& var_name_2_id,
     const std::unordered_map<const paddle::framework::Variable*, std::string>&
         variable_2_var_name) {
+  auto op_attributes = op->attributes();
+  auto op_name =
+      op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
   std::unordered_map<ir::Value, std::vector<int>> inputs;
   for (size_t i = 0; i < op->num_operands(); i++) {
     ir::Value value = op->operand_source(i);
@@ -116,7 +119,7 @@ void InstructionBase::InitInputsOutputsIds(
           phi::errors::PreconditionNotMet(
               "input should in name map, [%d] 'th input of [%s] op",
               i,
-              phi_op_name_));
+              op_name));
       std::vector<int> inputs_id = GetValueIds(value,
                                                inner_scope,
                                                value_2_var_name,
@@ -130,22 +133,23 @@ void InstructionBase::InitInputsOutputsIds(
   std::unordered_map<ir::Value, std::vector<int>> outputs;
   for (size_t i = 0; i < op->num_results(); i++) {
     ir::Value value = op->result(i);
-    if (value) {
-      PADDLE_ENFORCE_NE(
-          value_2_var_name.find(value),
-          value_2_var_name.end(),
-          phi::errors::PreconditionNotMet(
-              "input should in name map, [%d] 'th input of [%s] op",
-              i,
-              phi_op_name_));
-      std::vector<int> outputs_id = GetValueIds(value,
-                                                inner_scope,
-                                                value_2_var_name,
-                                                var_name_2_id,
-                                                variable_2_var_name);
-      outputs.emplace(value, outputs_id);
+    if ((!value) || (!(value.type()))) {
+      continue;
     }
+
+    PADDLE_ENFORCE_NE(
+        value_2_var_name.find(value),
+        value_2_var_name.end(),
+        phi::errors::PreconditionNotMet(
+            "input should in name map, [%d] 'th input of [%s] op", i, op_name));
+    std::vector<int> outputs_id = GetValueIds(value,
+                                              inner_scope,
+                                              value_2_var_name,
+                                              var_name_2_id,
+                                              variable_2_var_name);
+    outputs.emplace(value, outputs_id);
   }
+
   SetOutputs(outputs);
   VLOG(8) << "finish process outputs_index";
 }
diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index 5ce2358a7df799..c8ca7d8c6158ce 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -183,9 +183,6 @@ class InstructionBase {
   std::unordered_map<::ir::Value, std::vector<int>> output_index_;
 
   std::unordered_set<::ir::Value> no_need_buffer_values_;
-
- protected:
-  std::string phi_op_name_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
index 1d7e887e3bd93d..c637cce8651fbf 100644
--- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h
@@ -63,6 +63,8 @@ class PhiKernelInstruction : public InstructionBase {
   phi::KernelContext kernel_context_;
 
   phi::Kernel* phi_kernel_{nullptr};  // not owned
+
+  std::string phi_op_name_;
 };
 
 }  // namespace framework

From 55f630664701c2377f63a6065c069dee38e65602 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Sun, 6 Aug 2023 14:51:32 +0000
Subject: [PATCH 18/22] try to fix windows compile error

---
 .../new_executor/instruction/legacy_kernel_instruction.cc  | 7 +++----
 .../new_executor/instruction/legacy_kernel_instruction.h   | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index b3d013443055d7..92306779c612e1 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -138,9 +138,8 @@ LegacyKernelInstruction::LegacyKernelInstruction(
                           local_scope,
                           yaml_info_parser,
                           runtime_context_.get());
-  kernel_context_ = std::make_shared<paddle::framework::ExecutionContext>(
-      paddle::framework::ExecutionContext(
-          *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())));
+  kernel_context_ = new paddle::framework::ExecutionContext(
+      *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get()));
 
   VLOG(6) << "finish process kernel context";
   SetDeviceContext(
@@ -169,7 +168,7 @@ LegacyKernelInstruction::LegacyKernelInstruction(
 void LegacyKernelInstruction::Run() {
   infer_meta_interface_->infer_meta_(&(infer_meta_context_));
   VLOG(6) << "Run op " << legacy_op_name_ << " infer meta.";
-  (*(phi_kernel_))((kernel_context_.get()));
+  (*(phi_kernel_))((kernel_context_));
   VLOG(6) << "Run op " << legacy_op_name_ << " kernel.";
 }
 
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
index a8a150fbb6c776..19ff76e36075fb 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
@@ -62,7 +62,7 @@ class LegacyKernelInstruction : public InstructionBase {
 
   std::shared_ptr<framework::RuntimeContext> runtime_context_;
   std::shared_ptr<paddle::framework::OperatorBase> operator_base_;
-  std::shared_ptr<paddle::framework::ExecutionContext> kernel_context_;
+  paddle::framework::ExecutionContext* kernel_context_;
 
   phi::Kernel* phi_kernel_{nullptr};  // not owned
 };

From a17ab1443e1d9a949b027b1ce7466e43f3d83567 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Mon, 7 Aug 2023 01:47:23 +0000
Subject: [PATCH 19/22] polish code

---
 .../framework/new_executor/instruction/instruction_util.h   | 1 +
 .../new_executor/instruction/legacy_kernel_instruction.cc   | 6 ++++++
 .../new_executor/instruction/legacy_kernel_instruction.h    | 3 ++-
 paddle/fluid/framework/new_executor/new_ir_interpreter.cc   | 1 -
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
index 3d0aa3df9de963..a41ce07957e4ae 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h
@@ -44,5 +44,6 @@ platform::DeviceContext* ParseDeviceContext(
     const int stream_priority);
 
 OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
index 92306779c612e1..eadf0c1f806cf4 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc
@@ -165,6 +165,12 @@ LegacyKernelInstruction::LegacyKernelInstruction(
   VLOG(6) << "finish process no need buffer";
 }
 
+LegacyKernelInstruction::~LegacyKernelInstruction() {
+  if (kernel_context_ != nullptr) {
+    delete kernel_context_;
+  }
+}
+
 void LegacyKernelInstruction::Run() {
   infer_meta_interface_->infer_meta_(&(infer_meta_context_));
   VLOG(6) << "Run op " << legacy_op_name_ << " infer meta.";
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
index 19ff76e36075fb..bade9481c3cb1e 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
@@ -38,6 +38,7 @@ class LegacyKernelInstruction : public InstructionBase {
       const std::unordered_map<const paddle::framework::Variable*, std::string>&
           variable_2_var_name);
 
+  ~LegacyKernelInstruction();
   phi::Kernel* PhiKernel() const { return phi_kernel_; }
 
   const phi::InferMetaContext& InferMetaContext() const {
@@ -60,9 +61,9 @@ class LegacyKernelInstruction : public InstructionBase {
 
   phi::InferMetaContext infer_meta_context_;
 
+  paddle::framework::ExecutionContext* kernel_context_{nullptr};
   std::shared_ptr<framework::RuntimeContext> runtime_context_;
   std::shared_ptr<paddle::framework::OperatorBase> operator_base_;
-  paddle::framework::ExecutionContext* kernel_context_;
 
   phi::Kernel* phi_kernel_{nullptr};  // not owned
 };
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index c82e3cbc28b47b..9def00f22e537b 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -1626,7 +1626,6 @@ void NewIRInterpreter::BuildInstruction() {
 
       if (op_name == "pd.fused_softmax_mask_upper_triangle" ||
           op_name == "pd.fused_softmax_mask_upper_triangle_grad") {
-        std::cerr << "emplace lagcy kernel " << op_name << std::endl;
         vec_instruction_base_.emplace_back(
             std::make_unique<LegacyKernelInstruction>(op_idx++,
                                                       place_,

From 1f713ed8b0aa89c13a735a684ccbe202e85bc818 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Mon, 7 Aug 2023 07:48:19 +0000
Subject: [PATCH 20/22] update

---
 .../fluid/framework/new_executor/instruction/instruction_base.h | 1 -
 .../new_executor/instruction/legacy_kernel_instruction.h        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index c8ca7d8c6158ce..f078da97107e7e 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -21,7 +21,6 @@
 
 #include "paddle/fluid/framework/new_executor/new_executor_defs.h"
 #include "paddle/fluid/platform/event.h"
-#include "paddle/ir/core/operation.h"
 #include "paddle/ir/core/value.h"
 
 namespace ir {
diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
index bade9481c3cb1e..27c1cb133bec01 100644
--- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
+++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h
@@ -18,12 +18,12 @@
 
 namespace ir {
 class Operation;
+class Value;
 }  // namespace ir
 
 namespace paddle {
 namespace framework {
 class Scope;
-class Value;
 
 class LegacyKernelInstruction : public InstructionBase {
  public:

From ca68a2b5412db735a508aa1f515b679d9c7a9f64 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 8 Aug 2023 01:51:25 +0000
Subject: [PATCH 21/22] update

---
 third_party/flashattn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/flashattn b/third_party/flashattn
index 18106c1ba0ccee..b5bdb79d5e1f2f 160000
--- a/third_party/flashattn
+++ b/third_party/flashattn
@@ -1 +1 @@
-Subproject commit 18106c1ba0ccee81b97ca947397c08a141815a47
+Subproject commit b5bdb79d5e1f2f88b1ef62e86899a14f82fa079a

From 8ec5558bead4c7738008209edee5346f05081d80 Mon Sep 17 00:00:00 2001
From: phlrain <phliuhongyu@126.com>
Date: Tue, 8 Aug 2023 01:54:49 +0000
Subject: [PATCH 22/22] revert op test

---
 test/legacy_test/test_channel_shuffle.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py
index 8da783e6773a26..f8b6ef1df9514b 100644
--- a/test/legacy_test/test_channel_shuffle.py
+++ b/test/legacy_test/test_channel_shuffle.py
@@ -70,8 +70,8 @@ def init_dtype(self):
     def init_data_format(self):
         self.format = "NCHW"
 
-    # def test_check_output(self):
-    #     self.check_output()
+    def test_check_output(self):
+        self.check_output()
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')