From 418d2796e95b36f49ad23defda0c5fe8cfe80e57 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Thu, 8 Jun 2023 11:13:37 +0800
Subject: [PATCH] output tensor hook support while op (#54432)
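
This changes the output-tensor-hook plumbing so that hooks also fire for ops
inside a while op's sub-block: NaiveExecutor hands its registered hooks to the
while op, which forwards them to its InterpreterCore. The executor-level hook
signature gains a Scope* parameter so the callback looks variables up in the
scope the op actually ran in, and the old OutputTensorHookFunc /
OutputTensorHookFunc_V2 pair collapses into a single OutputTensorHookFunc
receiving a paddle::Tensor.

For review context, a minimal sketch of how the user-facing API is exercised
after this change (not part of the patch; the model path is a placeholder,
input feeding is omitted, and the umbrella header name may differ by install
layout):

    #include <iostream>
    #include <string>

    #include "paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      config.SetModel("./model_dir");  // placeholder model directory

      auto predictor = paddle_infer::CreatePredictor(config);

      // Single hook type now: receives op type, output var name, and a
      // paddle::Tensor; also fires for ops inside a while op's sub-block.
      predictor->RegisterOutputHook([](const std::string &op_type,
                                       const std::string &var_name,
                                       const paddle::Tensor &tensor) {
        std::cout << op_type << " -> " << var_name << std::endl;
      });

      // Feed inputs via GetInputHandle(...) here, then run (omitted).
      predictor->Run();
      return 0;
    }

Note that memory reuse should stay off while hooks are registered (see the doc
comments below), since the hook reads intermediate outputs that reuse would
otherwise clobber.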
---
 paddle/fluid/framework/naive_executor.cc      | 10 +++-
 paddle/fluid/framework/naive_executor.h       |  4 +-
 .../framework/new_executor/interpretercore.cc |  4 ++
 .../framework/new_executor/interpretercore.h  |  7 +++
 paddle/fluid/framework/operator.h             |  7 +++
 .../fluid/inference/api/analysis_predictor.cc | 55 +++++--------
 .../fluid/inference/api/analysis_predictor.h  | 13 +----
 paddle/fluid/inference/api/paddle_api.h       | 13 +----
 .../inference/api/paddle_inference_api.h      | 10 ----
 paddle/fluid/inference/api/paddle_tensor.h    |  6 +-
 .../fluid/operators/controlflow/while_op.cc   |  2 +
 paddle/fluid/pybind/inference_api.cc          |  6 +-
 .../api/analysis_predictor_tester.cc          | 47 ----------------
 13 files changed, 48 insertions(+), 136 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index c31e0661140da..28cabf54ee4de 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -66,6 +66,10 @@ void NaiveExecutor::Run() {
                                  platform::NvtxRangeColor::Green);
 #endif
 
+    if (op->Type() == "while") {
+      op->SetOutputHooks(hookfuncs_);
+    }
+
     op->Run(*scope_, place_);
 
     // Update the shared_holder so that only records the max one.
@@ -97,8 +101,8 @@ void NaiveExecutor::Run() {
 #ifdef PADDLE_WITH_INFERENCE_NVTX
     platform::CudaNvtxRangePop();
 #endif
-    for (auto &func : hookfunc_) {
-      func(op.get());
+    for (auto &func : hookfuncs_) {
+      func(op.get(), scope_);
     }
   }
 #ifdef PADDLE_WITH_INFERENCE_NVTX
@@ -178,7 +182,7 @@ phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
 }
 
 void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
-  hookfunc_.push_back(hookfunc);
+  hookfuncs_.push_back(hookfunc);
 }
 
 void NaiveExecutor::MakeReusePlan(
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
index f1a4a036cde36..8361d79fd18f1 100644
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
@@ -38,7 +38,7 @@ class Scope;
 
 class NaiveExecutor {
  public:
-  using HookFunc = std::function<void(OperatorBase*)>;
+  using HookFunc = std::function<void(OperatorBase*, Scope*)>;
 
   explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
 
@@ -86,7 +86,7 @@ class NaiveExecutor {
   std::vector<std::unique_ptr<OperatorBase>> ops_;
   Scope* scope_{nullptr};
 
-  std::vector<HookFunc> hookfunc_;
+  std::vector<HookFunc> hookfuncs_;
 
   // Record information that tensor_a should ShareBufferWith tensor_b.
   std::unordered_map<phi::DenseTensor*, std::unordered_set<phi::DenseTensor*>>
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 2e47699499f2b..dc3674e8d8063 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -949,6 +949,10 @@ void InterpreterCore::RunOperator(const Instruction& instr_node) {
 #endif
   }
 
+  for (auto& hook : hookfuncs_) {
+    hook(op, local_scope);
+  }
+
   // for debug nan/inf
   if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
     VLOG(4) << "Check nan/inf";
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index cf335f2b0bd21..904bfc5ec69b1 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -77,6 +77,11 @@ class InterpreterCore {
 
   const platform::Place& GetPlace() const { return place_; }
 
+  using HookFunc = std::function<void(OperatorBase*, Scope*)>;
+  void SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
+    hookfuncs_ = hookfuncs;
+  }
+
  private:
   DISABLE_COPY_AND_ASSIGN(InterpreterCore);
   // build graph
@@ -184,6 +189,8 @@ class InterpreterCore {
   std::vector<size_t> trace_execute_order_;
 
   InstructionSchedulingPriorityLess instruction_scheduling_priority_less;
+
+  std::vector<HookFunc> hookfuncs_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 3de6560653cb6..e6a2058107b1d 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -371,6 +371,11 @@ class OperatorBase {
 
   void SetId(uint64_t id) { id_ = id; }
 
+  using HookFunc = std::function<void(OperatorBase*, Scope*)>;
+  void SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
+    hookfuncs_ = hookfuncs;
+  }
+
  protected:
   std::string type_;
   // NOTE: in case of OpGrad, inputs_ contains:
@@ -399,6 +404,8 @@ class OperatorBase {
   // Whether this operator executes in an Executor.
   bool run_by_executor_{true};
 
+  std::vector<HookFunc> hookfuncs_;
+
  private:
   void GenerateTemporaryNames();
   void CheckAllInputOutputSet() const;
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 26c8b57d37bb6..4fcad3c7c117e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -2638,47 +2638,26 @@ void AnalysisPredictor::RegisterOutputHook(
     const OutputTensorHookFunc &hookfunc) {
   static std::once_flag register_hook_flag;
   std::call_once(register_hook_flag, [this] {
-    executor_->RegisterOutputHook([this](framework::OperatorBase *op) {
-      for (auto &output : op->Outputs()) {
-        for (auto &var_name : output.second) {
-          auto *var = this->sub_scope_->FindVar(var_name);
-          if (!var || !var->IsType<phi::DenseTensor>()) continue;
-          auto dense_tensor = var->Get<phi::DenseTensor>();
-          if (!dense_tensor.initialized()) continue;
-          auto tensor = this->GetOutputTensor(var_name);
-          for (auto &hookfunc : this->hookfuncs_) {
-            hookfunc(op->Type(), var_name, *tensor);
+    executor_->RegisterOutputHook(
+        [this](framework::OperatorBase *op, framework::Scope *scope) {
+          for (auto &output : op->Outputs()) {
+            for (auto &var_name : output.second) {
+              auto *var = scope->FindVar(var_name);
+              if (!var || !var->IsType<phi::DenseTensor>()) continue;
+              auto dense_tensor = var->Get<phi::DenseTensor>();
+              if (!dense_tensor.initialized()) continue;
+              auto tensor = paddle::Tensor(
+                  std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
+              for (auto &hookfunc : this->hookfuncs_) {
+                hookfunc(op->Type(), var_name, tensor);
+              }
+            }
           }
-        }
-      }
-    });
+        });
   });
   hookfuncs_.push_back(hookfunc);
 }
 
-void AnalysisPredictor::RegisterOutputHook(
-    const OutputTensorHookFunc_V2 &hookfunc) {
-  static std::once_flag register_hook_flag;
-  std::call_once(register_hook_flag, [this] {
-    executor_->RegisterOutputHook([this](framework::OperatorBase *op) {
-      for (auto &output : op->Outputs()) {
-        for (auto &var_name : output.second) {
-          auto *var = this->sub_scope_->FindVar(var_name);
-          if (!var || !var->IsType<phi::DenseTensor>()) continue;
-          auto dense_tensor = var->Get<phi::DenseTensor>();
-          if (!dense_tensor.initialized()) continue;
-          auto tensor = paddle::Tensor(
-              std::make_shared<phi::DenseTensor>(dense_tensor), var_name);
-          for (auto &hookfunc : this->hookfuncs_v2_) {
-            hookfunc(op->Type(), var_name, tensor);
-          }
-        }
-      }
-    });
-  });
-  hookfuncs_v2_.push_back(hookfunc);
-}
-
 template <>
 std::unique_ptr<PaddlePredictor>
 CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
@@ -2964,10 +2943,6 @@ void Predictor::RegisterOutputHook(const OutputTensorHookFunc &hookfunc) {
   predictor_->RegisterOutputHook(hookfunc);
 }
 
-void Predictor::RegisterOutputHook(const OutputTensorHookFunc_V2 &hookfunc) {
-  predictor_->RegisterOutputHook(hookfunc);
-}
-
 void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); }
 
 int GetNumBytesOfDataType(DataType dtype) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 144d40e7cd23b..f047e68b5ccc2 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -318,16 +318,6 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   Argument::fusion_statis_t fusion_statis() { return fusion_statis_; }
 
-  ///
-  /// \brief Register a output hook function to operate the intermediate tensor
-  /// of op output. when using this function, memory reuse should be tured off.
-  /// The hook function signature is void(const std::string&, const
-  /// std::string&, const paddle_infer::Tensor&>). Here, the first parameter is
-  /// op's type, the second param is output var name of the op, and the third
-  /// parameter is output tensor with the var name.
-  ///
-  void RegisterOutputHook(const OutputTensorHookFunc &hookfunc) override;
-
   ///
   /// \brief Register a output hook function to operate the intermediate tensor
   /// of op output. when using this function, memory reuse should be tured off.
@@ -336,7 +326,7 @@ class AnalysisPredictor : public PaddlePredictor {
   /// type, the second param is output var name of the op, and the third
   /// parameter is output tensor with the var name.
   ///
-  void RegisterOutputHook(const OutputTensorHookFunc_V2 &hookfunc) override;
+  void RegisterOutputHook(const OutputTensorHookFunc &hookfunc) override;
 
   ///
   /// \brief Initialize mkldnn quantizer and execute mkldnn quantization pass
@@ -608,7 +598,6 @@ class AnalysisPredictor : public PaddlePredictor {
 
  private:
   std::vector<OutputTensorHookFunc> hookfuncs_;
-  std::vector<OutputTensorHookFunc_V2> hookfuncs_v2_;
 
   // Some status here that help to determine the status inside the predictor.
   bool status_is_cloned_{false};
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index 1dc4215a4966c..211f6b59539a1 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -39,7 +39,6 @@ using PaddleDType = paddle_infer::DataType;
 using PaddlePlace = paddle_infer::PlaceType;
 using PaddleDataLayout = paddle_infer::DataLayout;
 using paddle_infer::OutputTensorHookFunc;
-using paddle_infer::OutputTensorHookFunc_V2;
 
 /// \brief Memory manager for PaddleTensor.
 ///
@@ -314,16 +313,6 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual uint64_t TryShrinkMemory() { return 0; }
 
-  ///
-  /// \brief Register a output hook function to operate the intermediate tensor
-  /// of op output. when using this function, memory reuse should be tured off.
-  /// The hook function signature is void(const std::string&, const
-  /// std::string&, const paddle_infer::Tensor&>). Here, the first parameter is
-  /// op's type, the second param is output var name of the op, and the third
-  /// parameter is output tensor with the var name.
-  ///
-  virtual void RegisterOutputHook(const OutputTensorHookFunc& hookfunc) {}
-
   ///
   /// \brief Register a output hook function to operate the intermediate tensor
   /// of op output. when using this function, memory reuse should be tured off.
@@ -332,7 +321,7 @@ class PD_INFER_DECL PaddlePredictor {
   /// type, the second param is output var name of the op, and the third
   /// parameter is output tensor with the var name.
   ///
-  virtual void RegisterOutputHook(const OutputTensorHookFunc_V2& hookfunc) {}
+  virtual void RegisterOutputHook(const OutputTensorHookFunc& hookfunc) {}
 
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index fdabde89f915c..aa77015ba6377 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -199,16 +199,6 @@ class PD_INFER_DECL Predictor {
   ///
   void RegisterOutputHook(const OutputTensorHookFunc& hookfunc);
 
-  ///
-  /// \brief Register a output hook function to operate the intermediate tensor
-  /// of op output. when using this function, memory reuse should be tured off.
-  /// The hook function signature is void(const std::string&, const
-  /// std::string&, const Tensor&>). Here, the first parameter is op's
-  /// type, the second param is output var name of the op, and the third
-  /// parameter is output tensor with the var name.
-  ///
-  void RegisterOutputHook(const OutputTensorHookFunc_V2& hookfunc);
-
   ///
   /// \brief Get the execution stream on devices with a concept of stream,
   /// otherwise returns nullptr.
diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h
index 1b8ae09cf3c98..9cc228dbb9bd3 100644
--- a/paddle/fluid/inference/api/paddle_tensor.h
+++ b/paddle/fluid/inference/api/paddle_tensor.h
@@ -36,11 +36,7 @@ namespace paddle_infer {
 /// Strings for text data.
 using Strings = std::vector<std::string>;
 
-class Tensor;
-using OutputTensorHookFunc =
-    std::function<void(const std::string&, const std::string&, const Tensor&)>;
-
-using OutputTensorHookFunc_V2 = std::function<void(
+using OutputTensorHookFunc = std::function<void(
     const std::string&, const std::string&, const paddle::Tensor&)>;
 
 typedef void (*CallbackFunc)(void*);
diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc
index cb472fc6948ca..e30387b751cfb 100644
--- a/paddle/fluid/operators/controlflow/while_op.cc
+++ b/paddle/fluid/operators/controlflow/while_op.cc
@@ -220,6 +220,8 @@ class WhileOp : public framework::OperatorBase {
           dev_place, *block, &placeholder, execution_config));
     }
 
+    core_->SetOutputHooks(hookfuncs_);
+
     if (!is_test) {
       while (cond_data) {
         auto &current_scope = scope.NewScope();
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 32e6ff6920d41..711f99e87489e 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -1096,11 +1096,7 @@ void BindPaddleInferPredictor(py::module *m) {
       .def("clear_intermediate_tensor",
           &paddle_infer::Predictor::ClearIntermediateTensor)
      .def("register_output_hook",
-           py::overload_cast<const paddle_infer::OutputTensorHookFunc &>(
-               &paddle_infer::Predictor::RegisterOutputHook))
-      .def("register_output_hook_v2",
-           py::overload_cast<const paddle_infer::OutputTensorHookFunc_V2 &>(
-               &paddle_infer::Predictor::RegisterOutputHook));
+           &paddle_infer::Predictor::RegisterOutputHook);
 }
 
 void BindZeroCopyTensor(py::module *m) {
diff --git a/test/cpp/inference/api/analysis_predictor_tester.cc b/test/cpp/inference/api/analysis_predictor_tester.cc
index 84070cf39bc81..6e3497d14a0dd 100644
--- a/test/cpp/inference/api/analysis_predictor_tester.cc
+++ b/test/cpp/inference/api/analysis_predictor_tester.cc
@@ -668,53 +668,6 @@ TEST(Predictor, Streams) {
 #endif
 
 TEST(AnalysisPredictor, OutputTensorHookFunc) {
-  auto hookfunc = [](const std::string& type,
-                     const std::string& var_name,
-                     const Tensor& tensor) { LOG(INFO) << "in hook function"; };
-
-  {
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableUseGpu(100, 0);
-
-    auto predictor = CreatePredictor(config);
-
-    predictor->RegisterOutputHook(hookfunc);
-    auto w0 = predictor->GetInputHandle("firstw");
-    auto w1 = predictor->GetInputHandle("secondw");
-    auto w2 = predictor->GetInputHandle("thirdw");
-    auto w3 = predictor->GetInputHandle("forthw");
-    w0->Reshape({4, 1});
-    w1->Reshape({4, 1});
-    w2->Reshape({4, 1});
-    w3->Reshape({4, 1});
-    auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
-    auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
-    auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
-    auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
-    for (int i = 0; i < 4; i++) {
-      w0_data[i] = i;
-      w1_data[i] = i;
-      w2_data[i] = i;
-      w3_data[i] = i;
-    }
-    predictor->Run();
-    predictor->TryShrinkMemory();
-  }
-
-  {
-    Config config;
-    config.SetModel(FLAGS_dirname);
-    config.EnableMemoryOptim();
-    config.EnableUseGpu(100, 0);
-
-    auto predictor = CreatePredictor(config);
-
-    predictor->RegisterOutputHook(hookfunc);
-  }
-}
-
-TEST(AnalysisPredictor, OutputTensorHookFunc_V2) {
   auto hookfunc = [](const std::string& type,
                      const std::string& var_name,
                      const paddle::Tensor& tensor) {