From 533f9d0aa9a607d6312aba02a927fabd45dce92c Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 30 Mar 2023 05:35:56 +0000 Subject: [PATCH 1/6] inference predictor support paddle::Tensor --- cmake/phi_header.cmake | 31 ++-- paddle/fluid/framework/feed_fetch_method.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 155 ++++++++++++++++-- .../fluid/inference/api/analysis_predictor.h | 42 ++++- .../inference/api/demo_ci/CMakeLists.txt | 2 +- paddle/fluid/inference/api/paddle_api.h | 10 ++ .../inference/api/paddle_inference_api.h | 11 ++ paddle/fluid/inference/api/paddle_tensor.h | 2 + paddle/fluid/pybind/inference_api.cc | 32 ++-- paddle/phi/api/include/tensor.h | 6 +- paddle/phi/api/lib/api_custom_impl.cc | 1 + 11 files changed, 249 insertions(+), 45 deletions(-) diff --git a/cmake/phi_header.cmake b/cmake/phi_header.cmake index 7596b59e71c76e..d1f062ba9fda07 100644 --- a/cmake/phi_header.cmake +++ b/cmake/phi_header.cmake @@ -17,21 +17,21 @@ set(PADDLE_INFERENCE_INSTALL_DIR function(phi_header_path_compat TARGET_PATH) message(STATUS "phi header path compat processing: ${TARGET_PATH}") - string(FIND ${TARGET_PATH} "experimental" pos) - if(pos GREATER 1) - file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") - foreach(header ${HEADERS}) - if(${header} MATCHES ".*.h$") - file(READ ${header} HEADER_CONTENT) - string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" - HEADER_CONTENT "${HEADER_CONTENT}") - string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" - HEADER_CONTENT "${HEADER_CONTENT}") - file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "phi header path compat processing complete: ${header}") - endif() - endforeach() - endif() + # string(FIND ${TARGET_PATH} "experimental" pos) + # if(pos GREATER 1) + file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") + foreach(header ${HEADERS}) + if(${header} MATCHES ".*.h$") + file(READ ${header} HEADER_CONTENT) + string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" + HEADER_CONTENT "${HEADER_CONTENT}") + string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" + HEADER_CONTENT "${HEADER_CONTENT}") + file(WRITE ${header} "${HEADER_CONTENT}") + message(STATUS "phi header path compat processing complete: ${header}") + endif() + endforeach() + # endif() endfunction() phi_header_path_compat( @@ -46,6 +46,7 @@ phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) phi_header_path_compat( ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/core) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index f21ca0c858acc0..0294e1ca54b437 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -95,7 +95,7 @@ phi::DenseTensor& GetVariableTensor(const Scope& scope, PADDLE_ENFORCE_EQ(var->IsType(), true, platform::errors::InvalidArgument( - "Only support lod tensor in GetVariableTensor now.")); + "Only support DenseTensor in GetVariableTensor now.")); return *var->GetMutable(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 91dee8a9ae4ee9..fb41d5fd353d3b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -155,11 +155,10 @@ phi::Backend 
ConvertBackend(paddle_infer::PlaceType backend) {
       return phi::Backend::CPU;
   }
 }
-}  // namespace
 
-bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
-                             phi::DenseTensor *t,
-                             const platform::Place &place) {
+bool PaddleTensorToDenseTensor(const PaddleTensor &pt,
+                               phi::DenseTensor *t,
+                               const platform::Place &place) {
   framework::DDim ddim = phi::make_ddim(pt.shape);
   void *input_ptr;
   if (pt.dtype == PaddleDType::INT64) {
@@ -270,6 +269,7 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt,
   t->set_lod(lod);
   return true;
 }
+}  // namespace
 
 bool AnalysisPredictor::Init(
     const std::shared_ptr<framework::Scope> &parent_scope,
@@ -923,6 +923,17 @@ void AnalysisPredictor::MkldnnPreSet(const std::vector<PaddleTensor> &inputs) {
 #endif
 }
 
+void AnalysisPredictor::MkldnnPreSet(
+    const std::vector<paddle::Tensor> &inputs) {
+#ifdef PADDLE_WITH_MKLDNN
+  std::vector<std::vector<int>> inputs_shape;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    inputs_shape.emplace_back(phi::vectorize<int>(inputs[i].dims()));
+  }
+  MkldnnPreSet(inputs_shape);
+#endif
+}
+
 void AnalysisPredictor::MkldnnPreSet(
     const std::vector<std::vector<int>> &inputs_shape) {
 #ifdef PADDLE_WITH_MKLDNN
@@ -1037,6 +1048,73 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
   return true;
 }
 
+bool AnalysisPredictor::Run(const std::vector<paddle::Tensor> &inputs,
+                            std::vector<paddle::Tensor> *outputs) {
+  inference::DisplayMemoryInfo(place_, "before run");
+  paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPreSet(inputs);
+#endif
+  VLOG(3) << "predict start";
+  inference::Timer timer;
+  timer.tic();
+  // set feed variable
+  framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get();
+  PADDLE_ENFORCE_NOT_NULL(
+      scope,
+      platform::errors::PreconditionNotMet("The scope should not be nullptr."));
+  if (!SetFeed(inputs, scope)) {
+    LOG(ERROR) << "fail to set feed";
+    return false;
+  }
+
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
+
+  // Run the inference program
+  // if share variables, we need not create variables
+  executor_->Run();
+
+  inference::DisplayMemoryInfo(place_, "after run");
+
+  // get fetch variable
+  if (!GetFetch(outputs, scope)) {
+    LOG(ERROR) << "fail to get fetches";
+    return false;
+  }
+  VLOG(3) << "predict cost: " << timer.toc() << "ms";
+
+  // All the containers in the scope will be held in inference, but the
+  // operators assume that the container will be reset after each batch.
+  // Here is a bugfix: collect all the container variables, and reset them to a
+  // bool; the next time, the operator will call MutableData and construct a new
+  // container again, so that the container will be empty for each batch.
+  if (sub_scope_) {
+    tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
+  }
+  tensor_array_batch_cleaner_.ResetNoTensorVars();
+
+  // recover the cpu_math_library_num_threads to 1, in order to avoid thread
+  // conflict when integrating it into deployment service.
+  paddle::platform::SetNumThreads(1);
+#ifdef PADDLE_WITH_MKLDNN
+  if (config_.use_mkldnn_) MkldnnPostReset();
+#endif
+#if defined(PADDLE_WITH_MKLML)
+  // Frees unused memory allocated by the Intel® MKL Memory Allocator to
+  // avoid memory leak. See:
+  // https://software.intel.com/en-us/mkl-developer-reference-c-mkl-free-buffers
+  platform::dynload::MKL_Free_Buffers();
+#endif
+  return true;
+}
+
 bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
                                 framework::Scope *scope) {
   VLOG(3) << "Predictor::set_feed";
@@ -1051,7 +1129,7 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
 
   for (size_t i = 0; i < inputs.size(); ++i) {
     phi::DenseTensor *input = &feed_tensors_[i];
-    if (!PaddleTensorToLoDTensor(inputs[i], input, place_)) {
+    if (!PaddleTensorToDenseTensor(inputs[i], input, place_)) {
       return false;
     }
     int idx = -1;
@@ -1065,7 +1143,41 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
     } else {
       idx = PADDLE_GET_CONST(int, feeds_[i]->GetAttr("col"));
     }
-    framework::SetFeedVariable(scope, *input, "feed", idx);
+    framework::SetFeedVariable(scope, *input, framework::kFeedOpType, idx);
+  }
+  return true;
+}
+
+bool AnalysisPredictor::SetFeed(const std::vector<paddle::Tensor> &inputs,
+                                framework::Scope *scope) {
+  VLOG(3) << "Predictor::set_feed";
+  PADDLE_ENFORCE_EQ(inputs.size(),
+                    feeds_.size(),
+                    platform::errors::InvalidArgument(
+                        "wrong feed input size, need %d but get %d.",
+                        feeds_.size(),
+                        inputs.size()));
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    PADDLE_ENFORCE_EQ(inputs[i].initialized(),
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The input Tensor expected to be initialized."));
+  }
+
+  if (std::all_of(inputs.cbegin(), inputs.cend(), [&](const paddle::Tensor &t) {
+        return !t.name().empty() && feed_names_.count(t.name());
+      })) {
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      auto &t = framework::GetVariableTensor(*scope, inputs[i].name());
+      t.ShareDataWith(
+          *std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl()));
+    }
+  } else {
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      auto &t = framework::GetVariableTensor(*scope, idx2feeds_[i]);
+      t.ShareDataWith(
+          *std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl()));
+    }
   }
   return true;
 }
@@ -1104,7 +1216,7 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
                           idx,
                           i));
     framework::FetchType &fetch_var =
-        framework::GetFetchVariable(*scope, "fetch", idx);
+        framework::GetFetchVariable(*scope, framework::kFetchOpType, idx);
     auto &fetch = PADDLE_GET(phi::DenseTensor, fetch_var);
     auto type = framework::TransToProtoVarType(fetch.dtype());
     auto output = &(outputs->at(i));
@@ -1129,6 +1241,20 @@ bool AnalysisPredictor::GetFetch(std::vector<PaddleTensor> *outputs,
   return true;
 }
 
+bool AnalysisPredictor::GetFetch(std::vector<paddle::Tensor> *outputs,
+                                 framework::Scope *scope) {
+  VLOG(3) << "Predictor::get_fetch";
+  outputs->resize(fetches_.size());
+  // TODO(liuyuanle): customize output Tensor's holder
+  for (size_t i = 0; i < fetches_.size(); ++i) {
+    auto const &name = idx2fetches_[i];
+    auto &t = framework::GetVariableTensor(*scope, name);
+    (*outputs)[i] =
+        std::move(paddle::Tensor(std::make_shared<phi::DenseTensor>(t), name));
+  }
+  return true;
+}
+
 void AnalysisPredictor::PrepareArgument() {
   VLOG(3) << "AnalysisPredictor::PrepareArgument";
   // Init std::unique_ptr argument_.
@@ -1583,7 +1709,7 @@ void AnalysisPredictor::PrepareFeedFetch() { "The sub_scope should not be nullptr.")); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { - if (op->Type() == "feed") { + if (op->Type() == framework::kFeedOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (feeds_.size() <= static_cast(idx)) { feeds_.resize(idx + 1); @@ -1591,7 +1717,7 @@ void AnalysisPredictor::PrepareFeedFetch() { feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; idx2feeds_[idx] = op->Output("Out")[0]; - } else if (op->Type() == "fetch") { + } else if (op->Type() == framework::kFetchOpType) { int idx = PADDLE_GET_CONST(int, op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); @@ -1606,9 +1732,9 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("The scope should not be nullptr.")); - auto *var = scope->Var("feed"); + auto *var = scope->Var(framework::kFeedOpType); var->GetMutable(); - var = scope->Var("fetch"); + var = scope->Var(framework::kFetchOpType); var->GetMutable(); } @@ -2190,7 +2316,7 @@ void AnalysisPredictor::ClearIntermediateTensor() { const std::string name = var->Name(); auto *variable = executor_->GetScope()->FindVar(name); if (variable != nullptr && variable->IsType() && - name != "feed" && name != "fetch") { + name != framework::kFeedOpType && name != framework::kFetchOpType) { VLOG(3) << "Clear Intermediate Tensor: " << name; auto *t = variable->GetMutable(); t->clear(); @@ -2656,6 +2782,11 @@ std::map Predictor::GetOutputTypes() { bool Predictor::Run() { return predictor_->ZeroCopyRun(); } +bool Predictor::Run(const std::vector &inputs, + std::vector *outputs) { + return predictor_->Run(inputs, outputs); +} + std::unique_ptr Predictor::Clone(void *stream) { auto analysis_pred = predictor_->Clone(stream); std::unique_ptr pred(new Predictor(std::move(analysis_pred))); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5a578a9b94fcb7..67ec3c308df839 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -31,15 +31,14 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/resource_manager.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" +#include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_TESTING #include #include #endif namespace paddle_infer { -using float16 = paddle::platform::float16; namespace experimental { class InternalUtils; }; @@ -150,6 +149,16 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + /// + /// \brief Run the prediction engine. 
+ /// + /// \param[in] inputs input tensors + /// \param[out] outputs output tensors + /// \return Whether the function executed successfully + /// + bool Run(const std::vector &inputs, + std::vector *outputs) override; + /// /// \brief Get the input names /// @@ -378,6 +387,17 @@ class AnalysisPredictor : public PaddlePredictor { /// bool SetFeed(const std::vector &input_datas, framework::Scope *scope); + + /// + /// \brief Prepare input data, only used in Run() + /// + /// \param[in] inputs inpute tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool SetFeed(const std::vector &inputs, + framework::Scope *scope); + /// /// \brief Get the output data, only used in Run() /// @@ -387,6 +407,16 @@ class AnalysisPredictor : public PaddlePredictor { /// bool GetFetch(std::vector *output_data, framework::Scope *scope); + + /// + /// \brief Get the output data, only used in Run() + /// + /// \param[out] outputs output tensors + /// \param[in] scope the scope used by predictor + /// \return Whether the function executed successfully + /// + bool GetFetch(std::vector *outputs, framework::Scope *scope); + /// /// \brief Get the output data, only used in GetFetch() /// @@ -404,6 +434,14 @@ class AnalysisPredictor : public PaddlePredictor { /// \param[in] inputs tensors /// void MkldnnPreSet(const std::vector &inputs); + /// + /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. + /// + /// Used in AnalysisPredictor::Run(). + /// + /// \param[in] inputs tensors + /// + void MkldnnPreSet(const std::vector &inputs); /// /// \brief PreSet for Mkldnn multi-thread and dynamic shape input. diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index fc23caee656380..11f214bc45d535 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -83,7 +83,7 @@ else() if(WITH_MKL) set(FLAG_OPENMP "-fopenmp") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 ${FLAG_OPENMP}") endif() if(WITH_GPU) diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index e83c1a9f9444c9..8d7f54ea771840 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -221,6 +221,16 @@ class PD_INFER_DECL PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + /// \brief This interface takes input and runs the network. + /// \param[in] inputs An list of Tensor as the input to the network. + /// \param[out] output_data Pointer to the tensor list, which holds the output + /// Tensor + /// \return Whether the run is successful + virtual bool Run(const std::vector& inputs, + std::vector* outputs) { + return false; + } + /// \brief Used to get the name of the network input. /// Be inherited by AnalysisPredictor, Only used in ZeroCopy scenarios. /// \return Input tensor names. diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index d7f15e0529894f..210fa81c99f988 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -128,6 +128,17 @@ class PD_INFER_DECL Predictor { /// bool Run(); + /// + /// \brief Run the prediction engine + /// + /// \param[in] inputs An list of Tensor as the input to the network. 
+ /// \param[out] outputs Pointer to the tensor list, which holds the output + /// Tensor + /// + /// \return Whether the run is successful + bool Run(const std::vector& inputs, + std::vector* outputs); + /// /// \brief Get the output names /// diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 7c61d4098b44dd..5ed1ae47427f9f 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -21,6 +21,8 @@ #include "paddle_infer_declare.h" // NOLINT +#include "paddle/phi/api/include/tensor.h" // expose paddle::Tensor + #ifdef PADDLE_WITH_ONNXRUNTIME #include "onnxruntime_c_api.h" // NOLINT #include "onnxruntime_cxx_api.h" // NOLINT diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index afc5b73e232c78..14be816fbedeaf 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -65,7 +65,7 @@ constexpr int NPY_UINT16_ = 4; // paddle::platform::float16 as numpy.float16. // Ref: https://github.com/pybind/pybind11/issues/1776 template <> -struct npy_format_descriptor { +struct npy_format_descriptor { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); return reinterpret_borrow(ptr); @@ -180,7 +180,7 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { dt = py::dtype::of(); break; case PaddleDType::FLOAT16: - dt = py::dtype::of(); + dt = py::dtype::of(); break; case PaddleDType::UINT8: dt = py::dtype::of(); @@ -264,7 +264,7 @@ void PaddleInferShareExternalData(paddle_infer::Tensor &tensor, // NOLINT ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::FLOAT16) { tensor.ShareExternalData( - static_cast(input_tensor.data()), + static_cast(input_tensor.data()), shape, ToPaddleInferPlace(input_tensor.place().GetType())); } else if (input_tensor.dtype() == phi::DataType::INT32) { @@ -353,7 +353,7 @@ size_t PaddleGetDTypeSize(PaddleDType dt) { size = sizeof(float); break; case PaddleDType::FLOAT16: - size = sizeof(paddle_infer::float16); + size = sizeof(phi::dtype::float16); break; case PaddleDType::INT8: size = sizeof(int8_t); @@ -392,8 +392,8 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT tensor.copy_to_cpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.copy_to_cpu( - static_cast(array.mutable_data())); + tensor.copy_to_cpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.copy_to_cpu(static_cast(array.mutable_data())); @@ -432,8 +432,8 @@ py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT tensor.CopyToCpu(static_cast(array.mutable_data())); break; case PaddleDType::FLOAT16: - tensor.CopyToCpu( - static_cast(array.mutable_data())); + tensor.CopyToCpu( + static_cast(array.mutable_data())); break; case PaddleDType::UINT8: tensor.CopyToCpu(static_cast(array.mutable_data())); @@ -1070,6 +1070,17 @@ void BindPaddleInferPredictor(py::module *m) { #endif self.Run(); }) + .def( + "run", + [](paddle_infer::Predictor &self, py::handle py_in_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + std::vector outputs; + self.Run(in_tensor_list, &outputs); + return py::handle(ToPyObject(outputs)); + }, + py::arg("inputs"), + py::call_guard()) .def("clone", [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -1099,7 +1110,7 @@ 
void BindZeroCopyTensor(py::module *m) { .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) - .def("copy_from_cpu", &ZeroCopyTensorCreate) + .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyTensorCreate) .def("copy_from_cpu", &ZeroCopyStringTensorCreate) .def("copy_to_cpu", &ZeroCopyTensorToNumpy) @@ -1123,8 +1134,7 @@ void BindPaddleInferTensor(py::module *m) { .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) - .def("_copy_from_cpu_bind", - &PaddleInferTensorCreate) + .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferTensorCreate) .def("_copy_from_cpu_bind", &PaddleInferStringTensorCreate) .def("_share_external_data_bind", &PaddleInferShareExternalData) diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index d3943750fd21ef..24bcc63dbd278f 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -416,7 +416,7 @@ class PADDLE_API Tensor final { /** * @brief Return the name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @return const std::string& */ @@ -425,7 +425,7 @@ class PADDLE_API Tensor final { /** * @brief Set name of Tensor. * @note Used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. * * @param const std::string& name */ @@ -657,7 +657,7 @@ class PADDLE_API Tensor final { /** * Tensor name: used to adapt original execution mechanism and debug analysis - * in the development of new dygraph. It may be removed in the future. + * in the development of new dygraph. 
*/ std::string name_{""}; diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 3cebef1588ea5a..6a409b64196239 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -136,6 +136,7 @@ Tensor add_n_impl(const std::vector& x) { Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); + out.set_name(x.name()); return out; } From d6997ff555b78e1c61021d0d5773d2cd2bcfc0b6 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 30 Mar 2023 07:43:35 +0000 Subject: [PATCH 2/6] add unitest --- .../test_inference_double_datatype.py} | 0 .../inference/test_inference_predictor_run.py | 129 ++++++++++++++++++ 2 files changed, 129 insertions(+) rename python/paddle/fluid/tests/unittests/ir/{test_inference_datatype.py => inference/test_inference_double_datatype.py} (100%) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py diff --git a/python/paddle/fluid/tests/unittests/ir/test_inference_datatype.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_double_datatype.py similarity index 100% rename from python/paddle/fluid/tests/unittests/ir/test_inference_datatype.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_inference_double_datatype.py diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py new file mode 100644 index 00000000000000..da7144720f5f2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py @@ -0,0 +1,129 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +import numpy as np + +import paddle +from paddle.inference import Config, create_predictor + + +class TestNet(paddle.nn.Layer): + def __init__(self): + super().__init__() + self.fc1 = paddle.nn.Linear(4, 4) + self.fc2 = paddle.nn.Linear(4, 4) + + def forward(self, x1, x2): + y1 = self.fc1(x1) + y2 = self.fc2(x2) + return y1 + y2 + + +@unittest.skipIf( + not paddle.is_compiled_with_cuda(), 'should compile with cuda.' 
+) +class TestPredictorRunWithTensor(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + net = TestNet() + model = paddle.jit.to_static( + net, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input0' + ), + paddle.static.InputSpec( + shape=[None, 4], dtype='float32', name='input1' + ), + ], + ) + paddle.jit.save( + model, + os.path.join( + self.temp_dir.name, 'test_predictor_run_model/inference' + ), + ) + + def tearDown(self): + self.temp_dir.cleanup() + + def init_predictor(self): + config = Config( + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdmodel', + ), + os.path.join( + self.temp_dir.name, + 'test_predictor_run_model/inference.pdiparams', + ), + ) + config.enable_use_gpu(256, 0) + config.enable_memory_optim() + predictor = create_predictor(config) + return predictor + + def get_inputs(self): + input0 = np.array([[1, 2, 3, 4], [2, 3, 4, 5]]).astype(np.float32) + input1 = np.array([[0.1, 0.2, 0.3, 0.4], [1.2, 1.3, 1.4, 1.5]]).astype( + np.float32 + ) + + input0_tensor = paddle.to_tensor(input0) + input1_tensor = paddle.to_tensor(input1) + + return [input0_tensor, input1_tensor] + + def get_disorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + input_names = predictor.get_input_names() + input0_tensor.name = input_names[0] + input1_tensor.name = input_names[1] + + # disorder + inputs = [input1_tensor, input0_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def get_inorder_output(self): + predictor = self.init_predictor() + + [input0_tensor, input1_tensor] = self.get_inputs() + + # disorder + inputs = [input0_tensor, input1_tensor] + outputs = predictor.run(inputs) + + return outputs[0] + + def test_output(self): + inorder_output = self.get_inorder_output() + disorder_output = self.get_disorder_output() + print(inorder_output) + print(disorder_output) + assert np.allclose( + inorder_output.numpy().flatten(), disorder_output.numpy().flatten() + ) + + +if __name__ == '__main__': + unittest.main() From 4f8b943316d9a7589579244f99300aae75db62d8 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 30 Mar 2023 12:27:10 +0000 Subject: [PATCH 3/6] fix jit layer --- paddle/fluid/jit/engine/predictor_engine.cc | 135 +----------------- .../inference/test_inference_predictor_run.py | 2 +- 2 files changed, 6 insertions(+), 131 deletions(-) diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index d18f4f487dbe2e..847018e07e51cc 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -22,11 +22,6 @@ namespace paddle { namespace jit { -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t); -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place); - PredictorEngine::PredictorEngine( const std::shared_ptr &info, const std::shared_ptr ¶ms_dict, @@ -78,135 +73,15 @@ std::unique_ptr PredictorEngine::Clone(void *stream) { std::vector PredictorEngine::operator()( const std::vector &inputs) { - auto dense_tensors = utils::ToDenseTensors(inputs); - return utils::ToTensors(this->operator()(dense_tensors)); -} - -std::vector PredictorEngine::operator()( - const std::vector &inputs) { - std::vector pt_inputs; - std::vector pt_outputs; - for (auto &t : inputs) { - auto non_const_t = const_cast(&t); - pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t)); - } 
- - predictor_->Run(pt_inputs, &pt_outputs); - - std::vector outputs; - for (auto &pt : pt_outputs) { - DenseTensor t; - PaddleTensorToDenseTensor(pt, &t, place_); - outputs.emplace_back(t); - } + std::vector outputs; + predictor_->Run(inputs, &outputs); return outputs; } -static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { - PaddleTensor pt; - switch (framework::TransToProtoVarType(t->dtype())) { - case framework::proto::VarType::INT32: { - pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); - pt.dtype = PaddleDType::INT32; - } break; - case framework::proto::VarType::INT64: { - pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); - pt.dtype = PaddleDType::INT64; - } break; - case framework::proto::VarType::FP32: { - pt.data.Reset(t->data(), t->numel() * sizeof(float)); - pt.dtype = PaddleDType::FLOAT32; - } break; - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unsupported tensor date type. Now " - "only supports INT64, FP32, INT32.")); - } - pt.shape = phi::vectorize(t->dims()); - return pt; -} - -static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, - DenseTensor *t, - const platform::Place &place) { - framework::DDim ddim = phi::make_ddim(pt.shape); - void *input_ptr; - switch (pt.dtype) { - case PaddleDType::INT64: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::INT32: - input_ptr = t->mutable_data(ddim, place); - break; - case PaddleDType::FLOAT16: - input_ptr = t->mutable_data(ddim, place); - break; - default: - LOG(ERROR) << "unsupported feed type " << pt.dtype; - return false; - } - - PADDLE_ENFORCE_NOT_NULL( - input_ptr, - paddle::platform::errors::Fatal( - "Cannot convert to LoDTensor because LoDTensor creation failed.")); - PADDLE_ENFORCE_NOT_NULL( - pt.data.data(), - paddle::platform::errors::InvalidArgument( - "The data contained in the input PaddleTensor is illegal.")); - - if (platform::is_cpu_place(place)) { - // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. 
- std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); - } else if (platform::is_ipu_place(place)) { -#ifdef PADDLE_WITH_IPU - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with WITH_IPU, should not reach here.")); -#endif - } else if (platform::is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), - false, - platform::errors::InvalidArgument( - "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = static_cast(pool.Get(place)); - auto dst_gpu_place = place; - memory::Copy(dst_gpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length(), - dev_ctx->stream()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with CUDA, should not reach here.")); -#endif - } else if (platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - auto dst_xpu_place = place; - memory::Copy(dst_xpu_place, - static_cast(input_ptr), - platform::CPUPlace(), - pt.data.data(), - pt.data.length()); -#else - PADDLE_THROW(paddle::platform::errors::Fatal( - "Not compile with XPU, should not reach here.")); -#endif - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "The analysis predictor supports CPU, GPU and XPU now.")); - } - return true; +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + return utils::ToDenseTensors(this->operator()(utils::ToTensors(inputs))); } } // namespace jit diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py index da7144720f5f2c..3dfcdaea30e097 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py @@ -109,7 +109,7 @@ def get_inorder_output(self): [input0_tensor, input1_tensor] = self.get_inputs() - # disorder + # inorder inputs = [input0_tensor, input1_tensor] outputs = predictor.run(inputs) From d2460c6928e0fd3aeabf028d92c2dc6d6714a3bc Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 31 Mar 2023 09:02:46 +0000 Subject: [PATCH 4/6] fix --- paddle/fluid/jit/engine/predictor_engine.cc | 1 + .../unittests/ir/inference/test_inference_predictor_run.py | 3 +-- ...inference_double_datatype.py => test_inference_datatype.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename python/paddle/fluid/tests/unittests/ir/{inference/test_inference_double_datatype.py => test_inference_datatype.py} (100%) diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc index 847018e07e51cc..54e35bc0f69dd6 100644 --- a/paddle/fluid/jit/engine/predictor_engine.cc +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -47,6 +47,7 @@ PredictorEngine::PredictorEngine( config.SetSkipLoadParams(true); config.SetApplyOptim(true); config.SwitchIrOptim(true); + config.SwitchUseFeedFetchOps(false); predictor_.reset(new AnalysisPredictor(config)); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py index 3dfcdaea30e097..99ba29956c5dad 100644 --- 
a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_inference_predictor_run.py @@ -118,8 +118,7 @@ def get_inorder_output(self): def test_output(self): inorder_output = self.get_inorder_output() disorder_output = self.get_disorder_output() - print(inorder_output) - print(disorder_output) + assert np.allclose( inorder_output.numpy().flatten(), disorder_output.numpy().flatten() ) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_inference_double_datatype.py b/python/paddle/fluid/tests/unittests/ir/test_inference_datatype.py similarity index 100% rename from python/paddle/fluid/tests/unittests/ir/inference/test_inference_double_datatype.py rename to python/paddle/fluid/tests/unittests/ir/test_inference_datatype.py From 8175bfb3b6bc133540642b6344dabb4501d46bbe Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 10 Apr 2023 11:29:16 +0000 Subject: [PATCH 5/6] fix win bug --- paddle/fluid/inference/api/analysis_predictor.cc | 4 ---- paddle/fluid/pybind/eager_utils.cc | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 42f861c92c0a0b..2e6a59294cb604 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1052,8 +1052,6 @@ bool AnalysisPredictor::Run(const std::vector &inputs, if (config_.use_mkldnn_) MkldnnPreSet(inputs); #endif VLOG(3) << "predict start"; - inference::Timer timer; - timer.tic(); // set feed variable framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); PADDLE_ENFORCE_NOT_NULL( @@ -1084,7 +1082,6 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } - VLOG(3) << "predict cost: " << timer.toc() << "ms"; // All the containers in the scope will be hold in inference, but the // operators assume that the container will be reset after each batch. @@ -1241,7 +1238,6 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; outputs->resize(fetches_.size()); - // TODO(liuyuanle): customize output Tensor's holder for (size_t i = 0; i < fetches_.size(); ++i) { auto const &name = idx2fetches_[i]; auto &t = framework::GetVariableTensor(*scope, name); diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 1a177f59adba22..887125fda19f98 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -736,7 +736,11 @@ PyObject* ToPyObject(const std::vector>& value) { PyObject* ToPyObject(const std::vector& value, bool return_py_none_if_not_initialize) { + // NOTE(liuyuanle): I encountered a bug(access violation) in windows. 
ref to + // https://stackoverflow.com/questions/55598839/how-to-fix-access-violation-error-when-returning-pyobject-from-c-function-usin + PyGILState_STATE gstate = PyGILState_Ensure(); PyObject* result = PyList_New((Py_ssize_t)value.size()); + PyGILState_Release(gstate); for (size_t i = 0; i < value.size(); i++) { if (!value[i].initialized() && return_py_none_if_not_initialize) { From 2d546c6ed365c86f3e4ab7b4180e6b7c45bfe960 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 10 Apr 2023 11:38:05 +0000 Subject: [PATCH 6/6] add (Recommended) --- paddle/fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/paddle_api.h | 2 +- paddle/fluid/inference/api/paddle_inference_api.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 67ec3c308df839..83207a8bfd654c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -150,7 +150,7 @@ class AnalysisPredictor : public PaddlePredictor { int batch_size = -1) override; /// - /// \brief Run the prediction engine. + /// \brief Run the prediction engine (Recommended). /// /// \param[in] inputs input tensors /// \param[out] outputs output tensors diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 8d7f54ea771840..3a51f91b3afc22 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -221,7 +221,7 @@ class PD_INFER_DECL PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; - /// \brief This interface takes input and runs the network. + /// \brief This interface takes input and runs the network (Recommended). /// \param[in] inputs An list of Tensor as the input to the network. /// \param[out] output_data Pointer to the tensor list, which holds the output /// Tensor diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 210fa81c99f988..54a9d9af117caa 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -129,7 +129,7 @@ class PD_INFER_DECL Predictor { bool Run(); /// - /// \brief Run the prediction engine + /// \brief Run the prediction engine (Recommended) /// /// \param[in] inputs An list of Tensor as the input to the network. /// \param[out] outputs Pointer to the tensor list, which holds the output
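
Usage note: the sketch below shows how the new list-of-paddle.Tensor overload added in this series can be driven from Python. It is illustrative only; the model/params paths, the single float32 input, and its shape are placeholders, and the end-to-end version (two named inputs, GPU config, in-order vs. out-of-order feeding) is the test_inference_predictor_run.py unit test added in PATCH 2/6.

import numpy as np

import paddle
from paddle.inference import Config, create_predictor

# Placeholder paths; point these at any exported inference model.
config = Config("inference.pdmodel", "inference.pdiparams")
config.enable_use_gpu(256, 0)  # the bundled unit test exercises the GPU path
predictor = create_predictor(config)

# Assumed single float32 input of shape [2, 4]; adjust to the model's inputs.
x = paddle.to_tensor(np.random.rand(2, 4).astype("float32"))
# Optional: naming the tensor after a model input lets SetFeed match by name
# instead of by position (the out-of-order case covered by the unit test).
x.name = predictor.get_input_names()[0]

# New overload: feed a list of paddle.Tensor, get a list of paddle.Tensor back.
outputs = predictor.run([x])
print(outputs[0].numpy())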