[Runtime] add set_output_zero_copy #8497

Merged: 72 commits, Aug 27, 2021
Changes shown from 65 commits

Commits (72):
629ec50  optimize resize vector (sunjiweiswift, Jun 25, 2021)
5005373  tmp (sunjiweiswift, Jun 25, 2021)
a1b749f  DoMultiLevelTiling (sunjiweiswift, Jun 26, 2021)
f1fc313  modify size_t to int (sunjiweiswift, Jun 26, 2021)
65a7a00  modify (sunjiweiswift, Jun 26, 2021)
2368df9  modify level fill (sunjiweiswift, Jun 26, 2021)
e8ba850  Update utils.cc (sunjiweiswift, Jun 26, 2021)
a832739  format lower count (sunjiweiswift, Jun 26, 2021)
258f382  Merge branch 'main' of https://github.com/sunjiweiswift/tvm (sunjiweiswift, Jun 26, 2021)
2de6c99  delete blank lines (sunjiweiswift, Jun 26, 2021)
cb99388  delete blank lines (sunjiweiswift, Jun 26, 2021)
ece0e1d  Merge branch 'main' of https://github.com/sunjiweiswift/tvm (sunjiweiswift, Jun 26, 2021)
9da6fa3  re-commit message (sunjiweiswift, Jun 27, 2021)
718e58b  Merge pull request #1 from apache/main (sunjiweiswift, Jul 15, 2021)
7377e43  Update graph_executor.h (sunjiweiswift, Jul 15, 2021)
8853436  Merge pull request #2 from apache/main (sunjiweiswift, Jul 19, 2021)
4a007ab  add setoutputzero (sunjiweiswift, Jul 19, 2021)
8ca606f  add set output zero (sunjiweiswift, Jul 19, 2021)
6afb609  Update graph_executor.cc (sunjiweiswift, Jul 19, 2021)
d71dece  Update graph_executor.h (sunjiweiswift, Jul 19, 2021)
145219c  delete const_cast (sunjiweiswift, Jul 20, 2021)
e45c77b  add common function chechDltensor (sunjiweiswift, Jul 20, 2021)
b7a27c5  Update graph_executor.h (sunjiweiswift, Jul 20, 2021)
bf6ed08  Update graph_executor.cc (sunjiweiswift, Jul 20, 2021)
80fc91f  add output_ sort (sunjiweiswift, Jul 20, 2021)
ab5f957  Update graph_executor.cc (sunjiweiswift, Jul 20, 2021)
07e80ad  add a.nodeid == b.nodeid (sunjiweiswift, Jul 20, 2021)
e67b839  add unit test for set output zero (sunjiweiswift, Jul 21, 2021)
052fa56  add include <algorithm> (sunjiweiswift, Jul 22, 2021)
847634e  modify Setoutput zero copy (sunjiweiswift, Jul 22, 2021)
b2d9471  modify by clang-format (sunjiweiswift, Jul 22, 2021)
5d0461a  add unit test for set output zero (sunjiweiswift, Jul 22, 2021)
4ebf2bd  relay ut go back (sunjiweiswift, Jul 22, 2021)
c221b51  relay ut go back (sunjiweiswift, Jul 22, 2021)
92294d3  modify input->output (sunjiweiswift, Jul 22, 2021)
dd54915  delete sort output input (sunjiweiswift, Jul 23, 2021)
66ef5fe  modify build_module_test.cc (sunjiweiswift, Jul 23, 2021)
7918c7b  re-pr (sunjiweiswift, Jul 24, 2021)
c7e00cb  empty commit (sunjiweiswift, Jul 24, 2021)
2558aee  empty commit (sunjiweiswift, Jul 25, 2021)
bf85d3e  empty commit (sunjiweiswift, Jul 25, 2021)
df24fc3  modify input to output (sunjiweiswift, Jul 28, 2021)
c1bf14c  modify zero output copy disorder issue (sunjiweiswift, Jul 29, 2021)
c666527  Merge remote-tracking branch 'upstream/main' (sunjiweiswift, Aug 2, 2021)
85b4fc3  Merge remote-tracking branch 'upstream/main' (sunjiweiswift, Aug 3, 2021)
81143b9  modify nid->eid to record output, add var to record the dltensor both… (sunjiweiswift, Aug 3, 2021)
6f7b068  character too long >= 100 (sunjiweiswift, Aug 3, 2021)
0d25674  modify zero copy UT add set input zero copy (sunjiweiswift, Aug 3, 2021)
6fc5047  modify zero copy UT add set input zero copy (sunjiweiswift, Aug 3, 2021)
969c80f  modify zero copy UT add set input zero copy (sunjiweiswift, Aug 3, 2021)
889106d  Merge branch 'main' of https://github.com/sunjiweiswift/tvm (sunjiweiswift, Aug 3, 2021)
5f858cc  empty commit (sunjiweiswift, Aug 3, 2021)
1762cb5  trigger CI (sunjiweiswift, Aug 4, 2021)
0575cb8  Merge pull request #4 from apache/main (sunjiweiswift, Aug 4, 2021)
2640e76  trigger CI (sunjiweiswift, Aug 4, 2021)
a10562f  trigger CI (sunjiweiswift, Aug 4, 2021)
07128aa  empty commit (sunjiweiswift, Aug 20, 2021)
c0e89f5  empty commit (sunjiweiswift, Aug 20, 2021)
3e46c0e  trigger CI (Aug 21, 2021)
6b3a126  trigger CI (sunjiweiswift, Aug 21, 2021)
37b69b1  trigger CI (sunjiweiswift, Aug 21, 2021)
d66c4e1  Merge pull request #5 from apache/main (sunjiweiswift, Aug 21, 2021)
e622619  trigger CI (sunjiweiswift, Aug 21, 2021)
8f9287f  trigger CI (sunjiweiswift, Aug 21, 2021)
1644d91  resolve conflicts (sunjiweiswift, Aug 24, 2021)
1c4f9e3  Merge pull request #6 from apache/main (sunjiweiswift, Aug 25, 2021)
13a1355  modify C style (sunjiweiswift, Aug 25, 2021)
cb09eab  add runtime test (sunjiweiswift, Aug 25, 2021)
3205590  add runtime test (sunjiweiswift, Aug 25, 2021)
aab0ef7  add runtime test (sunjiweiswift, Aug 25, 2021)
8c0dfb6  relay build generates the json (sunjiweiswift, Aug 26, 2021)
2603263  relay build generates the json (sunjiweiswift, Aug 26, 2021)
106 changes: 92 additions & 14 deletions src/runtime/graph_executor/graph_executor.cc
@@ -91,6 +91,11 @@ void GraphExecutor::Init(const std::string& graph_json, tvm::runtime::Module mod
std::string& name = nodes_[nid].name;
input_map_[name] = i;
}
for (size_t i = 0; i < outputs_.size(); i++) {
const uint32_t nid = outputs_[i].node_id;
std::string& name = nodes_[nid].name;
output_map_[name] = i;
}
}
/*!
* \brief Get the input index given the name of input.
@@ -104,6 +109,18 @@ int GraphExecutor::GetInputIndex(const std::string& name) {
}
return -1;
}
/*!
* \brief Get the output index given the name of output.
* \param name The name of the output.
* \return The index of output.
*/
int GraphExecutor::GetOutputIndex(const std::string& name) {
auto it = output_map_.find(name);
if (it != output_map_.end()) {
return it->second;
}
return -1;
}
/*!
* \brief set index-th input to the graph.
* \param index The input index.
@@ -114,6 +131,23 @@ void GraphExecutor::SetInput(int index, DLTensor* data_in) {
uint32_t eid = this->entry_id(input_nodes_[index], 0);
data_entry_[eid].CopyFrom(data_in);
}
/*!
* \brief Check the legality of external DLTensor*.
* \param external The external DLTensor*.
 * \param eid The data_entry_ index.
*/
void GraphExecutor::CheckExternalDLTensor(const DLTensor* external, uint32_t eid) const {
const DLTensor* internal = data_entry_[eid].operator->();

ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*external));
ICHECK_EQ(reinterpret_cast<size_t>(external->data) % kAllocAlignment, 0);
ICHECK_EQ(internal->ndim, static_cast<size_t>(external->ndim));
ICHECK_EQ(internal->device.device_type, external->device.device_type);
ICHECK_EQ(internal->device.device_id, external->device.device_id);
for (auto i = 0; i < external->ndim; ++i) {
ICHECK_EQ(internal->shape[i], external->shape[i]);
}
}
/*!
* \brief set index-th input to the graph without copying the data.
* \param index The input index.
@@ -122,23 +156,37 @@ void GraphExecutor::SetInput(int index, DLTensor* data_in) {
void GraphExecutor::SetInputZeroCopy(int index, DLTensor* data_ref) {
ICHECK_LT(static_cast<size_t>(index), input_nodes_.size());
uint32_t eid = this->entry_id(input_nodes_[index], 0);
const DLTensor* old_t = data_entry_[eid].operator->();

// check the consistency of input
ICHECK_EQ(data_alignment_[eid], details::GetDataAlignment(*data_ref));
ICHECK_EQ(reinterpret_cast<size_t>(data_ref->data) % kAllocAlignment, 0);
ICHECK_EQ(old_t->ndim, static_cast<size_t>(data_ref->ndim));
ICHECK_EQ(old_t->device.device_type, data_ref->device.device_type);
ICHECK_EQ(old_t->device.device_id, data_ref->device.device_id);
for (auto i = 0; i < data_ref->ndim; ++i) {
ICHECK_EQ(old_t->shape[i], data_ref->shape[i]);
}

CheckExternalDLTensor(data_ref, eid);
// Update the data pointer for each argument of each op
for (DLTensor* t : input_dltensors_[eid]) {
t->data = data_ref->data;
}
}
/*!
* \brief set index-th output to the graph without copying the data.
* \param index The output index.
* \param data_ref The output data that is referred.
*/
void GraphExecutor::SetOutputZeroCopy(int index, DLTensor* data_ref) {
ICHECK_LT(static_cast<size_t>(index), outputs_.size());
ICHECK_LT(static_cast<size_t>(index), output_dltensors_.size());
const NodeEntry& output_node = outputs_[index];
uint32_t output_node_eid = this->entry_id(output_node);

// check the consistency of output
CheckExternalDLTensor(data_ref, output_node_eid);

// Update the data pointer for output op
for (DLTensor* t : output_dltensors_[output_node_eid]) {
t->data = data_ref->data;
}

// Update the input of the op connected to the output
for (DLTensor* t : both_output_opinput_dltensors_[output_node_eid]) {
t->data = data_ref->data;
}
}
/*!
* \brief Get the number of outputs
*
@@ -358,11 +406,17 @@ void GraphExecutor::SetupStorage() {
void GraphExecutor::SetupOpExecs() {
op_execs_.resize(this->GetNumOfNodes());
input_dltensors_.resize(num_node_entries());
output_dltensors_.resize(num_node_entries());
both_output_opinput_dltensors_.resize(num_node_entries());
std::unordered_set<uint32_t> input_node_eids;
for (size_t i = 0; i < input_nodes_.size(); i++) {
uint32_t nid = input_nodes_[i];
input_node_eids.insert(entry_id(nid, 0));
}
std::unordered_set<uint32_t> output_node_eids;
for (size_t i = 0; i < outputs_.size(); i++) {
output_node_eids.insert(entry_id(outputs_[i]));
}

// setup the array and requirements.
for (uint32_t nid = 0; nid < this->GetNumOfNodes(); ++nid) {
@@ -383,10 +437,25 @@
std::tie(op_execs_[nid], op_args) = CreateTVMOp(inode.param, args);

for (size_t i = 0; i < inode.inputs.size(); i++) {
uint32_t eid = this->entry_id(inode.inputs[i]);
uint32_t input_eid = this->entry_id(inode.inputs[i]);
// check if op input is model input
if (input_node_eids.count(eid) > 0) {
input_dltensors_[eid].push_back(static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
if (input_node_eids.count(input_eid) > 0) {
input_dltensors_[input_eid].push_back(
static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
}
// check if any model output is the input of the op
if (output_node_eids.count(input_eid) > 0) {
both_output_opinput_dltensors_[input_eid].push_back(
static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
}
}

for (uint32_t i = inode.inputs.size(); i < inode.inputs.size() + inode.param.num_outputs; ++i) {
uint32_t output_eid = this->entry_id(nid, i - inode.inputs.size());
// check if op output is model output
if (output_node_eids.count(output_eid) > 0) {
output_dltensors_[output_eid].push_back(
static_cast<DLTensor*>(op_args->arg_values[i].v_handle));
}
}
}
@@ -462,6 +531,15 @@ PackedFunc GraphExecutor::GetFunction(const std::string& name,
this->SetInputZeroCopy(args[0], args[1]);
}
});
} else if (name == "set_output_zero_copy") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
if (String::CanConvertFrom(args[0])) {
int out_idx = this->GetOutputIndex(args[0].operator String());
if (out_idx >= 0) this->SetOutputZeroCopy(out_idx, args[1]);
} else {
this->SetOutputZeroCopy(args[0], args[1]);
}
});
} else if (name == "get_output") {
return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
if (args.num_args == 2) {
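For reference, a minimal caller-side sketch of the new interface. This is not part of the diff; the shapes and the names "A" and "elemwise_add" are illustrative, borrowed from the test below.

#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>

void RunZeroCopy(tvm::runtime::Module mod) {
  using namespace tvm::runtime;
  // NDArray::Empty returns storage aligned to kAllocAlignment, so these
  // buffers satisfy the alignment/shape/device checks performed by
  // CheckExternalDLTensor; a raw malloc'd pointer might not.
  NDArray in = NDArray::Empty({4}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  NDArray out = NDArray::Empty({4}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  // Both packed functions accept either a name (resolved through
  // GetInputIndex / GetOutputIndex) or an integer index.
  mod.GetFunction("set_input_zero_copy")("A", in);
  mod.GetFunction("set_output_zero_copy")("elemwise_add", out);
  mod.GetFunction("run")();
  // The ops wrote directly into `out`; no output copy took place.
}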
28 changes: 28 additions & 0 deletions src/runtime/graph_executor/graph_executor.h
@@ -107,6 +107,13 @@ class TVM_DLL GraphExecutor : public ModuleNode {
*/
int GetInputIndex(const std::string& name);

/*!
* \brief Get the output index given the name of output.
* \param name The name of the output.
* \return The index of output.
*/
int GetOutputIndex(const std::string& name);

/*!
* \brief set index-th input to the graph.
* \param index The input index.
@@ -119,6 +126,12 @@ class TVM_DLL GraphExecutor : public ModuleNode {
* \param data_ref The input data that is referred.
*/
void SetInputZeroCopy(int index, DLTensor* data_ref);
/*!
* \brief set index-th output to the graph without copying the data.
* \param index The output index.
* \param data_ref The output data that is referred.
*/
void SetOutputZeroCopy(int index, DLTensor* data_ref);
/*!
* \brief Get the number of outputs
*
@@ -193,6 +206,9 @@
uint32_t node_id;
uint32_t index;
uint32_t version;
inline bool operator==(const NodeEntry& other) const {
return node_id == other.node_id && index == other.index && version == other.version;
}
// JSON Loader
void Load(dmlc::JSONReader* reader) {
reader->BeginArray();
@@ -377,6 +393,12 @@
void SetupStorage();
/*! \brief Setup the executors. */
void SetupOpExecs();
/*!
* \brief Check the legality of external DLTensor*.
* \param external The external DLTensor*.
 * \param eid The data_entry_ index.
*/
void CheckExternalDLTensor(const DLTensor* external, uint32_t eid) const;
/*!
* \brief Create an execution function given input.
* \param attrs The node attributes.
@@ -397,8 +419,14 @@
std::vector<uint32_t> input_nodes_;
/*! \brief Map of input names to input indices. */
std::unordered_map<std::string, uint32_t> input_map_;
/*! \brief Map of output names to output indices. */
std::unordered_map<std::string, uint32_t> output_map_;
/*! \brief Used for quick node input DLTensor* lookup given an input eid. */
std::vector<std::vector<DLTensor*>> input_dltensors_;
/*! \brief Used for quick node output DLTensor* lookup given an output eid. */
std::vector<std::vector<DLTensor*>> output_dltensors_;
/*! \brief Used for quick node(both model output and op input) DLTensor* lookup given an eid. */
std::vector<std::vector<DLTensor*>> both_output_opinput_dltensors_;
/*! \brief Used for quick entry indexing. */
std::vector<uint32_t> node_row_ptr_;
/*! \brief Output entries. */
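The three DLTensor* tables above implement a simple pointer-patching scheme: for each entry id they record every argument tensor the compiled ops use for that entry, so rebinding a buffer amounts to rewriting data pointers. A standalone sketch of the idea, with illustrative names rather than the actual TVM members:

#include <dlpack/dlpack.h>

#include <cstdint>
#include <vector>

struct ZeroCopyTable {
  // One slot per entry id; each slot lists the op-argument tensors that
  // alias that entry inside the compiled functions.
  std::vector<std::vector<DLTensor*>> tensors;

  void Rebind(uint32_t eid, void* new_data) {
    for (DLTensor* t : tensors[eid]) {
      t->data = new_data;  // the next run() reads/writes the new buffer
    }
  }
};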
126 changes: 126 additions & 0 deletions tests/cpp/build_module_test.cc
@@ -199,3 +199,129 @@ TEST(BuildModule, Heterogeneous) {
ICHECK_LT(std::fabs(p_out[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
}
}

TEST(BuildModule, ZeroCopy) {
  /*
   *       A    B
   *        \  /
   *   elemwise_add(out0)
   *           \
   *      C    copy
   *       \   /
   *   elemwise_sub(out1)
   */

using namespace tvm;
using namespace tvm::te;

auto target_llvm = Target("llvm");

// The shape of input tensors.
const int n = 4;
Array<PrimExpr> shape{n};

auto A = placeholder(shape, DataType::Float(32), "A");
auto B = placeholder(shape, DataType::Float(32), "B");
auto C = placeholder(shape, DataType::Float(32), "C");

auto elemwise_add = compute(
A->shape, [&A, &B](PrimExpr i) { return A[i] + B[i]; }, "elemwise_add");

auto copy = placeholder(shape, DataType::Float(32), "__copy");
auto elemwise_sub = compute(
C->shape, [&copy, &C](PrimExpr i) { return copy[i] - C[i]; }, "elemwise_sub");

With<Target> llvm_scope(target_llvm);
auto s1 = create_schedule({elemwise_add->op});
auto s2 = create_schedule({elemwise_sub->op});

auto args1 = Array<Tensor>({A, B, elemwise_add});
auto args2 = Array<Tensor>({copy, C, elemwise_sub});

std::unordered_map<Tensor, Buffer> binds;
auto lowered_s1 = LowerSchedule(s1, args1, "elemwise_add", binds);
auto lowered_s2 = LowerSchedule(s2, args2, "elemwise_sub", binds);
Map<tvm::Target, IRModule> inputs = {{target_llvm, lowered_s1}, {target_llvm, lowered_s2}};
auto module = build(inputs, Target());

// Execute the graph and check the correctness.
// Setup graph json.
std::string json =
Review thread:

Contributor: I really don't like testing this way. Hard-coding the expected output (e.g., assembly, JSON, etc.) can make future maintenance difficult. IMHO, it should be sufficient to build two modules and set one of them to zero copy, so that the only difference between the two is execution latency while their outputs stay the same.

Also, it would be good to have a Python test so that we can demonstrate how this is used from Python; otherwise no one will know about this feature at all, since there is no documentation either.

Contributor Author: The graph JSON is not an expected output; it is the serialized input graph used to construct the graph executor.

I think set_output_zero_copy is only used when calling libtvm_runtime.so (the TVM graph executor) from other frameworks that allocate input and output memory in advance, so it is not a Python API.

Contributor: OK, I see. In that case it would be better to go through the Relay build process, which also generates the JSON.

Why can't set_output_zero_copy be a Python API even if, for now, it is only used when calling libtvm_runtime.so? Since it has a corresponding branch in GraphExecutor::GetFunction, I suppose a Python interface is doable.

Contributor Author: Fixed; the JSON is now generated by the Relay build.

Python has no raw pointers, so generally there is no such thing as zero copy in Python. I think that is also why set_input_zero_copy is not a Python API.

Member: I also don't understand why there is no Python API. It is totally reasonable to pass pre-allocated NDArray storage for get_output.

Contributor Author: OK, I will submit another PR for the Python API later.

"{\"nodes\": [{\"op\": \"null\", \"name\": \"A\", \"inputs\": []}, "
"{\"op\": \"null\", \"name\": \"B\", \"inputs\": []}, {\"op\": "
"\"tvm_op\", \"name\": \"elemwise_add\", \"attrs\": {\"flatten_data\": "
"\"1\", \"func_name\": \"elemwise_add\", \"num_inputs\": \"2\", "
"\"num_outputs\": \"1\"}, \"inputs\": [[0, 0, 0], [1, 0, 0]]}, {\"op\": "
"\"tvm_op\", \"name\": \"__copy_add_to_sub\", \"attrs\": "
"{\"flatten_data\": \"0\", \"func_name\": \"__copy\", \"num_inputs\": "
"\"1\", \"num_outputs\": \"1\"}, \"inputs\": [[2, 0, 0]]}, {\"op\": "
"\"null\", \"name\": \"C\", \"inputs\": []}, {\"op\": \"tvm_op\", "
"\"name\": \"elemwise_sub\", \"attrs\": {\"flatten_data\": \"0\", "
"\"func_name\": \"elemwise_sub\", \"num_inputs\": \"2\", "
"\"num_outputs\": \"1\"}, \"inputs\": [[3, 0, 0], [4, 0, 0]]}], "
"\"arg_nodes\": [0, 1, 4], \"node_row_ptr\": [0, 1, 2, 3, 4, 5, 6], "
"\"heads\": [[2, 0, 0], [5, 0, 0]], \"attrs\": {\"storage_id\": [\"list_int\", "
"[3, 4, 0, 1, 5, 2]], \"shape\": [\"list_shape\", [[4], [4], [4], [4], [4], "
"[4]]], \"device_index\": [\"list_int\", [2, 2, 2, 1, 1, 1]], \"dtype\": "
"[\"list_int\", [0, 0, 0, 0, 0, 0]], \"dltype\": [\"list_str\", "
"[\"float32\", \"float32\", \"float32\", \"float32\", \"float32\", "
"\"float32\"]]}}";
// Setup inputs.
auto a_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
auto b_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
auto c_val = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});

auto pa = (float*)(a_val->data);
auto pb = (float*)(b_val->data);
auto pc = (float*)(c_val->data);

// Assign values.
for (int i = 0; i < n; i++) {
pa[i] = i;
pb[i] = i + 1.0;
pc[i] = i - 1.0;
}

// Initialize graph executor.
int device_type = static_cast<int>(kDLCPU);
int device_id = 0;

const runtime::PackedFunc* graph_executor =
tvm::runtime::Registry::Get("tvm.graph_executor.create");
runtime::Module mod = (*graph_executor)(json, module, device_type, device_id);

// test FFI for module.
auto test_ffi = PackedFunc([](TVMArgs args, TVMRetValue* rv) {
int tcode = args[1];
ICHECK_EQ(args[0].type_code(), tcode);
});

test_ffi(runtime::Module(mod), static_cast<int>(kTVMModuleHandle));
test_ffi(Optional<runtime::Module>(mod), static_cast<int>(kTVMModuleHandle));

PackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false);
PackedFunc set_output_zero_copy = mod.GetFunction("set_output_zero_copy", false);
PackedFunc run = mod.GetFunction("run", false);
set_input_zero_copy("A", a_val);
set_input_zero_copy("B", b_val);
set_input_zero_copy("C", c_val);

tvm::runtime::NDArray out0 = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
tvm::runtime::NDArray out1 = runtime::NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
set_output_zero_copy("elemwise_add", out0);
set_output_zero_copy("elemwise_sub", out1);

run();
float* p_out0 = (float*)out0->data;
float* p_out1 = (float*)out1->data;

// Check correctness.
for (int i = 0; i < n; ++i) {
ICHECK_LT(std::fabs(p_out0[i] - (i + (i + 1.0))), 1e-5);
}

for (int i = 0; i < n; ++i) {
ICHECK_LT(std::fabs(p_out1[i] - (i + (i + 1.0) - (i - 1.0))), 1e-5);
}
}
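
Following the reviewer's suggestion above, a comparison-style check could avoid hard-coding expected values: run once through the regular set_input/get_output path and once with set_output_zero_copy, and require identical results. A sketch under the assumption that `mod` is the graph executor module created in the test; the helper name is made up.

#include <cmath>

#include <tvm/runtime/logging.h>
#include <tvm/runtime/module.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/packed_func.h>

void CheckZeroCopyMatchesCopy(tvm::runtime::Module mod, tvm::runtime::NDArray a,
                              tvm::runtime::NDArray b, int n) {
  using namespace tvm::runtime;
  PackedFunc set_input = mod.GetFunction("set_input", false);
  PackedFunc get_output = mod.GetFunction("get_output", false);
  PackedFunc set_output_zero_copy = mod.GetFunction("set_output_zero_copy", false);
  PackedFunc run = mod.GetFunction("run", false);

  // Reference path: the executor owns the output; copy it out.
  set_input("A", a);
  set_input("B", b);
  run();
  NDArray ref = NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  get_output(0, ref);  // the two-argument form copies into `ref`

  // Zero-copy path: pre-bind a caller-owned buffer, then run again.
  NDArray out = NDArray::Empty({n}, {kDLFloat, 32, 1}, {kDLCPU, 0});
  set_output_zero_copy(0, out);
  run();

  // Both paths must agree element-wise.
  for (int i = 0; i < n; ++i) {
    ICHECK_LT(std::fabs(static_cast<float*>(out->data)[i] -
                        static_cast<float*>(ref->data)[i]),
              1e-5);
  }
}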