This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Pointwise fusion for GPU #15167

Merged
112 commits merged on Nov 1, 2019
Changes from 58 commits
Commits
112 commits
9653b67
Beginning of RTC of pointwise ops
ptrendx Apr 9, 2019
0e1774f
Code generation from the given JSON
ptrendx Apr 29, 2019
8bf2945
add initial simple_partition_pass and use it for pointwise fusion
Caenorst May 5, 2019
5cbb50d
fix the fusion, use a symbol.Copy() at the beginning of binding funct…
Caenorst May 5, 2019
fcf23c7
Fixes
Caenorst May 5, 2019
892c18f
Adding support for attribute inference for backward nodes when fusing
ptrendx May 23, 2019
0a342a0
keep proper input ordering for fused Op
Caenorst May 24, 2019
07de800
instantiate the indexed_graph before starting the subgraph replacemen…
Caenorst May 24, 2019
975e8a6
Fuse backward
ptrendx May 27, 2019
6d9c0bf
fix ordering of subgraph node inputs using subgraph topological order…
Caenorst May 29, 2019
384fbb0
exclude forward node fusion during the fusion of the nodes in the bac…
Caenorst May 29, 2019
b9506ff
Dealing with fused backward nodes inferattr
ptrendx Jun 3, 2019
f30fbbb
use subgraph.indexed_graph() instead of main for _FusedOpHelper nodes…
Caenorst Jun 4, 2019
1a2e30d
Adding support for other reqs in codegen
ptrendx Jun 4, 2019
15fbed5
Fix
ptrendx Jun 4, 2019
506b126
Cleaning
ptrendx Jun 4, 2019
cf88753
Change the TVM submodule
ptrendx Jun 4, 2019
b861af9
More cleaning
ptrendx Jun 4, 2019
d001b5d
Making linter happy
ptrendx Jun 5, 2019
48f1b94
Do fusion only if default context is GPU
ptrendx Jun 5, 2019
37d4bbf
Fixes for tests
ptrendx Jun 5, 2019
616b932
Fix the TVM commit
ptrendx Jun 6, 2019
56303c8
Fix lint
ptrendx Jun 6, 2019
00e61cf
Guard fusion with MXNET_USE_CUDA
ptrendx Jun 6, 2019
204ab30
Fix
ptrendx Jun 6, 2019
0e89f8c
Fix clang-tidy
ptrendx Jun 10, 2019
73a2a5c
Add erf and erfinv backward
ptrendx Jun 12, 2019
4d0f1c9
Gluon support for fusion
ptrendx Jun 10, 2019
3dddad7
Cleaning
ptrendx Jun 13, 2019
5067fa6
Cleaning and allow shape/type change in FusedOp
ptrendx Jun 13, 2019
b27a369
Fixing Gluon bugs
ptrendx Jun 13, 2019
f18847c
Fixing after rebase
ptrendx Jun 14, 2019
9a05327
Fixing race condition and guarding against races when using NVRTC
ptrendx Jun 14, 2019
309f9a7
Cleaning and renaming FusedOp to _FusedOp
ptrendx Jun 14, 2019
9617b03
Going easy on Windows compiler
ptrendx Jun 14, 2019
d730027
Merge branch 'upstream' into pr_fusion
ptrendx Jun 19, 2019
de9027b
Disable fusion on Windows for now
ptrendx Jun 19, 2019
3d2d715
Refactor InferAttr and InferShapeAttr
ptrendx Jun 19, 2019
5221677
Added slice and half2 support to FusedOp
nvchai Jun 17, 2019
f3e4f7a
Fix lint errors
nvchai Jun 17, 2019
84822e1
Added multiple types support for vector loading/storing
nvchai Jun 18, 2019
2896258
add slice fusion when it's at the beginning of subgraphs
Caenorst Jun 18, 2019
eb0151c
Removed constant ndim assumption in fused op
nvchai Jun 18, 2019
935342f
Fix memory alignment issue in slice for FusedOp
nvchai Jun 18, 2019
ffa6c63
Fixes
nvchai Jun 19, 2019
803fd2a
Fix lint errors
nvchai Jun 19, 2019
3ed3aef
Do not include cuda_fp16.h
ptrendx Jun 19, 2019
84c2df5
Refactor fused op op lists
ptrendx Jun 19, 2019
1d94365
Make linter happy
ptrendx Jun 19, 2019
844cb9f
Changes from review
ptrendx Jun 19, 2019
204b127
Fixes after rebase
ptrendx Jun 20, 2019
56eb99d
Expand FusedOp support for slice
nvchai Jun 19, 2019
e31b586
Fix for fp16 _zeros and _ones
ptrendx Jun 20, 2019
c611b56
Fix
ptrendx Jun 20, 2019
d0d0fcf
Moving aux functions to unnamed namespace and detail namespace -> fusion
ptrendx Jun 21, 2019
39e309f
Merge branch 'upstream' into pr_fusion
ptrendx Jun 25, 2019
7f12eac
Disabling fusion if it alters topological order of inputs
ptrendx Jun 25, 2019
654a358
Print code only when env variable is set
ptrendx Jun 25, 2019
32b690a
Fix
ptrendx Jun 25, 2019
39bfcf6
Fix lint and 2 tests that specify the same names for multiple inputs
ptrendx Jun 25, 2019
b109a38
Fixes from review and disabling fusion of slice with non-default step
ptrendx Jun 26, 2019
f1a14fd
Add amp_cast to fusion, fixes
ptrendx Jun 27, 2019
a72b980
Add amp_multicast and its backward to the list of support ops
ptrendx Jul 2, 2019
e4e674e
Apply wording suggestions from code review
ptrendx Jul 2, 2019
5766481
Apply wording suggestions from code review
ptrendx Jul 2, 2019
62513e6
Make clearer comment
ptrendx Jul 2, 2019
dd651d3
Adding punctuation and capitalization to \brief descriptions
ptrendx Jul 2, 2019
7974888
Fix
ptrendx Jul 2, 2019
2aa8950
Fix
ptrendx Jul 3, 2019
a96e778
Add backward_cast to fusion
ptrendx Jul 8, 2019
9ea5464
Adding unittests for fusion. Fix for erfinv_grad
ptrendx Jul 8, 2019
6c3a75a
Adding slice ops and add_n to tests
ptrendx Jul 9, 2019
6d0eaf3
Fixes from review
ptrendx Jul 10, 2019
70735f2
Setting inplace option
ptrendx Jul 11, 2019
9049086
Fix lint
ptrendx Jul 12, 2019
6f56a8b
Storing double in half
ptrendx Jul 12, 2019
171c24f
Retrigger CI
ptrendx Jul 19, 2019
26b19ed
Slight relaxing of the relative tolerance in the test
ptrendx Jul 23, 2019
551c3b7
Merge branch 'upstream' into pr_fusion
ptrendx Jul 23, 2019
912e831
Move the env variable check to the end
ptrendx Jul 24, 2019
052576e
Fix a race condition between InferShape and scheduled Forward
ptrendx Jul 25, 2019
0e1918f
Fix flaky test_fusion test involving fp32 erfinv op.
DickJC123 Jul 26, 2019
1bbdba6
Merge branch 'upstream' into pr_fusion
ptrendx Jul 26, 2019
7e1df6a
Fix from review
ptrendx Jul 29, 2019
7a92738
Added broadcast_like and slice_like to fused op
nvchai Jul 11, 2019
a1dee58
Minor fix and cleanup
nvchai Jul 11, 2019
36201fe
Added negative axis support in slice_axis, temporarily disabled fusio…
nvchai Aug 1, 2019
c077e97
Added axes support to slice_like
nvchai Aug 2, 2019
3f0bfb4
Added axis support to broadcast_like
nvchai Aug 2, 2019
1e20339
Add fast_load_slice function to fused op code
nvchai Aug 9, 2019
13b3076
Added runtime switch for choosing fast and slow slice kernel
nvchai Aug 14, 2019
e5649e1
Fix lint and warning
ptrendx Aug 20, 2019
868bcf6
Going easy on Windows compiler (again)
ptrendx Aug 21, 2019
1608d6a
Fix slice_like
ptrendx Sep 4, 2019
037a5de
Debug broadcast_like fusion
ptrendx Sep 4, 2019
e501bc9
Fix lint
ptrendx Sep 5, 2019
e0ca7d0
Fix lint
ptrendx Sep 10, 2019
8d3dc77
Trigger CI
ptrendx Sep 11, 2019
786b071
Get rid of the initializer list
ptrendx Sep 11, 2019
0720f66
Fix backward calls with different gradient type
ptrendx Sep 16, 2019
da8bfe3
avoid cycle when adding node specific for inputs of subgraph for poin…
Caenorst Sep 19, 2019
ed03595
Fix lint
ptrendx Sep 20, 2019
69facdc
Add namespace to the fusion implementations
ptrendx Sep 23, 2019
a5ee989
Merge branch 'upstream' into pr_fusion
ptrendx Oct 21, 2019
e26770b
Set launch bounds on the fused kernel
ptrendx Oct 21, 2019
80e36ba
Fix NumPy tests
ptrendx Oct 21, 2019
36e5ce8
Test showcasing an issue fixed in PR #16553
ptrendx Oct 22, 2019
f77fe5b
Cast scalars to FP32 and perform (a*1.0/b) instead of (a/b)
MoisesHer Oct 23, 2019
fdf710e
Merge branch 'upstream' into pr_fusion
ptrendx Oct 24, 2019
76aa154
Fix a bug in cycle detection for inputs only op in pointwise fusion
Caenorst Oct 28, 2019
929b8e9
Merge branch 'upstream' into pr_fusion
ptrendx Oct 29, 2019
3d1b5af
Add comments to simple_partition_pass.h file
ptrendx Oct 31, 2019
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 21935d to 88163e
45 changes: 45 additions & 0 deletions src/common/exec_utils.h
@@ -622,6 +622,51 @@ inline nnvm::Graph AssignContext(nnvm::Graph g,
return g;
}

inline void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables) {
using nnvm::Node;
using nnvm::NodePtr;
using nnvm::NodeEntry;
std::unordered_map<Node*, NodePtr> old_new;
// use DFSVisit to copy all the nodes
DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) {
NodePtr np;
if (copy_variables || !node->is_variable()) {
np = Node::Create();
np->attrs = node->attrs;
} else {
np = node;
}
old_new[node.get()] = std::move(np);
});
// connect nodes of new graph
for (const auto &kv : old_new) {
for (const NodeEntry& e : kv.first->inputs) {
Node *ptr = e.node.get();
kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version});
}
for (const NodePtr& p : kv.first->control_deps) {
kv.second->control_deps.emplace_back(old_new[p.get()]);
}
}
// set the head
for (const NodeEntry &e : src.outputs) {
(*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version});
}
}

inline bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) {
std::set<std::string> names;
for (const auto& nid : idx.input_nodes()) {
const std::string &name = idx[nid].source->attrs.name;
if (names.count(name)) {
LOG(WARNING) << "Variable name " << name << " is used more than once!";
return false;
}
names.insert(name);
}
return true;
}

} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_EXEC_UTILS_H_
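
CopyGraph and CheckForInputNameDuplicates act as the safety net for the fusion pass added to graph_executor.cc below: the graph is cloned before fusion and the clone is restored if the pass breaks the input interface. A minimal usage sketch under those assumptions (RunPassWithFallback and its pass argument are illustrative names, not part of this PR):

```cpp
// Illustrative sketch only: snapshot a graph, verify its inputs are uniquely
// named, run an optimization pass, and fall back to the snapshot on failure.
#include <utility>
#include <nnvm/graph.h>
#include "common/exec_utils.h"

nnvm::Graph RunPassWithFallback(nnvm::Graph g,
                                nnvm::Graph (*pass)(nnvm::Graph&&)) {
  nnvm::Graph backup;
  // copy_variables = false: variable nodes are shared, only operator nodes are cloned.
  mxnet::common::CopyGraph(&backup, g, false);
  if (!mxnet::common::CheckForInputNameDuplicates(backup.indexed_graph())) {
    // Duplicate input names would make it impossible to match inputs after the pass.
    return g;
  }
  g = pass(std::move(g));
  // Fall back to the unmodified graph if the pass changed the set of inputs.
  if (g.indexed_graph().input_nodes().size() !=
      backup.indexed_graph().input_nodes().size()) {
    return backup;
  }
  return g;
}
```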
30 changes: 30 additions & 0 deletions src/executor/exec_pass.h
@@ -34,10 +34,30 @@
#include <vector>
#include <memory>
#include <string>
#include <utility>

namespace mxnet {
namespace exec {

template <typename Attr>
using FAccessSubgraphAttr = std::function<std::pair<std::vector<Attr>, std::vector<Attr>>
(const NodeAttrs& attrs)>;

using FAccessSubgraphShape = FAccessSubgraphAttr<mxnet::TShape>;
using FAccessSubgraphType = FAccessSubgraphAttr<int>;
using FAccessSubgraphStorageType = FAccessSubgraphAttr<int>;

template <typename Attr>
using FProvideSubgraphAttr = std::function<void (const NodeAttrs& attrs,
const std::vector<std::vector<Attr>> &in_attrs,
const std::vector<std::vector<Attr>> &out_attrs)>;
using FProvideSubgraphShape = FProvideSubgraphAttr<mxnet::TShape>;
using FProvideSubgraphType = FProvideSubgraphAttr<int>;
using FProvideSubgraphStorageType = FProvideSubgraphAttr<int>;

using TIsFusion = bool;
using TIsFusionHelper = bool;

/*! \brief reuse graph definition */
using nnvm::Graph;

@@ -170,6 +190,16 @@ void AttachOpResources(const Graph& g,
*/
Graph DetectInplaceAddTo(Graph g);

using NodeEntryMapCounter =
std::unordered_map<nnvm::NodeEntry, uint32_t, nnvm::NodeEntryHash, nnvm::NodeEntryEqual>;
/*! \brief
 * Counts how many times each output is used by another node (or by the outputs of the graph).
 */
NodeEntryMapCounter GetNodeEntryCount(const Graph& g);

Graph FusePointwiseForward(Graph&& g);
Graph FusePointwiseBackward(Graph&& g);

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
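
The FAccessSubgraphAttr and FProvideSubgraphAttr aliases added above let the attribute-inference passes pull inferred shapes, dtypes, and storage types out of the subgraph wrapped by a fused node, and push outer-graph information back into it. A rough sketch of what an FAccessSubgraphShape callback could look like; FusedOpParams and its storage in attrs.parsed are assumptions for illustration, not the actual _FusedOp implementation:

```cpp
// Sketch only: the callback signature matches FAccessSubgraphShape from
// exec_pass.h, but FusedOpParams and its storage in attrs.parsed are
// hypothetical placeholders rather than the real FusedOp state.
#include <utility>
#include <vector>
#include <nnvm/node.h>
#include <mxnet/tuple.h>

struct FusedOpParams {
  std::vector<mxnet::TShape> input_shapes;   // shapes inferred inside the subgraph
  std::vector<mxnet::TShape> output_shapes;  // shapes of the subgraph outputs
};

// Returns the (inputs, outputs) shape pair of the subgraph wrapped by a fused node.
std::pair<std::vector<mxnet::TShape>, std::vector<mxnet::TShape>>
FusedOpAccessShapes(const nnvm::NodeAttrs& attrs) {
  const FusedOpParams& params = nnvm::get<FusedOpParams>(attrs.parsed);
  return {params.input_shapes, params.output_shapes};
}
```

A matching FProvideSubgraphShape callback would go in the opposite direction, copying shapes inferred in the outer graph into the same per-node state.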
45 changes: 42 additions & 3 deletions src/executor/graph_executor.cc
Expand Up @@ -26,6 +26,7 @@
#include <nnvm/graph.h>
#include <nnvm/pass_functions.h>
#include <vector>
#include <set>
#include <algorithm>

#include "./exec_pass.h"
@@ -325,6 +326,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
if (!need_grad_) return g;
for (size_t i = 0; i < g.outputs.size(); ++i) {
NodeEntry ngrad(nnvm::Node::Create(), 0, 0);
ngrad.node->attrs.name = "_head_grad_" + std::to_string(i);
head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i]));
head_grad_map_[ngrad.node.get()] = i;
}
@@ -365,6 +367,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
for (const auto &e : g_grad.outputs) {
g.outputs.push_back(e);
}

return g;
}

@@ -784,6 +787,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol,
const nnvm::NodeEntryMap<NDArray>& feed_dict) {
nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes,
aux_state_ctxes, grad_req_types);

// The following code of shape and dtype inferences and argument
// initialization is for simple_bind only. Regular bind operation
// should do this differently.
@@ -964,6 +968,7 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
this);
return exec;
}

/*!
* \brief This function is triggered by both simple_bind
* and bind flows.
@@ -981,6 +986,40 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);

if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) {
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseForward(std::move(g));
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseBackward(std::move(g));
// Check the topological order of inputs
const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes();
const auto &new_inputs = g.indexed_graph().input_nodes();
if (original_inputs.size() != new_inputs.size()) {
LOG(WARNING)
<< "Number of inputs after fusion does not match original number of inputs. "
<< "This is most probably a bug. Disabling fusion for this run.";
g = unoptimized_graph;
} else {
for (size_t i = 0; i < new_inputs.size(); ++i) {
if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name !=
g.indexed_graph()[new_inputs[i]].source->attrs.name) {
LOG(WARNING) << "Disabling fusion due to altered topological order of inputs.";
g = unoptimized_graph;
break;
}
}
}
} else {
LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
in_arg_ctxes,
@@ -1897,7 +1936,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
arg_stype_map, default_ctx, group2ctx, &tmp_in_arg_ctxes,
&tmp_arg_grad_ctxes, &tmp_grad_req_types, &tmp_aux_state_ctxes);
}
exec->Init(symbol, default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, tmp_grad_req_types,
shared_arg_names, in_args, arg_grads, aux_states, shared_buffer, shared_exec);
return exec;
@@ -1922,8 +1961,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
exec::BuildSubgraph(symbol, exec->subgraph_property(), default_ctx, group2ctx, &tmp_in_args,
&tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states);
}
exec->Init(symbol, default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, tmp_grad_req_type,
tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
tmp_grad_req_type, tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
return exec;
}
} // namespace mxnet
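
The whole pass is gated by the MXNET_USE_FUSION environment variable, read through dmlc::GetEnv with a default of true, so fusion is active by default on non-Windows CUDA builds with a GPU default context and can be switched off by setting MXNET_USE_FUSION=0. A minimal standalone sketch of that convention (illustration only, not code from this PR):

```cpp
// Sketch only: demonstrates the dmlc::GetEnv convention used by the fusion gate.
// When MXNET_USE_FUSION is unset, the second argument (true) is returned, so
// fusion is enabled unless the user explicitly sets MXNET_USE_FUSION=0.
#include <dmlc/parameter.h>
#include <iostream>

int main() {
  const bool use_fusion = dmlc::GetEnv("MXNET_USE_FUSION", true);
  std::cout << "pointwise fusion is "
            << (use_fusion ? "enabled" : "disabled") << std::endl;
  return 0;
}
```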