This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Pointwise fusion for GPU #15167

Merged
112 commits merged on Nov 1, 2019
Changes from 58 commits
Commits
112 commits
9653b67
Beginning of RTC of pointwise ops
ptrendx Apr 9, 2019
0e1774f
Code generation from the given JSON
ptrendx Apr 29, 2019
8bf2945
add initial simple_partition_pass and use it for pointwise fusion
Caenorst May 5, 2019
5cbb50d
fix the fusion, use a symbol.Copy() at the beginning of binding funct…
Caenorst May 5, 2019
fcf23c7
Fixes
Caenorst May 5, 2019
892c18f
Adding support for attribute inference for backward nodes when fusing
ptrendx May 23, 2019
0a342a0
keep proper input ordering for fused Op
Caenorst May 24, 2019
07de800
instantiate the indexed_graph before starting the subgraph replacemen…
Caenorst May 24, 2019
975e8a6
Fuse backward
ptrendx May 27, 2019
6d9c0bf
fix ordering of subgraph node inputs using subgraph topological order…
Caenorst May 29, 2019
384fbb0
exclude forward node fusion during the fusion of the nodes in the bac…
Caenorst May 29, 2019
b9506ff
Dealing with fused backward nodes inferattr
ptrendx Jun 3, 2019
f30fbbb
use subgraph.indexed_graph() instead of main for _FusedOpHelper nodes…
Caenorst Jun 4, 2019
1a2e30d
Adding support for other reqs in codegen
ptrendx Jun 4, 2019
15fbed5
Fix
ptrendx Jun 4, 2019
506b126
Cleaning
ptrendx Jun 4, 2019
cf88753
Change the TVM submodule
ptrendx Jun 4, 2019
b861af9
More cleaning
ptrendx Jun 4, 2019
d001b5d
Making linter happy
ptrendx Jun 5, 2019
48f1b94
Do fusion only if default context is GPU
ptrendx Jun 5, 2019
37d4bbf
Fixes for tests
ptrendx Jun 5, 2019
616b932
Fix the TVM commit
ptrendx Jun 6, 2019
56303c8
Fix lint
ptrendx Jun 6, 2019
00e61cf
Guard fusion with MXNET_USE_CUDA
ptrendx Jun 6, 2019
204ab30
Fix
ptrendx Jun 6, 2019
0e89f8c
Fix clang-tidy
ptrendx Jun 10, 2019
73a2a5c
Add erf and erfinv backward
ptrendx Jun 12, 2019
4d0f1c9
Gluon support for fusion
ptrendx Jun 10, 2019
3dddad7
Cleaning
ptrendx Jun 13, 2019
5067fa6
Cleaning and allow shape/type change in FusedOp
ptrendx Jun 13, 2019
b27a369
Fixing Gluon bugs
ptrendx Jun 13, 2019
f18847c
Fixing after rebase
ptrendx Jun 14, 2019
9a05327
Fixing race condition and guarding against races when using NVRTC
ptrendx Jun 14, 2019
309f9a7
Cleaning and renaming FusedOp to _FusedOp
ptrendx Jun 14, 2019
9617b03
Going easy on Windows compiler
ptrendx Jun 14, 2019
d730027
Merge branch 'upstream' into pr_fusion
ptrendx Jun 19, 2019
de9027b
Disable fusion on Windows for now
ptrendx Jun 19, 2019
3d2d715
Refactor InferAttr and InferShapeAttr
ptrendx Jun 19, 2019
5221677
Added slice and half2 support to FusedOp
nvchai Jun 17, 2019
f3e4f7a
Fix lint errors
nvchai Jun 17, 2019
84822e1
Added multiple types support for vector loading/storing
nvchai Jun 18, 2019
2896258
add slice fusion when it's at the beginning of subgraphs
Caenorst Jun 18, 2019
eb0151c
Removed constant ndim assumption in fused op
nvchai Jun 18, 2019
935342f
Fix memory alignment issue in slice for FusedOp
nvchai Jun 18, 2019
ffa6c63
Fixes
nvchai Jun 19, 2019
803fd2a
Fix lint errors
nvchai Jun 19, 2019
3ed3aef
Do not include cuda_fp16.h
ptrendx Jun 19, 2019
84c2df5
Refactor fused op op lists
ptrendx Jun 19, 2019
1d94365
Make linter happy
ptrendx Jun 19, 2019
844cb9f
Changes from review
ptrendx Jun 19, 2019
204b127
Fixes after rebase
ptrendx Jun 20, 2019
56eb99d
Expand FusedOp support for slice
nvchai Jun 19, 2019
e31b586
Fix for fp16 _zeros and _ones
ptrendx Jun 20, 2019
c611b56
Fix
ptrendx Jun 20, 2019
d0d0fcf
Moving aux functions to unnamed namespace and detail namespace -> fusion
ptrendx Jun 21, 2019
39e309f
Merge branch 'upstream' into pr_fusion
ptrendx Jun 25, 2019
7f12eac
Disabling fusion if it alters topological order of inputs
ptrendx Jun 25, 2019
654a358
Print code only when env variable is set
ptrendx Jun 25, 2019
32b690a
Fix
ptrendx Jun 25, 2019
39bfcf6
Fix lint and 2 tests that specify the same names for multiple inputs
ptrendx Jun 25, 2019
b109a38
Fixes from review and disabling fusion of slice with non-default step
ptrendx Jun 26, 2019
f1a14fd
Add amp_cast to fusion, fixes
ptrendx Jun 27, 2019
a72b980
Add amp_multicast and its backward to the list of support ops
ptrendx Jul 2, 2019
e4e674e
Apply wording suggestions from code review
ptrendx Jul 2, 2019
5766481
Apply wording suggestions from code review
ptrendx Jul 2, 2019
62513e6
Make clearer comment
ptrendx Jul 2, 2019
dd651d3
Adding punctuation and capitalization to \brief descriptions
ptrendx Jul 2, 2019
7974888
Fix
ptrendx Jul 2, 2019
2aa8950
Fix
ptrendx Jul 3, 2019
a96e778
Add backward_cast to fusion
ptrendx Jul 8, 2019
9ea5464
Adding unittests for fusion. Fix for erfinv_grad
ptrendx Jul 8, 2019
6c3a75a
Adding slice ops and add_n to tests
ptrendx Jul 9, 2019
6d0eaf3
Fixes from review
ptrendx Jul 10, 2019
70735f2
Setting inplace option
ptrendx Jul 11, 2019
9049086
Fix lint
ptrendx Jul 12, 2019
6f56a8b
Storing double in half
ptrendx Jul 12, 2019
171c24f
Retrigger CI
ptrendx Jul 19, 2019
26b19ed
Slight relaxing of the relative tolerance in the test
ptrendx Jul 23, 2019
551c3b7
Merge branch 'upstream' into pr_fusion
ptrendx Jul 23, 2019
912e831
Move the env variable check to the end
ptrendx Jul 24, 2019
052576e
Fix a race condition between InferShape and scheduled Forward
ptrendx Jul 25, 2019
0e1918f
Fix flaky test_fusion test involving fp32 erfinv op.
DickJC123 Jul 26, 2019
1bbdba6
Merge branch 'upstream' into pr_fusion
ptrendx Jul 26, 2019
7e1df6a
Fix from review
ptrendx Jul 29, 2019
7a92738
Added broadcast_like and slice_like to fused op
nvchai Jul 11, 2019
a1dee58
Minor fix and cleanup
nvchai Jul 11, 2019
36201fe
Added negative axis support in slice_axis, temporarily disabled fusio…
nvchai Aug 1, 2019
c077e97
Added axes support to slice_like
nvchai Aug 2, 2019
3f0bfb4
Added axis support to broadcast_like
nvchai Aug 2, 2019
1e20339
Add fast_load_slice function to fused op code
nvchai Aug 9, 2019
13b3076
Added runtime switch for choosing fast and slow slice kernel
nvchai Aug 14, 2019
e5649e1
Fix lint and warning
ptrendx Aug 20, 2019
868bcf6
Going easy on Windows compiler (again)
ptrendx Aug 21, 2019
1608d6a
Fix slice_like
ptrendx Sep 4, 2019
037a5de
Debug broadcast_like fusion
ptrendx Sep 4, 2019
e501bc9
Fix lint
ptrendx Sep 5, 2019
e0ca7d0
Fix lint
ptrendx Sep 10, 2019
8d3dc77
Trigger CI
ptrendx Sep 11, 2019
786b071
Get rid of the initializer list
ptrendx Sep 11, 2019
0720f66
Fix backward calls with different gradient type
ptrendx Sep 16, 2019
da8bfe3
avoid cycle when adding node specific for inputs of subgraph for poin…
Caenorst Sep 19, 2019
ed03595
Fix lint
ptrendx Sep 20, 2019
69facdc
Add namespace to the fusion implementations
ptrendx Sep 23, 2019
a5ee989
Merge branch 'upstream' into pr_fusion
ptrendx Oct 21, 2019
e26770b
Set launch bounds on the fused kernel
ptrendx Oct 21, 2019
80e36ba
Fix NumPy tests
ptrendx Oct 21, 2019
36e5ce8
Test showcasing an issue fixed in PR #16553
ptrendx Oct 22, 2019
f77fe5b
Cast scalars to FP32 and perform (a*1.0/b) instead of (a/b)
MoisesHer Oct 23, 2019
fdf710e
Merge branch 'upstream' into pr_fusion
ptrendx Oct 24, 2019
76aa154
Fix a bug in cycle detection for inputs only op in pointwise fusion
Caenorst Oct 28, 2019
929b8e9
Merge branch 'upstream' into pr_fusion
ptrendx Oct 29, 2019
3d1b5af
Add comments to simple_partition_pass.h file
ptrendx Oct 31, 2019
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 21935d to 88163e
45 changes: 45 additions & 0 deletions src/common/exec_utils.h
@@ -622,6 +622,51 @@ inline nnvm::Graph AssignContext(nnvm::Graph g,
return g;
}

inline void CopyGraph(nnvm::Graph *dst, const nnvm::Graph &src, bool copy_variables) {
using nnvm::Node;
using nnvm::NodePtr;
using nnvm::NodeEntry;
std::unordered_map<Node*, NodePtr> old_new;
// use DFSVisit to copy all the nodes
DFSVisit(src.outputs, [&old_new, copy_variables](const NodePtr& node) {
NodePtr np;
if (copy_variables || !node->is_variable()) {
np = Node::Create();
np->attrs = node->attrs;
} else {
np = node;
}
old_new[node.get()] = std::move(np);
});
// connect nodes of new graph
for (const auto &kv : old_new) {
for (const NodeEntry& e : kv.first->inputs) {
Node *ptr = e.node.get();
kv.second->inputs.emplace_back(NodeEntry{old_new[ptr], e.index, e.version});
}
for (const NodePtr& p : kv.first->control_deps) {
kv.second->control_deps.emplace_back(old_new[p.get()]);
}
}
// set the head
for (const NodeEntry &e : src.outputs) {
(*dst).outputs.emplace_back(NodeEntry{old_new[e.node.get()], e.index, e.version});
}
}

inline bool CheckForInputNameDuplicates(const nnvm::IndexedGraph &idx) {
std::set<std::string> names;
for (const auto& nid : idx.input_nodes()) {
const std::string &name = idx[nid].source->attrs.name;
if (names.count(name)) {
LOG(WARNING) << "Variable name " << name << " is used more than once!";
return false;
}
names.insert(name);
}
return true;
}

} // namespace common
} // namespace mxnet
#endif // MXNET_COMMON_EXEC_UTILS_H_
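
CopyGraph and CheckForInputNameDuplicates act as the safety net for the fusion pass added to graph_executor.cc below: the graph is cloned before fusion and the clone is restored if the pass breaks the input interface. A minimal usage sketch under those assumptions (RunPassWithFallback and its pass argument are illustrative names, not part of this PR):

```cpp
// Illustrative sketch only: snapshot a graph, verify its inputs are uniquely
// named, run an optimization pass, and fall back to the snapshot on failure.
#include <utility>
#include <nnvm/graph.h>
#include "common/exec_utils.h"

nnvm::Graph RunPassWithFallback(nnvm::Graph g,
                                nnvm::Graph (*pass)(nnvm::Graph&&)) {
  nnvm::Graph backup;
  // copy_variables = false: variable nodes are shared, only operator nodes are cloned.
  mxnet::common::CopyGraph(&backup, g, false);
  if (!mxnet::common::CheckForInputNameDuplicates(backup.indexed_graph())) {
    // Duplicate input names would make it impossible to match inputs after the pass.
    return g;
  }
  g = pass(std::move(g));
  // Fall back to the unmodified graph if the pass changed the set of inputs.
  if (g.indexed_graph().input_nodes().size() !=
      backup.indexed_graph().input_nodes().size()) {
    return backup;
  }
  return g;
}
```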
30 changes: 30 additions & 0 deletions src/executor/exec_pass.h
@@ -34,10 +34,30 @@
#include <vector>
#include <memory>
#include <string>
#include <utility>

namespace mxnet {
namespace exec {

template <typename Attr>
using FAccessSubgraphAttr = std::function<std::pair<std::vector<Attr>, std::vector<Attr>>
(const NodeAttrs& attrs)>;

using FAccessSubgraphShape = FAccessSubgraphAttr<mxnet::TShape>;
using FAccessSubgraphType = FAccessSubgraphAttr<int>;
using FAccessSubgraphStorageType = FAccessSubgraphAttr<int>;

template <typename Attr>
using FProvideSubgraphAttr = std::function<void (const NodeAttrs& attrs,
const std::vector<std::vector<Attr>> &in_attrs,
const std::vector<std::vector<Attr>> &out_attrs)>;
using FProvideSubgraphShape = FProvideSubgraphAttr<mxnet::TShape>;
using FProvideSubgraphType = FProvideSubgraphAttr<int>;
using FProvideSubgraphStorageType = FProvideSubgraphAttr<int>;

using TIsFusion = bool;
using TIsFusionHelper = bool;

/*! \brief reuse graph definition */
using nnvm::Graph;

@@ -170,6 +190,16 @@ void AttachOpResources(const Graph& g,
*/
Graph DetectInplaceAddTo(Graph g);

using NodeEntryMapCounter =
std::unordered_map<nnvm::NodeEntry, uint32_t, nnvm::NodeEntryHash, nnvm::NodeEntryEqual>;
/*! \brief
 * Counts how many times each output is used by another node (or by the outputs of the graph).
 */
NodeEntryMapCounter GetNodeEntryCount(const Graph& g);

Graph FusePointwiseForward(Graph&& g);
Graph FusePointwiseBackward(Graph&& g);

/*!
* \brief Infer shapes in the graph given the information.
* \param graph The input graph.
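
The FAccessSubgraphAttr and FProvideSubgraphAttr aliases added above let the attribute-inference passes pull inferred shapes, dtypes, and storage types out of the subgraph wrapped by a fused node, and push outer-graph information back into it. A rough sketch of what an FAccessSubgraphShape callback could look like; FusedOpParams and its storage in attrs.parsed are assumptions for illustration, not the actual _FusedOp implementation:

```cpp
// Sketch only: the callback signature matches FAccessSubgraphShape from
// exec_pass.h, but FusedOpParams and its storage in attrs.parsed are
// hypothetical placeholders rather than the real FusedOp state.
#include <utility>
#include <vector>
#include <nnvm/node.h>
#include <mxnet/tuple.h>

struct FusedOpParams {
  std::vector<mxnet::TShape> input_shapes;   // shapes inferred inside the subgraph
  std::vector<mxnet::TShape> output_shapes;  // shapes of the subgraph outputs
};

// Returns the (inputs, outputs) shape pair of the subgraph wrapped by a fused node.
std::pair<std::vector<mxnet::TShape>, std::vector<mxnet::TShape>>
FusedOpAccessShapes(const nnvm::NodeAttrs& attrs) {
  const FusedOpParams& params = nnvm::get<FusedOpParams>(attrs.parsed);
  return {params.input_shapes, params.output_shapes};
}
```

A matching FProvideSubgraphShape callback would go in the opposite direction, copying shapes inferred in the outer graph into the same per-node state.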
45 changes: 42 additions & 3 deletions src/executor/graph_executor.cc
Expand Up @@ -26,6 +26,7 @@
#include <nnvm/graph.h>
#include <nnvm/pass_functions.h>
#include <vector>
#include <set>
#include <algorithm>

#include "./exec_pass.h"
@@ -325,6 +326,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
if (!need_grad_) return g;
for (size_t i = 0; i < g.outputs.size(); ++i) {
NodeEntry ngrad(nnvm::Node::Create(), 0, 0);
ngrad.node->attrs.name = "_head_grad_" + std::to_string(i);
head_grad_entry_.emplace_back(AttrHint(ngrad, g.outputs[i]));
head_grad_map_[ngrad.node.get()] = i;
}
@@ -365,6 +367,7 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
for (const auto &e : g_grad.outputs) {
g.outputs.push_back(e);
}

return g;
}

@@ -784,6 +787,7 @@ void GraphExecutor::Init(nnvm::Symbol symbol,
const nnvm::NodeEntryMap<NDArray>& feed_dict) {
nnvm::Graph g = InitGraph(symbol, default_ctx, ctx_map, in_arg_ctxes, arg_grad_ctxes,
aux_state_ctxes, grad_req_types);

// The following code of shape and dtype inferences and argument
// initialization is for simple_bind only. Regular bind operation
// should do this differently.
@@ -964,6 +968,7 @@ Executor* GraphExecutor::Reshape(const bool partial_shaping,
this);
return exec;
}

/*!
* \brief This function is triggered by both simple_bind
* and bind flows.
@@ -981,6 +986,40 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol,
// setup gradient
nnvm::Graph g = InitFullGraph(symbol, grad_req_types);

#if MXNET_USE_CUDA && !defined(_WIN32)
if (dmlc::GetEnv("MXNET_USE_FUSION", true) && default_ctx.dev_mask() == Context::kGPU) {
nnvm::Graph unoptimized_graph;
common::CopyGraph(&unoptimized_graph, g, false);

if (common::CheckForInputNameDuplicates(unoptimized_graph.indexed_graph())) {
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseForward(std::move(g));
g.attrs["num_forward_outputs"] = std::make_shared<nnvm::any>(num_forward_outputs_);
g = FusePointwiseBackward(std::move(g));
// Check the topological order of inputs
const auto &original_inputs = unoptimized_graph.indexed_graph().input_nodes();
const auto &new_inputs = g.indexed_graph().input_nodes();
if (original_inputs.size() != new_inputs.size()) {
LOG(WARNING)
<< "Number of inputs after fusion does not match original number of inputs. "
<< "This is most probably a bug. Disabling fusion for this run.";
g = unoptimized_graph;
} else {
for (size_t i = 0; i < new_inputs.size(); ++i) {
if (unoptimized_graph.indexed_graph()[original_inputs[i]].source->attrs.name !=
g.indexed_graph()[new_inputs[i]].source->attrs.name) {
LOG(WARNING) << "Disabling fusion due to altered topological order of inputs.";
g = unoptimized_graph;
break;
}
}
}
} else {
LOG(WARNING) << "Graph contains duplicate names for some of its inputs - fusion is NOT enabled!";
}
}
#endif // MXNET_USE_CUDA

// create "device" and "context" attrs for the graph
g = AssignContext(g, default_ctx, ctx_map,
in_arg_ctxes,
@@ -1897,7 +1936,7 @@ Executor *Executor::SimpleBind(nnvm::Symbol symbol,
arg_stype_map, default_ctx, group2ctx, &tmp_in_arg_ctxes,
&tmp_arg_grad_ctxes, &tmp_grad_req_types, &tmp_aux_state_ctxes);
}
exec->Init(symbol, default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_arg_ctxes, tmp_arg_grad_ctxes,
tmp_aux_state_ctxes, arg_shape_map, arg_dtype_map, arg_stype_map, tmp_grad_req_types,
shared_arg_names, in_args, arg_grads, aux_states, shared_buffer, shared_exec);
return exec;
@@ -1922,8 +1961,8 @@ Executor *Executor::Bind(nnvm::Symbol symbol,
exec::BuildSubgraph(symbol, exec->subgraph_property(), default_ctx, group2ctx, &tmp_in_args,
&tmp_arg_grad_store, &tmp_grad_req_type, &tmp_aux_states);
}
exec->Init(symbol, default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store, tmp_grad_req_type,
tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
exec->Init(symbol.Copy(), default_ctx, group2ctx, tmp_in_args, tmp_arg_grad_store,
tmp_grad_req_type, tmp_aux_states, reinterpret_cast<Executor*>(shared_exec));
return exec;
}
} // namespace mxnet
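
The whole pass is gated by the MXNET_USE_FUSION environment variable, read through dmlc::GetEnv with a default of true, so fusion is active by default on non-Windows CUDA builds with a GPU default context and can be switched off by setting MXNET_USE_FUSION=0. A minimal standalone sketch of that convention (illustration only, not code from this PR):

```cpp
// Sketch only: demonstrates the dmlc::GetEnv convention used by the fusion gate.
// When MXNET_USE_FUSION is unset, the second argument (true) is returned, so
// fusion is enabled unless the user explicitly sets MXNET_USE_FUSION=0.
#include <dmlc/parameter.h>
#include <iostream>

int main() {
  const bool use_fusion = dmlc::GetEnv("MXNET_USE_FUSION", true);
  std::cout << "pointwise fusion is "
            << (use_fusion ? "enabled" : "disabled") << std::endl;
  return 0;
}
```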