From 1ed28aeb5967f8705d28f9d4a5ddcb834843e4a2 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Thu, 9 Aug 2018 02:07:44 +0900
Subject: [PATCH 01/77] [NNVM] Enhance operator fusion for more element wise
 patterns (#1548)

---
 nnvm/src/compiler/graph_fuse.cc              | 97 ++++++++++++++++++++
 nnvm/tests/python/compiler/test_op_fusion.py | 44 ++++++++-
 topi/python/topi/arm_cpu/conv2d.py           |  5 +-
 topi/python/topi/util.py                     | 26 ++++--
 4 files changed, 158 insertions(+), 14 deletions(-)
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index d4e668972593..247224aa9bfe 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -161,6 +161,103 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
       }
     }
   }
+
+  /*
+     The algorithm above will not fuse a node whose output is fed to more than one
+     child node. This is because, in general, it does not make sense to fuse multiple
+     child branches with their parent, as in the following example.
+
+            conv2d
+            /  |  \
+           /   |   \
+         op    op   op
+          |    |    |
+          |    |    |
+
+     However, when all child branches meet at a certain node, there is a possibility for
+     further operator fusion. For example, all nodes in the following subgraph can be fused
+     into a single node if the three 'in-between' nodes and the bottom node are all
+     element-wise operations.
+
+            conv2d
+            /  |  \
+           /   |   \
+         op    op   op
+          \    |    /
+           \   |   /
+          elemwise add
+               |
+
+     This pattern is not uncommon. For example, it arises when conv2d is followed by an exponential
+     linear unit. If bias add and batch normalization are also present, they can be fused as well.
+
+     In fact, the fusion algorithm above already fuses the three in-between nodes and the
+     element-wise add node in the figure above. The following code fuses the conv2d node with the
+     already fused child nodes. The following patterns are supported.
+
+     * Any number of child nodes from the top node
+     * The path from the top node to bottom node can contain any number of element wise ops.
+
+     The only restriction is that in-between nodes cannot have more than one child.
+
+     The overview of the algorithm below is as follows:
+
+     1. Check if all children nodes are fused into a single op by the existing fusion algorithm
+     2. Fuse the parent node to children nodes, and update its group id to be the children's group id
+     3. If the parent node originally belongs to another group (for example, conv + batch norm),
+        propagate the new group id to a grand parent and upward
+  */
+  if (opt_level >= 1) {
+    std::vector<std::vector<uint32_t> > children_group_ids(idx.num_nodes());
+    std::vector<std::vector<uint32_t> > node_ids_per_group(idx.num_nodes());
+    for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) {  // visits ids N-1 down to 1
+      const auto& inode = idx[nid];
+      if (inode.source->is_variable()) continue;  // variables are not recorded in any group
+      CHECK_NE(group_vec[nid], -1);
+      node_ids_per_group[group_vec[nid]].push_back(nid);
+      if (inode.inputs.size() != 1) continue;  // only single-input nodes are recorded as children
+      const uint32_t parent_nid = inode.inputs[0].node_id;
+      // If the parent is referenced more than once, remember which group this child belongs to.
+      if (ref_count[parent_nid] > 1) children_group_ids[parent_nid].push_back(group_vec[nid]);
+    }
+
+    std::vector<int> new_group_id(idx.num_nodes(), -1);  // old group id -> new id, or -1
+    for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) {
+      if (new_group_id[group_vec[nid]] != -1) {
+        // This node's group was merged into a child's group below; adopt the new group id.
+        group_vec[nid] = new_group_id[group_vec[nid]];
+      }
+      TOpPattern pt = op_pattern.get(idx[nid].source->op(), kOpaque);
+      if (pt == kOpaque) continue;  // opaque parents are left alone
+      const auto& group_ids = children_group_ids[nid];
+      if (group_ids.size() <= 1) continue;  // needs at least two recorded children
+      const uint32_t child_group_id = group_ids[0];
+      const auto& children_node_ids = node_ids_per_group[child_group_id];
+
+      auto is_same_group_id = [child_group_id](uint32_t id) {
+          return id == child_group_id;
+      };
+      auto is_fusible_pattern = [&idx](uint32_t child_nid) {
+        TOpPattern child_pt = op_pattern.get(idx[child_nid].source->op(), kOpaque);
+        return child_pt  <= kBroadcast;
+      };
+      // Fuse this node with its children only if
+      // all recorded children belong to the same group and
+      // every node in that group has a pattern no greater than kBroadcast.
+      const bool can_be_fused = std::all_of(group_ids.begin(), group_ids.end(), is_same_group_id) &&
+        std::all_of(children_node_ids.begin(), children_node_ids.end(), is_fusible_pattern);
+
+      if (can_be_fused) {
+        new_group_id[group_vec[nid]] = child_group_id;  // consulted by later (lower) node ids
+        group_vec[nid] = child_group_id;
+        for (uint32_t nid2 : node_ids_per_group[child_group_id]) {
+          pattern_vec[nid2] = pattern_vec[nid];  // children take over the parent's pattern
+          master_vec[nid2] = master_vec[nid];  // ... and the parent's master node
+        }
+      }
+    }
+  }
+
   g.attrs["group_root"] = std::make_shared<any>(std::move(group_vec));
   g.attrs["group_master"] = std::make_shared<any>(std::move(master_vec));
   g.attrs["pattern"] = std::make_shared<any>(std::move(pattern_vec));
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index f33e18197840..8d05ae02c579 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -5,7 +5,7 @@
 from tvm.contrib import graph_runtime
 from nnvm import symbol as sym
 from nnvm.compiler import graph_util, graph_attr
-from nnvm.testing import ctx_list
+from nnvm.testing import ctx_list, utils
 
 def test_ewise_injective():
     x = sym.Variable("x")
@@ -77,7 +77,49 @@ def test_injective_reduce_injective():
         np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
+def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
+    """Build `sym` at `opt_level`, run it on `data`; return (output as numpy, compiled graph)."""
+    with nnvm.compiler.build_config(opt_level=opt_level):
+        graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
+    runtime = graph_runtime.create(graph, lib, ctx)
+    runtime.set_input(**params)
+    runtime.set_input("data", data)
+    runtime.run()
+    return runtime.get_output(0, tvm.nd.empty(out_shape)).asnumpy(), graph
+
+
+def test_fuse_conv2d_elu():
+    """Check conv2d + batch_norm + ELU: the opt_level=2 output must match opt_level=0."""
+    def elu(data):
+        # ELU with alpha = 0.5, written with relu and exp only
+        return -0.5 * sym.relu(1 - sym.exp(data)) + sym.relu(data)
+
+    def get_sym(out_channel):
+        net = sym.Variable(name="data")
+        net = sym.conv2d(data=net, kernel_size=(3,3), channels=out_channel, padding=(1, 1),
+                         layout="NCHW", kernel_layout="OIHW", use_bias=True)
+        return elu(sym.batch_norm(net))
+
+    in_channel, out_channel, size = 8, 16, 64
+    # NCHW shapes of the network input and of its output
+    dshape = (1, in_channel, size, size)
+    oshape = (1, out_channel, size, size)
+    data = np.random.uniform(-1, 1, dshape).astype(np.float32)
+
+    for target, ctx in ctx_list():
+        fused_sym = get_sym(out_channel)
+        plain_sym = get_sym(out_channel)
+        _, fused_params = utils.create_workload(fused_sym, 1, dshape[1:], seed=0)
+        _, plain_params = utils.create_workload(plain_sym, 1, dshape[1:], seed=0)
+        out_fused, g_fused = build_and_run(fused_sym, fused_params, data, oshape, target, ctx, 2)
+        out_plain, _ = build_and_run(plain_sym, plain_params, data, oshape, target, ctx, 0)
+        np.testing.assert_allclose(out_fused, out_plain, rtol=1e-5, atol=1e-5)
+        # expected nodes after fusion: data, conv weight, bias,
+        # batch norm gamma, batch norm beta, conv op
+        assert g_fused.index.num_nodes == 6
+
 if __name__ == "__main__":
     test_injective_reduce_injective()
     test_ewise_injective()
     test_conv_ewise_injective()
+    test_fuse_conv2d_elu()
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index e28c08cb93ee..f5dbec8e552b 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -39,11 +39,10 @@ def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype):
 def schedule_conv2d_nchw_arm_cpu(cfg, outs):
     """TOPI schedule callback"""
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
 
     def _callback(op):
         # schedule conv2d
-        if 'spatial_conv_output' in op.tag and op not in scheduled_ops:
+        if 'spatial_conv_output' in op.tag:
             output = op.output(0)
             conv = op.input_tensors[0]
 
@@ -65,8 +64,6 @@ def _callback(op):
             output = op.output(0)
             _schedule_winograd(cfg, s, output, outs[0])
 
-        scheduled_ops.append(op)
-
     traverse_inline(s, outs[0].op, _callback)
     return s
 
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index b5d5dd2b99ad..71e123e83475 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -5,25 +5,33 @@
 
 from . import tag
 
-def traverse_inline(s, op, callback):
+def traverse_inline(s, final_op, callback):
     """Traverse computation graph and do auto inline
 
     Parameters
     ----------
     s: schedule
         The schedule
-    op: Operation
+    final_op: Operation
         The final output operator.
     callback: callable
         The callback function on each op
     """
-    if tag.is_injective(op.tag):
-        if op not in s.outputs:
-            s[op].compute_inline()
-        for tensor in op.input_tensors:
-            if tensor.op.input_tensors:
-                traverse_inline(s, tensor.op, callback)
-    callback(op)
+    visited = set()  # ops already traversed; each op is handled once
+
+    def _traverse(op):
+        if op in visited:
+            return
+        visited.add(op)
+        if tag.is_injective(op.tag):
+            if op not in s.outputs:
+                s[op].compute_inline()  # inline injective ops that are not outputs
+            for tensor in op.input_tensors:
+                if tensor.op.input_tensors:  # do not descend into ops without inputs
+                    _traverse(tensor.op)
+        callback(op)  # runs after the op's inputs have been traversed
+
+    _traverse(final_op)
 
 
 def prod(x):

From 60da47055a1e9f2ef3ecfa6efc70a0c8d21ddf5e Mon Sep 17 00:00:00 2001
From: "Tang, Cheng" <souptc@gmail.com>
Date: Wed, 8 Aug 2018 10:11:24 -0700
Subject: [PATCH 02/77] add tvm external registry entrypoint (#1562)

---
 include/tvm/base.h                 | 6 ++++++
 nnvm/src/compiler/graph_runtime.cc | 2 +-
 src/api/api_base.cc                | 4 ++++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/tvm/base.h b/include/tvm/base.h
index d113f45352bb..1d7cf8add3ca 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -83,5 +83,11 @@ struct NodeFactoryReg
       ::dmlc::Registry<::tvm::NodeFactoryReg>::Get()->__REGISTER__(TypeName::_type_key) \
       .set_body([]() { return std::make_shared<TypeName>(); })
 
+TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry();
+// Registers TypeName's node factory through GetTVMNodeFactoryRegistry().
+#define TVM_EXTERNAL_REGISTER_NODE_TYPE(TypeName)                                \
+  static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
+      ::tvm::GetTVMNodeFactoryRegistry()->__REGISTER__(TypeName::_type_key) \
+      .set_body([]() { return std::make_shared<TypeName>(); })
 }  // namespace tvm
 #endif  // TVM_BASE_H_
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index e623192258de..7301fd74117e 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -100,6 +100,6 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     *rv = ret;
   });
 
-TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode);
+TVM_EXTERNAL_REGISTER_NODE_TYPE(NDArrayWrapperNode);
 }  // namespace compiler
 }  // namespace nnvm
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index cc76f6a8f50b..37970e69e24f 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -41,4 +41,8 @@ TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
   });
+
+TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry() {
+  return ::dmlc::Registry<::tvm::NodeFactoryReg>::Get();
+}
 }  // namespace tvm

From 7ea06e6e4cb0d3045255c4c98928d69db32562d5 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 8 Aug 2018 22:53:51 +0530
Subject: [PATCH 03/77] [ONNX]onnx gather bug fix (#1543)

---
 nnvm/python/nnvm/frontend/onnx.py             |  10 +-
 .../python/frontend/onnx/test_forward.py      | 186 ++++++++++--------
 2 files changed, 104 insertions(+), 92 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index f4062c103fcf..fa26648b293a 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -489,15 +489,11 @@ def _impl_v1(cls, inputs, attr, params):
 class Gather(OnnxOpConverter):
     """ Operator converter for Gather.
     """
-
     @classmethod
     def _impl_v1(cls, inputs, attr, params):
-        axis = attr['axis']
-        indices = np.array(attr['indices'], dtype='int32')
-        name = 'gather_indices'
-        gather_indices = _sym.Variable(name=name, init=indices)
-        params[name] = indices
-        return _sym.take(inputs[0], gather_indices, axis=axis)
+        axis = attr.get('axis', 0)  # 'axis' may be absent; fall back to 0
+        return AttrCvt(op_name='take',
+                       extras={'axis':axis})(inputs, attr)  # all inputs are forwarded to 'take'
 
 class LRN(OnnxOpConverter):
     """ Operator converter for Local Response Normalization.
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index f4dc3559ee16..9fb3aed2da10 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -8,21 +8,50 @@
 from model_zoo import super_resolution, squeezenet1_1, lenet, resnet18_1_0
 from onnx import helper, TensorProto
 
-def get_tvm_output(model, x, target, ctx, out_shape, dtype='float32'):
-    new_sym, params = nnvm.frontend.from_onnx(model)
-    input_name = model.graph.input[0].name
-    shape_dict = {input_name: x.shape}
-    dtype_dict = {input_name: dtype}
-    graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params)
+def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtype='float32'):
+    """Build `graph_def` with NNVM, run it on `input_data` (array or list), return numpy output(s)."""
+
+    sym, params = nnvm.frontend.from_onnx(graph_def)
+    target = 'llvm'  # NOTE(review): overrides the caller-supplied target -- confirm this is intended
+    if isinstance(input_data, list):
+        input_names = {}  # position -> name of the graph input at that position
+        shape_dict = {}
+        dtype_dict = {}
+        for i, _ in enumerate(input_data):
+            input_names[i] = graph_def.graph.input[i].name
+            shape_dict[input_names[i]] = input_data[i].shape
+            dtype_dict[input_names[i]] = input_data[i].dtype
+    else:
+        input_names = graph_def.graph.input[0].name  # a single name (str) in this branch
+        shape_dict = {input_names: input_data.shape}
+        dtype_dict = {input_names: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+                                             dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)  # NOTE(review): overrides the caller-supplied ctx -- confirm this is intended
+    from tvm.contrib import graph_runtime
     m = graph_runtime.create(graph, lib, ctx)
     # set inputs
-    m.set_input(input_name, tvm.nd.array(x.astype(dtype)))
+    if isinstance(input_data, list):
+        for i, e in enumerate(input_names):
+            m.set_input(input_names[i], tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
+    else:
+        m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype)))
+
     m.set_input(**params)
+    # execute
     m.run()
     # get outputs
-    out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
-    return out.asnumpy()
-
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []  # one numpy array per entry of output_shape
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        return tvm_output.asnumpy()
 
 def get_caffe2_output(model, x, dtype='float32'):
     import caffe2.python.onnx.backend
@@ -70,13 +99,15 @@ def test_reshape():
 
     graph = helper.make_graph([ref_node, reshape_node],
                               "reshape_test",
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(ref_shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(ref_shape))])
 
     model = helper.make_model(graph, producer_name='reshape_test')
 
     for target, ctx in ctx_list():
-        x = np.random.uniform(size=in_shape)
+        x = np.random.uniform(size=in_shape).astype('int32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
     np.testing.assert_allclose(ref_shape, tvm_out.shape)
@@ -98,13 +129,15 @@ def test_reshape_like():
 
     graph = helper.make_graph([ref_node, copy_node, reshape_node],
                               "reshape_like_test",
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(ref_shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(ref_shape))])
 
     model = helper.make_model(graph, producer_name='reshape_like_test')
 
     for target, ctx in ctx_list():
-        x = np.random.uniform(size=in_shape)
+        x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32')
 
     np.testing.assert_allclose(ref_shape, tvm_out.shape)
@@ -122,31 +155,18 @@ def _test_power_iteration(x_shape, y_shape):
 
     graph = helper.make_graph([res],
                               'power_test',
-                              inputs = [helper.make_tensor_value_info("x", TensorProto.FLOAT, list(x_shape)),
-                                        helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(np_res.shape))])
+                              inputs = [helper.make_tensor_value_info("x",
+                                            TensorProto.FLOAT, list(x_shape)),
+                                        helper.make_tensor_value_info("y",
+                                            TensorProto.FLOAT, list(y_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(np_res.shape))])
 
     model = helper.make_model(graph, producer_name='power_test')
 
     for target, ctx in ctx_list():
-        new_sym, params = nnvm.frontend.from_onnx(model)
-
-        input_name = model.graph.input[0].name
-        input_name1 = model.graph.input[1].name
-        shape_dict = {input_name: x.shape, input_name1: y.shape}
-        dtype_dict = {input_name: x.dtype, input_name1: y.dtype}
-
-        graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params)
-        m = graph_runtime.create(graph, lib, ctx)
-        # set inputs
-        m.set_input(input_name, tvm.nd.array(x))
-        m.set_input(input_name1, tvm.nd.array(y))
-        m.set_input(**params)
-        m.run()
-        # get outputs
-        tvm_out = m.get_output(0, tvm.nd.empty(np_res.shape, np_res.dtype))
-
-        np.testing.assert_allclose(np_res, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
+        tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape)
+        np.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5)
 
 def test_power():
     _test_power_iteration((1, 3), (1))
@@ -160,13 +180,15 @@ def test_squeeze():
 
     graph = helper.make_graph([y],
                               'squeeze_test',
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(out_shape))])
 
     model = helper.make_model(graph, producer_name='squeeze_test')
 
     for target, ctx in ctx_list():
-        x = np.random.uniform(size=in_shape)
+        x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
     np.testing.assert_allclose(out_shape, tvm_out.shape)
@@ -179,44 +201,47 @@ def test_unsqueeze():
 
     graph = helper.make_graph([y],
                               'squeeze_test',
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(out_shape))])
 
     model = helper.make_model(graph, producer_name='squeeze_test')
 
     for target, ctx in ctx_list():
-        x = np.random.uniform(size=in_shape)
+        x = np.random.uniform(size=in_shape).astype('float32')
         tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32')
 
     np.testing.assert_allclose(out_shape, tvm_out.shape)
 
-def verify_gather(in_shape, indices, axis=0):
-    indices_src = np.array(indices, dtype="int32")
-
-    x = np.random.uniform(size=in_shape)
-    out_np = np.take(x, indices_src, axis=axis)
+def verify_gather(in_shape, indices, axis, dtype):
+    x = np.arange(np.prod(in_shape)).reshape(in_shape).astype(dtype)  # unique values per element
+    indices = np.array(indices, dtype="int32")
+    out_np = np.take(x, indices, axis=axis)  # numpy reference result
 
-    y = helper.make_node("Gather", ['in'], ['out'], indices=indices, axis=axis)
+    y = helper.make_node("Gather", ['in', 'indices'], ['out'], axis=axis)
 
     graph = helper.make_graph([y],
                               'gather_test',
                               inputs = [helper.make_tensor_value_info("in",
-                                  TensorProto.FLOAT, list(in_shape))],
+                                            TensorProto.FLOAT, list(in_shape)),
+                                        helper.make_tensor_value_info("indices",
+                                            TensorProto.INT32, list(indices.shape))],
                               outputs = [helper.make_tensor_value_info("out",
-                                  TensorProto.FLOAT, list(out_np.shape))])
-
+                                            TensorProto.FLOAT, list(out_np.shape))])
     model = helper.make_model(graph, producer_name='gather_test')
 
     for target, ctx in ctx_list():
-        tvm_out = get_tvm_output(model, x, target, ctx, out_np.shape, 'float32')
-
-    np.testing.assert_allclose(out_np, tvm_out)
+        tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape, dtype)
+        np.testing.assert_allclose(out_np, tvm_out)  # checked for every target, not just the last
 
 def test_gather():
-    verify_gather((4,), [1])
-    verify_gather((4,), [0, 1, 2, 3])
-    verify_gather((4, 2), [1], 1)
-    verify_gather((4, 3, 5, 6), [2, 1, 0, 0], -2)
+    verify_gather((4,), [1], 0, 'int32')
+    verify_gather((1,4), [0], 0, 'int32')
+    verify_gather((4,), [[[1,0],[0,1]]], 0, 'float32')
+    verify_gather((2,2), [[[1,0],[0,1]]], 1, 'int32')
+    verify_gather((3,3,3), [[[1,0]]], -1, 'int32')
+    verify_gather((4,3,5,6), [[2,1,0,0]], 0, 'float32')
 
 def _test_slice_iteration(indata, outdata, starts, ends, axes=None):
     if axes:
@@ -226,8 +251,10 @@ def _test_slice_iteration(indata, outdata, starts, ends, axes=None):
 
     graph = helper.make_graph([y],
                               'slice_test',
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(outdata.shape))])
 
     model = helper.make_model(graph, producer_name='slice_test')
 
@@ -251,8 +278,10 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs):
 
     graph = helper.make_graph([y],
                               opname+'_test',
-                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))])
+                              inputs = [helper.make_tensor_value_info("in",
+                                            TensorProto.FLOAT, list(indata.shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(outdata.shape))])
 
     model = helper.make_model(graph, producer_name=opname+'_test')
 
@@ -278,40 +307,27 @@ def test_clip():
 def test_matmul():
     a_shape = (4, 3)
     b_shape = (3, 4)
-    out_shape = (4, 4)
 
     a_array = np.random.uniform(size=a_shape).astype('float32')
     b_array = np.random.uniform(size=b_shape).astype('float32')
+    out_np = np.matmul(a_array, b_array)
 
     mul_node = helper.make_node("MatMul", ["a", "b"], ["out"])
 
     graph = helper.make_graph([mul_node],
                               "matmul_test",
-                              inputs = [helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)),
-                                        helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape))],
-                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+                              inputs = [helper.make_tensor_value_info("a",
+                                            TensorProto.FLOAT, list(a_shape)),
+                                        helper.make_tensor_value_info("b",
+                                            TensorProto.FLOAT, list(b_shape))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(out_np.shape))])
 
     model = helper.make_model(graph, producer_name='matmul_test')
 
     for target, ctx in ctx_list():
-        new_sym, params = nnvm.frontend.from_onnx(model)
-
-        input_name = model.graph.input[0].name
-        input_name1 = model.graph.input[1].name
-        shape_dict = {input_name: a_array.shape, input_name1: b_array.shape}
-        dtype_dict = {input_name: 'float32', input_name1: 'float32'}
-
-        graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params)
-        m = graph_runtime.create(graph, lib, ctx)
-        # set inputs
-        m.set_input(input_name, tvm.nd.array(a_array.astype('float32')))
-        m.set_input(input_name1, tvm.nd.array(b_array.astype('float32')))
-        m.set_input(**params)
-        m.run()
-        # get outputs
-        tvm_out = m.get_output(0, tvm.nd.empty(out_shape, 'float32'))
-
-        np.testing.assert_allclose(np.matmul(a_array, b_array), tvm_out.asnumpy(), rtol=1e-5, atol=1e-5)
+        tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape)
+        np.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5)
 
 def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None):
     in_array = np.random.uniform(size=shape).astype(dtype)

From c9f9a3f9be7db611d11b9a28476af62571af9581 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 8 Aug 2018 22:56:10 +0530
Subject: [PATCH 04/77] l2normalization operator support for tensorflow (#1528)

---
 nnvm/python/nnvm/frontend/tensorflow.py       | 23 ++++++++++-
 .../frontend/tensorflow/test_forward.py       | 39 ++++++++++++++++++-
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index ac47f9c3435f..d761e34c7c59 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -434,6 +434,21 @@ def _impl(inputs, attr, params):
         return AttrCvt(op_name='lrn')(new_inputs, attr_new)
     return _impl
 
+def _sum():
+    """Converter for TF `Sum`: reduce inputs[0] over the constant axes supplied as inputs[1]."""
+    def _impl(inputs, attr, params):
+        # The axes come in as a constant param: remove it from params and pass plain Python ints
+        # (a numpy array prints as '[a b]', presumably not a valid tuple attribute -- confirm).
+        axis = tuple(params.pop(inputs[1].list_output_names()[0]).asnumpy().reshape(-1).tolist())
+        return AttrCvt(op_name='sum', extras={'axis': axis}, transforms={'keep_dims':'keepdims'},
+                       ignores=['name', 'Tidx'])(inputs[0], attr)
+    return _impl
+
+def _square():  # converter factory for the 'Square' entry of the convert map
+    def _impl(inputs, attr, params):
+        return _sym.elemwise_mul(inputs[0], inputs[0])  # square(x) computed as x * x
+    return _impl
+
 def _gather_v2():
     "Tensorflow now support only gatherv2"
     def _impl(inputs, attr, params):
@@ -651,13 +666,17 @@ def _impl(inputs, attr, params):
     'Identity'                          : _identity(),
     'MatMul'                            : _matmul(),
     'MaxPool'                           : _pooling('max_pool'),
+    'Add'                               : _elemwise('add'),
+    'Sub'                               : _elemwise('sub'),
     'Mul'                               : _elemwise('mul'),
+    'Maximum'                           : _elemwise('max'),
+    'Minimum'                           : _elemwise('min'),
+    'Sum'                               : _sum(),
+    'Square'                            : _square(),
     'Relu'                              : AttrCvt('relu'),
     'Reshape'                           : _reshape(),
     'ResizeBilinear'                    : _resize_bilinear(),
     'Softmax'                           : AttrCvt('softmax', {'axis': ('axis', 1)}),
-    'Sub'                               : _elemwise('sub'),
-    'Add'                               : _elemwise('add'),
     'Rsqrt'                             : _rsqrt(),
     'Squeeze'                           : _squeeze(),
     'FusedBatchNorm'                    : _fused_batch_norm(),
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 96b472e8b381..495852f9e5d6 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -12,6 +12,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import graph_util
 from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import nn
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import math_ops
@@ -948,7 +949,6 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta):
                 sess,
                 sess.graph.as_graph_def(add_shapes=True),
                 ['lrn'],)
-
             tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0')
             tvm_output = run_tvm_graph(graph_def,
                                        inp_array,
@@ -959,6 +959,42 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta):
 def test_forward_lrn():
     _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
 
+#######################################################################
+# l2_normalize
+# ------------
+def _test_l2_normalize(ishape, eps, axis):
+    """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)"""
+
+    inp_array = np.random.uniform(size=ishape).astype(np.float32)
+    inp_array.fill(1)
+
+    with tf.Graph().as_default():
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder")
+        nn.l2_normalize(in1,
+                        axis=axis,
+                        epsilon=eps,
+                        name=None,
+                        dim=None)
+
+        with tf.Session() as sess:
+            graph_def = tf.graph_util.convert_variables_to_constants(
+                sess,
+                sess.graph.as_graph_def(add_shapes=True),
+                ['l2_normalize'])
+            # Fetch the l2_normalize output rather than echoing the placeholder.
+            tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'l2_normalize:0')
+            tvm_output = run_tvm_graph(graph_def,
+                                       inp_array,
+                                       "Placeholder",
+                                       tf_output.shape,
+                                       tf_output.dtype)
+
+            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
+            sess.close()
+def test_forward_l2_normalize():
+    _test_l2_normalize((1, 3, 20, 20), 0.001, (0,))
+
+#######################################################################
 # Main
 # ----
 if __name__ == '__main__':
@@ -981,3 +1017,4 @@ def test_forward_lrn():
     test_forward_gather()
     test_forward_ptb()
     test_forward_lrn()
+    test_forward_l2_normalize()

From 53d243114d3d35d27c7b5bd90fafccb4d8d9dda0 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Wed, 8 Aug 2018 13:51:47 -0700
Subject: [PATCH 05/77] Separate fusion and Compilation (#1564)

* Separate fusion and compilation

* fix description of graph_fuse.h

* fix lint

* fix @masahi 's comments, move fusion out of target

* fix graph passing and make fused_entries singular in graph attr

* fix typo

* fix some comments

* run test again

* remove rvalue for GraphFuse and GraphFindFusibleGroups
---
 nnvm/python/nnvm/compiler/build_module.py |   4 +-
 nnvm/src/compiler/graph_compile.cc        | 259 +++++++++++++++++
 nnvm/src/compiler/graph_fuse.cc           | 333 +++-------------------
 nnvm/src/compiler/graph_fuse.h            |  80 ++++++
 4 files changed, 388 insertions(+), 288 deletions(-)
 create mode 100644 nnvm/src/compiler/graph_compile.cc
 create mode 100644 nnvm/src/compiler/graph_fuse.h

diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index fd8599bcfa93..217598c9d79a 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -298,8 +298,10 @@ def build(graph, target=None, shape=None, dtype="float32",
         else:
             graph._set_json_attr("opt_level", 0, "int")
         graph = graph.apply("InferShape").apply("InferType")
+        graph = graph.apply("GraphFindFusibleGroups")
+        graph = graph.apply("GraphFuse")
         with target:
-            graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile")
+            graph = graph.apply("GraphCompile")
         libmod = graph_attr._move_out_module(graph, "module")
         # Write variable initial values into params
         if init_var:
diff --git a/nnvm/src/compiler/graph_compile.cc b/nnvm/src/compiler/graph_compile.cc
new file mode 100644
index 000000000000..e51730c09d66
--- /dev/null
+++ b/nnvm/src/compiler/graph_compile.cc
@@ -0,0 +1,259 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file graph_compile.cc
+ * \brief Compile a graph. It lowers the graph nodes into low level IR.
+ */
+
+#include <dmlc/parameter.h>
+#include <nnvm/compiler/packed_func_ext.h>
+#include <nnvm/graph.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/pass.h>
+#include <nnvm/pass_functions.h>
+#include <nnvm/tuple.h>
+#include <tvm/lowered_func.h>
+#include <tvm/runtime/packed_func.h>
+
+#include "compile_engine.h"
+#include "graph_fuse.h"
+#include "graph_runtime.h"
+#include "pattern_util.h"
+
+namespace nnvm {
+namespace compiler {
+
+using namespace tvm;
+
+// Decorate the result of PlanMemory by rewriting the "storage_id" attribute.
+// assign_flag is indexed by node id: 0 leaves the node alone, non-zero ties
+// its output 0 to inputs[0]'s storage, and 2 also ties inputs[1] to it.
+// - Give separate memory to each variable.
+// - Tie the memory of output/lhs in assign node so assign has a side effect.
+nnvm::Graph DecorateMemoryPlan(
+    nnvm::Graph g,
+    const std::vector<int>& assign_flag) {
+  const IndexedGraph& idx = g.indexed_graph();
+  StorageVector storage_vec = g.MoveCopyAttr<StorageVector>("storage_id");
+  g.attrs.erase("storage_allocated_bytes");
+  g.attrs.erase("storage_inplace_index");
+  size_t num_not_allocated = g.MoveCopyAttr<size_t>(
+      "storage_num_not_allocated");
+  CHECK_EQ(num_not_allocated, 0U)
+      << "Can only build inference graph with all statically allocated memory";
+
+  // Give every input node a fresh storage id above all ids already in use.
+  int max_id = 0;
+  for (size_t i = 0; i < storage_vec.size(); ++i) {
+    max_id = std::max(storage_vec[i] + 1, max_id);
+  }
+  for (uint32_t nid : idx.input_nodes()) {
+    storage_vec[idx.entry_id(nid, 0)] = max_id++;
+  }
+  // Tie up the assign node storage properly.
+  for (uint32_t nid = 0 ; nid < idx.num_nodes(); ++nid) {
+    if (assign_flag[nid] == 0) continue;
+    const auto& inode = idx[nid];
+    int var_storage_id = storage_vec[idx.entry_id(inode.inputs[0])];
+    storage_vec[idx.entry_id(nid, 0)] = var_storage_id;
+
+    if (assign_flag[nid] == 2) {
+      storage_vec[idx.entry_id(inode.inputs[1])] = var_storage_id;
+    }
+  }
+  g.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
+  return g;
+}
+
+// Lower each fused group with TVM and rebuild the graph out of tvm_op nodes.
+nnvm::Graph GraphCompile(const nnvm::Graph& g) {
+  // Get attributes from the graph.
+  const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
+  const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
+  const GroupVec& group_vec = g.GetAttr<GroupVec>("group_root");
+  const MasterVec& master_vec = g.GetAttr<MasterVec>("group_master");
+  const PatternVec& pattern_vec = g.GetAttr<PatternVec>("pattern");
+
+  CHECK(g.HasAttr("fused_entry")) << "Fusion hasn't been applied yet.";
+  FuseEntryVec fuse_entries = g.GetAttr<FuseEntryVec>("fused_entry");
+
+  std::string target = g.GetAttr<std::string>("target");
+  std::string target_host;
+
+  if (g.HasAttr("target_host")) {
+    target_host = g.GetAttr<std::string>("target_host");
+  }
+  // Specially handle assign.
+  const nnvm::Op* assign_op = nnvm::Op::Get("_assign");
+
+  // Start lowering.
+  Array<tvm::LoweredFunc> func_list;
+  std::unordered_set<const tvm::Node*> func_set;
+  const IndexedGraph& idx = g.indexed_graph();
+
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;
+    int root_id = group_vec[nid];
+    if (static_cast<int>(nid) != root_id) continue;
+    int master = master_vec[root_id];
+    FuseEntry& fe = fuse_entries[root_id];
+
+    const IndexedGraph& subidx = fe.subgraph.indexed_graph();
+    CHECK_EQ(subidx.input_nodes().size(), fe.imap.size());
+    CHECK_EQ(subidx.input_nodes().size(), fe.input_info.size());
+
+    Array<Tensor> inputs;
+    for (uint32_t sub_input_id : subidx.input_nodes()) {
+      auto it = fe.input_info.find(subidx[sub_input_id].source);
+      CHECK(it != fe.input_info.end());
+      inputs.push_back(it->second);
+    }
+    // Find master idx in the subgraph.
+    int sub_master_idx = 0;
+    for (uint32_t i = 0; i < subidx.num_nodes(); i++) {
+      if (subidx[i].source->op() == idx[master].source->op()) {
+        sub_master_idx = i;
+        break;
+      }
+    }
+    fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx);
+    for (LoweredFunc f : fe.compiled_func->funcs) {
+      if (!func_set.count(f.get())) {
+        func_set.insert(f.get());
+        func_list.push_back(f);
+      }
+    }
+  }
+
+  const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
+
+  std::unordered_map<uint32_t, nnvm::NodePtr> old_new;
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) {
+      // Only copy name since that is sufficient.
+      nnvm::NodePtr np = nnvm::Node::Create();
+      np->attrs.name = inode.source->attrs.name;
+      old_new[nid] = np;
+      continue;
+    }
+    int root_id = group_vec[nid];
+    if (static_cast<int>(nid) != root_id) continue;
+
+    // Handle normal op
+    FuseEntry& fe = fuse_entries[root_id];
+    const IndexedGraph& subidx = fe.subgraph.indexed_graph();
+    nnvm::NodePtr np = nnvm::Node::Create();
+    np->attrs.op = tvm_op;
+    np->attrs.name = inode.source->attrs.name;
+    TVMOpParam param;
+    param.func_name = fe.compiled_func->func_name;
+    param.num_inputs = static_cast<uint32_t>(fe.imap.size());
+    param.num_outputs = static_cast<uint32_t>(fe.subgraph.outputs.size());
+    param.flatten_data = fe.flatten_data;
+    param.UpdateDict(&(np->attrs.dict));
+    np->attrs.parsed = std::move(param);
+
+    for (uint32_t sub_input_id : subidx.input_nodes()) {
+      // Need to make sure subgraph input order is consistent to the order of
+      // the graph input.
+      auto rit = fe.reverse_imap.find(subidx[sub_input_id].source);
+      CHECK(rit != fe.reverse_imap.end());
+      const IndexedGraph::NodeEntry& e = rit->second;
+      auto it = old_new.find(e.node_id);
+      CHECK(it != old_new.end()) << "cannot find node_id=" << e.node_id;
+      np->inputs.emplace_back(
+          nnvm::NodeEntry{it->second, e.index, e.version});
+    }
+    for (const uint32_t node_id : inode.control_deps) {
+      auto it = old_new.find(node_id);
+      CHECK(it != old_new.end());
+      np->control_deps.emplace_back(it->second);
+    }
+    old_new[nid] = np;
+  }
+  nnvm::Graph ret;
+  for (const auto& e : idx.outputs()) {
+    auto it = old_new.find(group_vec[e.node_id]);
+    CHECK(it != old_new.end()) << "cannot find node_id=" << e.node_id;
+    ret.outputs.emplace_back(
+        nnvm::NodeEntry{it->second, e.index, e.version});
+  }
+
+  // Reference counter of each op node.
+  // For now, always store result when an op is referred more than once.
+  std::vector<uint32_t> ref_count = GetNodeRefCounts(idx);
+  for (const auto& e : idx.outputs()) {
+    // This line will realize all the outputs.
+    ref_count[e.node_id] += 1;
+  }
+
+  const IndexedGraph& new_idx = ret.indexed_graph();
+
+  // Handling assign:
+  //
+  //  assign is a special operator that mutates the variable.
+  //  Currently assign is implemented as output = copy(input[1])
+  //  Then we run DecorateMemoryPlan to force
+  //  output.storage = input[0].storage
+  //
+  std::vector<int> assign_flag(new_idx.num_nodes(), 0);
+  ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape());
+  DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries());
+  std::vector<std::string> new_dltype_vec(new_idx.num_node_entries());
+
+  for (const auto& kv : old_new) {
+    uint32_t nid = kv.first;
+    const auto& inode = idx[nid];
+    uint32_t new_nid = new_idx.node_id(kv.second.get());
+    if (inode.source->op() == assign_op) {
+      // Check if rhs of assign can be computed inplace.
+      // If yes, we can simply set that memory to be assign target
+      // and change assign to nop.
+      const IndexedGraph::NodeEntry& rhs = inode.inputs[1];
+      if (ref_count[rhs.node_id] <= 1 &&
+          !(idx[rhs.node_id].source->is_variable()) &&
+          pattern_vec[group_vec[rhs.node_id]] <= kBroadcast) {
+        assign_flag[new_nid] = 2;
+        TVMOpParam& param = dmlc::get<TVMOpParam>(kv.second->attrs.parsed);
+        param.func_name = "__nop";
+        param.UpdateDict(&(kv.second->attrs.dict));
+      } else {
+        assign_flag[new_nid] = 1;
+      }
+    }
+    for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
+      uint32_t new_eid = new_idx.entry_id(new_nid, i);
+      uint32_t old_eid = idx.entry_id(nid, i);
+      new_shape_vec[new_eid] = shape_vec[old_eid];
+      new_dtype_vec[new_eid] = dtype_vec[old_eid];
+      new_dltype_vec[new_eid] = tvm::runtime::TVMType2String(
+          GetDLType(dtype_vec[old_eid]));
+    }
+  }
+  ret.attrs["shape"] = std::make_shared<any>(std::move(new_shape_vec));
+  ret.attrs["dtype"] = std::make_shared<any>(std::move(new_dtype_vec));
+  ret.attrs["dltype"] = std::make_shared<any>(std::move(new_dltype_vec));
+
+  // Setup module
+  static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target");
+  tvm::runtime::Module module = fbuild(func_list, target, target_host);
+  ret.attrs["module"] = std::make_shared<any>(std::move(module));
+  ret = nnvm::ApplyPass(ret, "PlanMemory");
+  ret = DecorateMemoryPlan(ret, assign_flag);
+  return ret;
+}
+
+NNVM_REGISTER_PASS(GraphCompile)
+    .set_body(GraphCompile)
+    .depend_graph_attr("shape")
+    .depend_graph_attr("dtype")
+    .depend_graph_attr("fused_entry")
+    .depend_graph_attr("group_root")
+    .depend_graph_attr("pattern")
+    .depend_graph_attr("group_master");
+
+}  // namespace compiler
+}  // namespace nnvm
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 247224aa9bfe..52a8ae44f8ee 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -3,18 +3,19 @@
  * \file graph_fuse.cc
  * \brief Fuse the operators together.
  */
+#include <dmlc/parameter.h>
+#include <nnvm/compiler/packed_func_ext.h>
 #include <nnvm/graph.h>
+#include <nnvm/graph_attr_types.h>
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
-#include <nnvm/graph_attr_types.h>
-#include <nnvm/tuple.h>
 #include <nnvm/pass.h>
 #include <nnvm/pass_functions.h>
-#include <nnvm/compiler/packed_func_ext.h>
-#include <tvm/runtime/packed_func.h>
+#include <nnvm/tuple.h>
 #include <tvm/lowered_func.h>
-#include <dmlc/parameter.h>
-#include "./compile_engine.h"
+#include <tvm/runtime/packed_func.h>
+
+#include "./graph_fuse.h"
 #include "./graph_runtime.h"
 #include "./pattern_util.h"
 
@@ -22,28 +23,10 @@ namespace nnvm {
 namespace compiler {
 using namespace tvm;
 
-// The single fuse rule.
-enum class FuseRule {
-  kUknown,
-  kFuseToMaster,
-  kRealize
-};
-
-/*!
- * \brief Get DLDataType from dtype flag.
- *
- * \param type_flag The data type flag
- * \return corresponding DLDataType
- */
-DLDataType GetDLType(int type_flag) {
-  return Type2TVMType(GetTVMType(type_flag));
-}
-
 // Partition the graph into segments
 // Each segment will be compiled into one operator.
-// Need also mark the property of the segment.
-nnvm::Graph GraphFusePartition(nnvm::Graph g) {
-  // setup ref counter
+// Also mark the property of the segment.
+nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
   const IndexedGraph& idx = g.indexed_graph();
   int opt_level = 2;
   if (g.attrs.count("opt_level") != 0) {
@@ -61,7 +44,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
     ref_count[e.node_id] += 1;
   }
   // Pattern for the subgraph
-  std::vector<TOpPattern> pattern_vec(idx.num_nodes(),  kOpaque);
+  PatternVec pattern_vec(idx.num_nodes(),  kOpaque);
   // Whether node can be fused to parent.
   std::vector<FuseRule> fuse_vec(idx.num_nodes(), FuseRule::kUknown);
   // Master node id of fusion segment.
@@ -77,7 +60,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
     TOpPattern pt = op_pattern.get(inode.source->op(), kOpaque);
 
     if (pt <= kBroadcast) {
-      // Try to check if we can fuse to the master.
+      // Check if we can fuse to the master.
       int chosen_master = -1;
       bool ewise = inode.source->num_outputs() == 1;
       for (const auto& e : inode.inputs) {
@@ -108,7 +91,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
         pt = ewise ? kElemWise : kBroadcast;
       }
     } else if (pt == kInjective || pt == kCommReduce) {
-      // fuse to the comm reduce or injective
+      // Fuse to the comm reduce or injective
       for (const auto& e : inode.inputs) {
         if (fuse_vec[e.node_id] == FuseRule::kUknown) {
           TOpPattern ipt = pattern_vec[e.node_id];
@@ -123,7 +106,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
         master_vec[nid] = nid;
       }
     } else {
-      // realize
+      // Realize
       master_vec[nid] = nid;
       for (const auto& e : inode.inputs) {
         if (fuse_vec[e.node_id] == FuseRule::kUknown) {
@@ -144,15 +127,15 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
     }
   }
 
-  // point to the group root id of each node
-  std::vector<int> group_vec(idx.num_nodes(), -1);
+  // Point to the group root id of each node.
+  GroupVec group_vec(idx.num_nodes(), -1);
   for (uint32_t i = idx.num_nodes(); i != 0; --i) {
     uint32_t nid = i - 1;
     const auto& inode = idx[nid];
     if (group_vec[nid] == -1) {
       group_vec[nid] = nid;
     }
-    // propagate the group id.
+    // Propagate the group id.
     for (const auto& e : inode.inputs) {
       if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
         CHECK(group_vec[e.node_id] == -1||
@@ -264,121 +247,43 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) {
   return g;
 }
 
-
-NNVM_REGISTER_PASS(GraphFusePartition)
-.set_body(GraphFusePartition)
+NNVM_REGISTER_PASS(GraphFindFusibleGroups)
+.set_body(GraphFindFusibleGroups)
 .depend_graph_attr("shape")
 .depend_graph_attr("dtype");
 
-
-// Decorate the result of PlanMemory
-// This function does two things:
-// - Give separate memory to each variable
-// - Tie the memory of output/lhs in assign node properly
-//   so the execution of assign can have side effect.
-nnvm::Graph DecorateMemoryPlan(
-    nnvm::Graph g,
-    const std::vector<int>& assign_flag) {
-  // setup ref counter
-  const IndexedGraph& idx = g.indexed_graph();
-  StorageVector storage_vec = g.MoveCopyAttr<StorageVector>("storage_id");
-  g.attrs.erase("storage_allocated_bytes");
-  g.attrs.erase("storage_inplace_index");
-  size_t num_not_allocated = g.MoveCopyAttr<size_t>(
-      "storage_num_not_allocated");
-  CHECK_EQ(num_not_allocated, 0U)
-      << "Can only build inference graph with all statically allocated memory";
-
-  // reassign variable id so that they are different.
-  int max_id = 0;
-  for (size_t i = 0; i < storage_vec.size(); ++i) {
-    max_id = std::max(storage_vec[i] + 1, max_id);
-  }
-  for (uint32_t nid : idx.input_nodes()) {
-    storage_vec[idx.entry_id(nid, 0)] = max_id++;
-  }
-  // tie up the assign node storage properly
-  for (uint32_t nid = 0 ; nid < idx.num_nodes(); ++nid) {
-    if (assign_flag[nid] == 0) continue;
-    const auto& inode = idx[nid];
-    int var_storage_id = storage_vec[idx.entry_id(inode.inputs[0])];
-    storage_vec[idx.entry_id(nid, 0)] = var_storage_id;
-
-    if (assign_flag[nid] == 2) {
-      storage_vec[idx.entry_id(inode.inputs[1])] = var_storage_id;
-    }
-  }
-  g.attrs["storage_id"] = std::make_shared<any>(std::move(storage_vec));
-  return g;
-}
-
-struct INodeEntryHash {
-  size_t operator()(const IndexedGraph::NodeEntry& e) const {
-    return e.node_id;
-  }
-};
-
-struct INodeEntryEqual {
-  size_t operator()(const IndexedGraph::NodeEntry& a,
-                    const IndexedGraph::NodeEntry& b) const {
-    return a.node_id == b.node_id && a.index == b.index;
-  }
-};
-
-// Auxiliary data structure for representing fused op.
-struct FuseEntry {
-  // subgraph of the fragement
-  Graph subgraph;
-  // The input map
-  std::unordered_map<IndexedGraph::NodeEntry, nnvm::NodeEntry,
-                     INodeEntryHash, INodeEntryEqual> imap;
-  // reverse map to the old input entry
-  std::unordered_map<const Node*, IndexedGraph::NodeEntry> reverse_imap;
-  // TVM Placeholder for inputs
-  std::unordered_map<const Node*, Tensor> input_info;
-  // Whether we can flatten data
-  bool flatten_data;
-  // The corresponding function.
-  GraphFunc compiled_func;
-};
-
 // Fuse the partitioned graph into segments.
-// Create a new graph with fused noded.
-// Also inheritate attribute shape, dltype from previous graph.
-nnvm::Graph GraphFuseCompile(nnvm::Graph g) {
-  // setup ref counter
+// Create a new graph with fused nodes.
+// Also inherit attribute shape, dltype from the previous graph.
+nnvm::Graph GraphFuse(nnvm::Graph g) {
+  CHECK(g.HasAttr("group_root") && g.HasAttr("pattern"))
+      << "GraphFindFusibleGroups pass hasn't been applied yet.";
+
   const IndexedGraph& idx = g.indexed_graph();
   // Get attributes from the graph
   const ShapeVector& shape_vec = g.GetAttr<ShapeVector>("shape");
   const DTypeVector& dtype_vec = g.GetAttr<DTypeVector>("dtype");
-  const std::vector<int>& group_vec = g.GetAttr<std::vector<int> >("group_root");
-  const std::vector<int>& master_vec = g.GetAttr<std::vector<int> >("group_master");
-  const std::vector<TOpPattern>& pattern_vec =
-      g.GetAttr<std::vector<TOpPattern> >("pattern");
-  std::string target = g.GetAttr<std::string>("target");
-  std::string target_host;
+  const GroupVec& group_vec = g.GetAttr<GroupVec>("group_root");
+  const PatternVec& pattern_vec = g.GetAttr<PatternVec>("pattern");
 
-  if (g.HasAttr("target_host")) {
-    target_host = g.GetAttr<std::string>("target_host");
-  }
-  // specially handle assign
+  // Specially handle assign op.
   const nnvm::Op* assign_op = nnvm::Op::Get("_assign");
 
-  std::vector<FuseEntry> fuse_vec(idx.num_nodes());
-  // setup inputs and placeholder.
+  FuseEntryVec fuse_entries(idx.num_nodes());
+  // Setup inputs and placeholder.
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     const auto& inode = idx[nid];
     if (inode.source->is_variable()) continue;
     CHECK_GE(group_vec[nid], 0);
     int root_id = group_vec[nid];
-    FuseEntry& fe = fuse_vec[root_id];
+    FuseEntry& fe = fuse_entries[root_id];
     fe.flatten_data = (pattern_vec[root_id] == kElemWise ||
                        inode.source->op() == assign_op);
     for (const auto& e : inode.inputs) {
       if (group_vec[e.node_id] != root_id && fe.imap.count(e) == 0) {
         Array<Expr> shape;
         if (fe.flatten_data) {
-          // elementwise support flatten
+          // Elementwise support flatten
           int64_t prod = 1;
           for (int64_t x : shape_vec[idx.entry_id(e)]) {
             prod *= x;
@@ -403,17 +308,18 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) {
       }
     }
   }
+
   // Setup the Subgraph
   std::vector<NodeEntry> subgraph_vec(idx.num_node_entries());
   for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
     const auto& inode = idx[nid];
     if (inode.source->is_variable()) continue;
     int root_id = group_vec[nid];
-    FuseEntry& fe = fuse_vec[root_id];
-    // copy and create subgraph node.
+    FuseEntry& fe = fuse_entries[root_id];
+    // Create a subgraph node.
     NodePtr gnode = Node::Create();
     gnode->attrs = inode.source->attrs;
-    // input loading
+    // Set input entries for the subgraph node.
     for (const auto& e : inode.inputs) {
       if (group_vec[e.node_id] != root_id) {
         auto it = fe.imap.find(e);
@@ -426,7 +332,7 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) {
         gnode->inputs.push_back(ne);
       }
     }
-    // schedule on root node, and use master's schedule
+    // Schedule on the root node and use the master's schedule
     if (static_cast<int>(nid) != root_id) {
       for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
         uint32_t eid = idx.entry_id(nid, index);
@@ -438,165 +344,18 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) {
       }
     }
   }
-  // Start lowering
-  Array<tvm::LoweredFunc> func_list;
-  std::unordered_set<const tvm::Node*> func_set;
-
-  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
-    const auto& inode = idx[nid];
-    if (inode.source->is_variable()) continue;
-    int root_id = group_vec[nid];
-    if (static_cast<int>(nid) != root_id) continue;
-    int master = master_vec[root_id];
-    FuseEntry& fe = fuse_vec[root_id];
-
-    const IndexedGraph& subidx = fe.subgraph.indexed_graph();
-    CHECK_EQ(subidx.input_nodes().size(), fe.imap.size());
-    CHECK_EQ(subidx.input_nodes().size(), fe.input_info.size());
-
-    Array<Tensor> inputs;
-    for (uint32_t sub_input_id : subidx.input_nodes()) {
-      auto it = fe.input_info.find(subidx[sub_input_id].source);
-      inputs.push_back(it->second);
-    }
-    // find master idx in subgraph
-    int sub_master_idx = 0;
-    for (uint32_t i = 0; i < subidx.num_nodes(); i++) {
-      if (subidx[i].source->op() == idx[master].source->op()) {
-        sub_master_idx = i;
-        break;
-      }
-    }
-    fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx);
-    for (LoweredFunc f : fe.compiled_func->funcs) {
-      if (!func_set.count(f.get())) {
-        func_set.insert(f.get());
-        func_list.push_back(f);
-      }
-    }
-  }
-
-  const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
-
-  std::unordered_map<uint32_t, nnvm::NodePtr> old_new;
-  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
-    const auto& inode = idx[nid];
-    if (inode.source->is_variable()) {
-      // only copy over name since that is sufficient.
-      nnvm::NodePtr np = nnvm::Node::Create();
-      np->attrs.name = inode.source->attrs.name;
-      old_new[nid] = np;
-      continue;
-    }
-    int root_id = group_vec[nid];
-    if (static_cast<int>(nid) != root_id) continue;
-
-    // Handle normal op
-    FuseEntry& fe = fuse_vec[root_id];
-    const IndexedGraph& subidx = fe.subgraph.indexed_graph();
-    nnvm::NodePtr np = nnvm::Node::Create();
-    np->attrs.op = tvm_op;
-    np->attrs.name = inode.source->attrs.name;
-    TVMOpParam param;
-    param.func_name = fe.compiled_func->func_name;
-    param.num_inputs = static_cast<uint32_t>(fe.imap.size());
-    param.num_outputs = static_cast<uint32_t>(fe.subgraph.outputs.size());
-    param.flatten_data = fe.flatten_data;
-    param.UpdateDict(&(np->attrs.dict));
-    np->attrs.parsed = std::move(param);
-
-    for (uint32_t sub_input_id : subidx.input_nodes()) {
-      // Need to make sure subgraph input order meets order of the graph input
-      auto rit = fe.reverse_imap.find(subidx[sub_input_id].source);
-      CHECK(rit != fe.reverse_imap.end());
-      const IndexedGraph::NodeEntry& e = rit->second;
-      auto it = old_new.find(e.node_id);
-      CHECK(it != old_new.end())
-          << "cannot find node_id=" << e.node_id;
-      np->inputs.emplace_back(
-          nnvm::NodeEntry{it->second, e.index, e.version});
-    }
-    for (const uint32_t node_id : inode.control_deps) {
-      auto it = old_new.find(node_id);
-      CHECK(it != old_new.end());
-      np->control_deps.emplace_back(it->second);
-    }
-    old_new[nid] = np;
-  }
-  nnvm::Graph ret;
-  for (const auto& e : idx.outputs()) {
-    auto it = old_new.find(group_vec[e.node_id]);
-    CHECK(it != old_new.end())
-        << "cannot find node_id=" << e.node_id;
-    ret.outputs.emplace_back(
-        nnvm::NodeEntry{it->second, e.index, e.version});
-  }
-
-  // Reference counter of each op node
-  // For now, always store result when an op is referred more than once.
-  std::vector<uint32_t> ref_count = GetNodeRefCounts(idx);
-  for (const auto& e : idx.outputs()) {
-    // this line will realize all the outputs
-    ref_count[e.node_id] += 1;
-  }
-
-  const IndexedGraph& new_idx = ret.indexed_graph();
-
-  // Handling assign:
-  //
-  //  assign is a special operator that mutates the variable.
-  //  Currently assign is implemented as output = copy(input[1])
-  //  Then we run DecorageMemoryPlan to force
-  //  output.storage = input[0].storage
-  //
-  std::vector<int> assign_flag(new_idx.num_nodes(), 0);
-  ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape());
-  DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries());
-  std::vector<std::string> new_dltype_vec(new_idx.num_node_entries());
-
-  for (const auto& kv : old_new) {
-    uint32_t nid = kv.first;
-    const auto& inode = idx[nid];
-    uint32_t new_nid = new_idx.node_id(kv.second.get());
-    if (inode.source->op() == assign_op) {
-      // Check if rhs of assign can be comute inplace
-      // If yes, we can simply set that memory to be assign target
-      // and change assign to nop
-      const IndexedGraph::NodeEntry& rhs = inode.inputs[1];
-      if (ref_count[rhs.node_id] <= 1 &&
-          !(idx[rhs.node_id].source->is_variable()) &&
-          pattern_vec[group_vec[rhs.node_id]] <= kBroadcast) {
-        assign_flag[new_nid] = 2;
-        TVMOpParam& param = dmlc::get<TVMOpParam>(kv.second->attrs.parsed);
-        param.func_name = "__nop";
-        param.UpdateDict(&(kv.second->attrs.dict));
-      } else {
-        assign_flag[new_nid] = 1;
-      }
-    }
-    for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) {
-      uint32_t new_eid = new_idx.entry_id(new_idx.node_id(kv.second.get()), i);
-      uint32_t old_eid = idx.entry_id(nid, i);
-      new_shape_vec[new_eid] = shape_vec[old_eid];
-      new_dtype_vec[new_eid] = dtype_vec[old_eid];
-      new_dltype_vec[new_eid] = tvm::runtime::TVMType2String(
-          GetDLType(dtype_vec[old_eid]));
-    }
-  }
-  ret.attrs["shape"] = std::make_shared<any>(std::move(new_shape_vec));
-  ret.attrs["dtype"] = std::make_shared<any>(std::move(new_dtype_vec));
-  ret.attrs["dltype"] = std::make_shared<any>(std::move(new_dltype_vec));
-  // Setup module
-  static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target");
-  tvm::runtime::Module module = fbuild(func_list, target, target_host);
-  ret.attrs["module"] = std::make_shared<any>(std::move(module));
-  ret = nnvm::ApplyPass(ret, "PlanMemory");
-  ret = DecorateMemoryPlan(ret, assign_flag);
-  return ret;
+  g.attrs["fused_entry"] = std::make_shared<any>(std::move(fuse_entries));
+  return g;
 }
 
-NNVM_REGISTER_PASS(GraphFuseCompile)
-.set_body(GraphFuseCompile);
+NNVM_REGISTER_PASS(GraphFuse)
+    .set_body(GraphFuse)
+    .set_change_graph(true)
+    .provide_graph_attr("fused_entry")
+    .depend_graph_attr("shape")
+    .depend_graph_attr("dtype")
+    .depend_graph_attr("group_root")
+    .depend_graph_attr("group_master");
 
 }  // namespace compiler
 }  // namespace nnvm
diff --git a/nnvm/src/compiler/graph_fuse.h b/nnvm/src/compiler/graph_fuse.h
new file mode 100644
index 000000000000..6faac7d3e162
--- /dev/null
+++ b/nnvm/src/compiler/graph_fuse.h
@@ -0,0 +1,80 @@
+/*!
+ * Copyright (c) 2018 by Contributors
+ * \file graph_fuse.h
+ * \brief Definition of structs used by graph fusion
+*/
+#ifndef NNVM_COMPILER_GRAPH_FUSE_H_
+#define NNVM_COMPILER_GRAPH_FUSE_H_
+
+#include <nnvm/graph.h>
+#include <vector>
+
+#include "compile_engine.h"
+
+namespace nnvm {
+namespace compiler {
+
+// The single fuse rule.
+enum class FuseRule {
+  kUknown,
+  kFuseToMaster,
+  kRealize
+};
+
+/*!
+ * \brief Get DLDataType from dtype flag.
+ *
+ * \param type_flag The data type flag
+ * \return corresponding DLDataType
+ */
+inline DLDataType GetDLType(int type_flag) {
+  return tvm::Type2TVMType(GetTVMType(type_flag));
+}
+
+struct INodeEntryHash {
+  size_t operator()(const IndexedGraph::NodeEntry& e) const {
+    return e.node_id;
+  }
+};
+
+// Equality functor for IndexedGraph::NodeEntry keys: compares node_id and index.
+struct INodeEntryEqual {
+  bool operator()(const IndexedGraph::NodeEntry &a, const IndexedGraph::NodeEntry &b) const {
+    return a.node_id == b.node_id && a.index == b.index;
+  }
+};
+
+// Auxiliary data structure for representing fused op.
+struct FuseEntry {
+  // Subgraph of the fragment
+  Graph subgraph;
+  // The input map
+  std::unordered_map<IndexedGraph::NodeEntry, nnvm::NodeEntry, INodeEntryHash,
+                     INodeEntryEqual>
+      imap;
+  // Reverse map to the old input entry
+  std::unordered_map<const Node *, IndexedGraph::NodeEntry> reverse_imap;
+  // TVM Placeholder for inputs
+  std::unordered_map<const Node *, Tensor> input_info;
+  // Whether we can flatten data
+  bool flatten_data;
+  // The corresponding function.
+  GraphFunc compiled_func;
+};
+
+// GroupVec stores the root node ids of the fused nodes.
+using GroupVec = std::vector<int>;
+
+// MasterVec stores master node ids of fused groups.
+using MasterVec = std::vector<int>;
+
+// FuseVec stores fused entries.
+using FuseEntryVec = std::vector<FuseEntry>;
+
+// PatternVec stores operator patterns.
+using PatternVec = std::vector<TOpPattern>;
+
+}  // namespace compiler
+}  // namespace nnvm
+
+#endif  // NNVM_COMPILER_GRAPH_FUSE_H_

From 85483c3734ce243a97212b6399229e0fa09f4968 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 9 Aug 2018 11:38:09 +0900
Subject: [PATCH 06/77] [TOPI] add injective scheduler for HLS backends (#1553)

* [TOPI] add injective scheduler for HLS backends

* Introduced PrintBinaryExpr
---
 src/codegen/build_module.cc              |  2 ++
 src/codegen/codegen_vhls.cc              | 41 ++++++++++++++++++++++++
 src/codegen/codegen_vhls.h               |  2 ++
 src/codegen/intrin_rule_vhls.cc          | 15 +++++++++
 topi/python/topi/__init__.py             |  1 +
 topi/python/topi/hls/__init__.py         |  5 +++
 topi/python/topi/hls/injective.py        | 31 ++++++++++++++++++
 topi/tests/python/test_topi_broadcast.py |  2 ++
 topi/tests/python/test_topi_clip.py      |  2 +-
 topi/tests/python/test_topi_math.py      |  2 +-
 topi/tests/python/test_topi_relu.py      |  2 +-
 topi/tests/python/test_topi_transform.py | 18 +++++------
 12 files changed, 111 insertions(+), 12 deletions(-)
 create mode 100644 topi/python/topi/hls/__init__.py
 create mode 100644 topi/python/topi/hls/injective.py

diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 45d10617de05..0cb0ec3cc4be 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -91,9 +91,11 @@ Target CreateTarget(const std::string& target_name,
   } else if (target_name == "sdaccel") {
     t->device_type = kDLOpenCL;
     t->keys_array.push_back(ir::StringImm::make("sdaccel"));
+    t->keys_array.push_back(ir::StringImm::make("hls"));
   } else if (target_name == "aocl") {
     t->device_type = kDLAOCL;
     t->keys_array.push_back(ir::StringImm::make("aocl"));
+    t->keys_array.push_back(ir::StringImm::make("hls"));
   } else if (target_name == "opengl") {
     t->device_type = kOpenGL;
     t->keys_array.push_back(ir::StringImm::make("opengl"));
diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc
index 0795670bded6..b9f9f7505978 100644
--- a/src/codegen/codegen_vhls.cc
+++ b/src/codegen/codegen_vhls.cc
@@ -16,6 +16,7 @@ void CodeGenVivadoHLS::Init(bool output_ssa) {
   CodeGenC::Init(output_ssa);
 
   this->stream << "#include <ap_int.h>\n\n";
+  this->stream << "#include <algorithm>\n\n";
 }
 
 void CodeGenVivadoHLS::PrintType(Type t, std::ostream& os) {
@@ -67,6 +68,46 @@ void CodeGenVivadoHLS::PreFunctionBody(LoweredFunc f) {
   this->stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n\n";
 }
 
+template<typename T>
+inline void PrintBinaryExpr(const T* op,
+                            const char *opstr,
+                            std::ostream& os,  // NOLINT(*)
+                            CodeGenVivadoHLS* p) {
+  os << opstr << '(';
+  p->PrintExpr(op->a, os);
+  os << ", ";
+  p->PrintExpr(op->b, os);
+  os << ')';
+}
+
+// Min lowers to fminf/fmin for 32/64-bit floats and to std::min otherwise.
+void CodeGenVivadoHLS::VisitExpr_(const Min *op, std::ostream& os) {  // NOLINT(*)
+  const char *callee = "std::min";
+  if (op->type.is_float()) {
+    const int nbits = op->type.bits();
+    if (nbits == 32) {
+      callee = "fminf";
+    } else if (nbits == 64) {
+      callee = "fmin";
+    }
+  }
+  PrintBinaryExpr(op, callee, os, this);
+}
+
+// Max lowers to fmaxf/fmax for 32/64-bit floats and to std::max otherwise.
+void CodeGenVivadoHLS::VisitExpr_(const Max *op, std::ostream& os) {  // NOLINT(*)
+  const char *callee = "std::max";
+  if (op->type.is_float()) {
+    const int nbits = op->type.bits();
+    if (nbits == 32) {
+      callee = "fmaxf";
+    } else if (nbits == 64) {
+      callee = "fmax";
+    }
+  }
+  PrintBinaryExpr(op, callee, os, this);
+}
+
 
 runtime::Module BuildSDAccel(Array<LoweredFunc> funcs, std::string target_str) {
   using tvm::runtime::Registry;
diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h
index 02a0da2dcfa5..bcb7d6f49d8c 100644
--- a/src/codegen/codegen_vhls.h
+++ b/src/codegen/codegen_vhls.h
@@ -20,6 +20,8 @@ class CodeGenVivadoHLS final : public CodeGenC {
   void PrintType(Type t, std::ostream& os);
   void AddFunction(LoweredFunc f);
   void PreFunctionBody(LoweredFunc f);
+  void VisitExpr_(const Min *op, std::ostream& os);
+  void VisitExpr_(const Max *op, std::ostream& os);
 };
 
 }  // namespace codegen
diff --git a/src/codegen/intrin_rule_vhls.cc b/src/codegen/intrin_rule_vhls.cc
index 32869eec1db5..b360142cd985 100644
--- a/src/codegen/intrin_rule_vhls.cc
+++ b/src/codegen/intrin_rule_vhls.cc
@@ -9,6 +9,21 @@ namespace tvm {
 namespace codegen {
 namespace intrin {
 
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.round")
+.set_body(DispatchExtern<Direct>);
+
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.exp")
 .set_body(DispatchExtern<Direct>);
 
diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py
index 349f805cc7f2..3ef59913e07b 100644
--- a/topi/python/topi/__init__.py
+++ b/topi/python/topi/__init__.py
@@ -32,6 +32,7 @@
 from . import rocm
 from . import vision
 from . import image
+from . import hls
 # not import testing by default
 # because testing can have extra deps that are not necessary
 # we can import them from test cases explicitly
diff --git a/topi/python/topi/hls/__init__.py b/topi/python/topi/hls/__init__.py
new file mode 100644
index 000000000000..69b80514ff56
--- /dev/null
+++ b/topi/python/topi/hls/__init__.py
@@ -0,0 +1,5 @@
+# pylint: disable=redefined-builtin, wildcard-import
+"""HLS specific declaration and schedules."""
+from __future__ import absolute_import as _abs
+
+from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py
new file mode 100644
index 000000000000..21dfeb95df1b
--- /dev/null
+++ b/topi/python/topi/hls/injective.py
@@ -0,0 +1,31 @@
+# pylint: disable=invalid-name, unused-variable,
+"""Schedule for composition of injective operator"""
+import tvm
+from .. import generic
+
+@generic.schedule_injective.register(["hls"])
+def schedule_injective(outs):
+    """Schedule for injective op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of the injective op in the
+          format of an array of tensors (a single Tensor is also accepted).
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+    for out in outs:
+        fused = s[out].fuse(*s[out].op.axis)
+        px, x = s[out].split(fused, nparts=1)
+        s[out].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+schedule_elemwise = schedule_injective
+schedule_broadcast = schedule_injective
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py
index c5720050e538..f888033b3914 100644
--- a/topi/tests/python/test_topi_broadcast.py
+++ b/topi/tests/python/test_topi_broadcast.py
@@ -31,6 +31,7 @@ def check_device(device):
     check_device("metal")
     check_device("rocm")
     check_device("nvptx")
+    check_device("sdaccel")
 
 
 def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
@@ -87,6 +88,7 @@ def check_device(device):
     check_device("metal")
     check_device("rocm")
     check_device("nvptx")
+    check_device("sdaccel")
 
 def test_broadcast_to():
     verify_broadcast_to_ele((1,), (10,), topi.broadcast_to)
diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py
index 041565433bcc..ffc89aeb9bc3 100644
--- a/topi/tests/python/test_topi_clip.py
+++ b/topi/tests/python/test_topi_clip.py
@@ -34,7 +34,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['llvm', 'opencl']:
+    for device in ['llvm', 'opencl', 'sdaccel']:
         check_device(device)
 
 def test_clip():
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 4190c8e1d213..8d82dbe5bf82 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -39,7 +39,7 @@ def check_device(device):
             foo(a, b)
             np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
-        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx']:
+        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel']:
             check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 9feeabbf122f..7c75a9b08975 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -27,7 +27,7 @@ def check_device(device):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'sdaccel']:
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index b62f6b43c3de..4788d758cf45 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -22,7 +22,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 
@@ -45,7 +45,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 
@@ -68,7 +68,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 
@@ -96,7 +96,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 def verify_concatenate(shapes, axis):
@@ -121,7 +121,7 @@ def check_device(device):
         foo(*(data_nds + [out_nd]))
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 
@@ -146,7 +146,7 @@ def check_device(device):
         for out_nd, out_npy in zip(out_nds, out_npys):
             np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]:
+    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
         check_device(device)
 
 
@@ -204,7 +204,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "cuda", "opencl"]:
+    for device in ["llvm", "cuda", "opencl", "sdaccel"]:
         check_device(device)
 
 def verify_take(src_shape, indices_src, axis=None):
@@ -243,7 +243,7 @@ def check_device(device):
         foo(data_nd, indices_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device in ["llvm", "opencl"]:
+    for device in ["llvm", "opencl", "sdaccel"]:
         check_device(device)
 
 def verify_strided_slice(in_shape, begin, end, stride=None):
@@ -270,7 +270,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl"]:
+    for device in ["llvm", "opencl", "sdaccel"]:
         check_device(device)
 
 def test_strided_slice():

From 41d4dd6ef16445c4ea7c7835f139d9f30398f027 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 9 Aug 2018 11:40:28 +0800
Subject: [PATCH 07/77] Use int for int8x4 due to performance overhead of char4
 (#1569)

* Use int for int8x4 due to performance overhead of char4

* Add a comment about using int

* Remove invalid test
---
 src/codegen/codegen_cuda.cc                | 6 +++++-
 tests/python/unittest/test_codegen_cuda.py | 1 -
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 70ab807c9509..44c02830d0fc 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -90,7 +90,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) {  // NOLINT(*)
         if (t.lanes() == 4) {
           // directly 4 8 bit int in integer.
           enable_int8_ = true;
-          os << "char4"; return;
+
+          // We use int for int8x4 instead of char4 because using char4 is
+          // likely to produce extra instructions to pack four int8 elements
+          // into 32-bit data.
+          os << "int"; return;
         } else if (t.lanes() == 8) {
           enable_int8_ = true;
           os << "int2"; return;
diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py
index b0ed92a30281..0b54863d1aab 100644
--- a/tests/python/unittest/test_codegen_cuda.py
+++ b/tests/python/unittest/test_codegen_cuda.py
@@ -31,7 +31,6 @@ def check_cuda(dtype, n, lanes):
         
     check_cuda("float32", 64, 2)
     check_cuda("float16", 64, 2)
-    check_cuda("int8", 64, 4)
 
 
 def test_cuda_multiply_add():

From 6d4cf44855108f0bf0465e45182199a9a224328b Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Thu, 9 Aug 2018 23:36:03 +0530
Subject: [PATCH 08/77] [DOCS] Neural network Deployment Guide with System
 Module Mode #1523 (#1533)

---
 docs/deploy/nnvm.md | 57 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md
index aa6c39facd1f..e4ce14528b53 100644
--- a/docs/deploy/nnvm.md
+++ b/docs/deploy/nnvm.md
@@ -116,3 +116,60 @@ int main()
     return 0;
 }
 ```
+
+## Deploy as System Module
+C++ additionally supports deployment as a system module.
+This process needs a few additional options, given below, for the NNVM build.
+
+- For target llvm, append --system-lib as ```target=llvm --system-lib```
+- For a GPU build (or any non-llvm target), the additional option should be given to target_host as ```target_host=llvm --system-lib```
+
+Module export requires an additional option so that the library is saved without being compiled: ```lib.export_library (path, fcompile=False)```
+
+The output of the above API is a tar-compressed file containing an object file ```(lib.o)``` and a C++ source file ```(devc.cc)``` which embeds the device blob. These two files should be compiled along with the other files or objects while building the C++ application.
+Please refer to [Makefile](https://github.com/dmlc/tvm/tree/master/apps/howto_deploy/Makefile#L32) for a reference.
+
+The C++ code that loads this system module requires the change below.
+
+```cpp
+    // tvm module for compiled functions
+    tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))();
+```
+
+Depending on the build environment, the system object and the device blob source should be included in the final executable. An example with a bazel build is given below.
+```bash
+cc_library(
+    name = "host_module",
+    srcs = ["lib.o"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "device_module",
+    srcs = ["devc.cc"],
+    alwayslink=1
+)
+
+cc_library(
+    name = "tvm_runtime",
+    srcs = ["libtvm_runtime_pack.cc"],
+)
+
+cc_binary(
+    name = "bazel_deploy",
+    srcs = ["cpp_deploy.cc"],
+    deps = [
+        ":tvm_runtime", ":host_module", ":device_module"
+    ],
+    linkopts = [ "-lpthread -ldl" ]
+)
+
+```
+
+This build directive creates
+- new library ```host_module``` out of ```lib.o```
+- new library ```device_module``` out of ```devc.cc```
+
+These intermediate modules can be used as dependencies of the final deploy application.
+
+In bazel, ```alwayslink=1``` enforces embedding the entire library into the application (even if the application doesn't call any API from this module).

From 672147c8e183de76a7fe34fa2aeace7c8829cabd Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 9 Aug 2018 11:17:37 -0700
Subject: [PATCH 09/77] add conv2d transpose and fix bugs (#1566)

---
 nnvm/include/nnvm/top/nn.h                    |   5 +
 nnvm/python/nnvm/testing/dcgan.py             |  19 ++-
 nnvm/python/nnvm/top/nn.py                    |   6 +-
 nnvm/src/top/nn/convolution.cc                |   2 +-
 .../python/frontend/mxnet/model_zoo/dcgan.py  |  19 ++-
 python/tvm/autotvm/measure/measure_methods.py |   4 +-
 python/tvm/autotvm/task/dispatcher.py         |  31 +++-
 python/tvm/autotvm/task/nnvm_integration.py   |  13 +-
 python/tvm/autotvm/tophub.py                  |  18 ++-
 python/tvm/autotvm/tuner/callback.py          |   4 +-
 topi/python/topi/arm_cpu/__init__.py          |   1 +
 topi/python/topi/arm_cpu/conv2d.py            |  24 +--
 topi/python/topi/arm_cpu/conv2d_transpose.py  | 144 ++++++++++++++++++
 topi/python/topi/arm_cpu/depthwise_conv2d.py  |  17 ++-
 topi/python/topi/nn/conv2d_transpose.py       |   8 +-
 topi/tests/python/test_topi_conv2d.py         |   2 +-
 .../python/test_topi_conv2d_transpose_nchw.py |   2 +-
 tutorials/autotvm/tune_nnvm_arm.py            |  58 ++++---
 tutorials/nnvm_quick_start.py                 |   2 +-
 19 files changed, 299 insertions(+), 80 deletions(-)
 create mode 100644 topi/python/topi/arm_cpu/conv2d_transpose.py

diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
index c9baa116e8aa..865024733494 100644
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
   int groups;
   std::string layout;
   std::string kernel_layout;
+  int out_dtype;
   bool use_bias;
 
   DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) {
@@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter<Conv2DTransposeParam> {
       .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc."
                 "'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width"
                 "dimensions respectively.");
+    DMLC_DECLARE_DTYPE_FIELD(out_dtype)
+        .add_enum("same", -1)
+        .set_default(-1)
+        .describe("Output data type, set to explicit type under mixed precision setting");
     DMLC_DECLARE_FIELD(use_bias).set_default(true)
       .describe("Whether the layer uses a bias vector.");
   }
diff --git a/nnvm/python/nnvm/testing/dcgan.py b/nnvm/python/nnvm/testing/dcgan.py
index 421699ad4c14..4adc63bd1f2a 100644
--- a/nnvm/python/nnvm/testing/dcgan.py
+++ b/nnvm/python/nnvm/testing/dcgan.py
@@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
 
 def get_symbol(oshape, ngf=128, code=None):
     """get symbol of dcgan generator"""
-    assert oshape[-1] == 32, "Only support 32x32 image"
-    assert oshape[-2] == 32, "Only support 32x32 image"
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
 
     code = sym.Variable("data") if code is None else code
-    net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False)
+    net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False)
     net = sym.relu(net)
     # 4 x 4
-    net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
+    net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
     # 8 x 8
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
     # 16x16
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
     # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
     net = deconv2d(
-        net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
     net = sym.tanh(net)
     return net
 
 
-def get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"):
+def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"):
     """Get benchmark workload for a DCGAN generator
 
     Parameters
diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index f59424203402..f9a2c2813a04 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _):
     strides = attrs.get_int_tuple("strides")
     dilation = attrs.get_int_tuple("dilation")
     groups = attrs.get_int("groups")
+    out_dtype = attrs.get_string("out_dtype")
     layout = attrs["layout"]
+    out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
+
     assert layout == "NCHW", "only support nchw for now"
     assert dilation == (1, 1), "not support dilate now"
     assert groups == 1, "only support groups == 1 for now"
-    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding)
+
+    out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype)
     if attrs.get_bool("use_bias"):
         bias = inputs[2]
         bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 2843bea1f4ad..229d4ac30f78 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -556,7 +556,7 @@ v            (batch_size, channels, out_height, out_width) if `layout` is `NCHW`
 .set_attr<FGetAttrDict>("FGetAttrDict", ParamGetAttrDict<Conv2DTransposeParam>)
 .set_attr<FListInputNames>("FListInputNames", UseBiasListInputNames<Conv2DTransposeParam>)
 .set_attr<FInferShape>("FInferShape", Conv2DTransposeInferShape)
-.set_attr<FInferType>("FInferType", ElemwiseType<-1, 1>)
+.set_attr<FInferType>("FInferType", Conv2DInferType<Conv2DTransposeParam>)
 .set_attr<FCorrectLayout>("FCorrectLayout", Conv2DTransposeCorrectLayout)
 .set_num_outputs(1)
 .set_num_inputs(UseBiasNumInputs<Conv2DTransposeParam>)
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py
index 98133d369b13..8af030b6b184 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py
@@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs):
     net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu')
     return net
 
-def get_symbol(oshape=(3, 32, 32), ngf=128, code=None):
+def get_symbol(oshape=(3, 64, 64), ngf=128, code=None):
     """get symbol of dcgan generator"""
-    assert oshape[-1] == 32, "Only support 32x32 image"
-    assert oshape[-2] == 32, "Only support 32x32 image"
+    assert oshape[-1] == 64, "Only support 64x64 image"
+    assert oshape[-2] == 64, "Only support 64x64 image"
 
     code = mx.sym.Variable("data") if code is None else code
-    net = mx.sym.FullyConnected(code, name="g1", num_hidden=4*4*ngf*4, no_bias=True, flatten=False)
+    net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False)
     net = mx.sym.Activation(net, act_type='relu')
     # 4 x 4
-    net = mx.sym.reshape(net, shape=(-1, ngf * 4, 4, 4))
+    net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4))
     # 8 x 8
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2")
+        net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2")
     # 16x16
     net = deconv2d_bn_relu(
-        net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3")
+        net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3")
     # 32x32
+    net = deconv2d_bn_relu(
+        net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4")
+    # 64x64
     net = deconv2d(
-        net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv")
+        net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv")
     net = mx.sym.Activation(net, act_type='tanh')
     return net
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 30802dd8198e..6e95a6e435d0 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
                     msg = msg.split('\n')[-2].split(": ")[1]
                 except Exception:  # pylint: disable=broad-except
                     pass
-                raise InstantiationError(msg)
+                res_pack.append(MeasureResult((InstantiationError(msg),),
+                                              MeasureErrorNo.INSTANTIATION_ERROR,
+                                              tstamp - tic, tstamp))
             else:
                 res_pack.append(MeasureResult((RuntimeError(msg),),
                                               MeasureErrorNo.COMPILE_HOST,
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 2304b425f34b..93f6d584abfa 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -21,6 +21,8 @@
 
 from tvm import target as _target
 
+from .space import ConfigSpace
+
 logger = logging.getLogger('autotvm')
 
 class DispatchContext(object):
@@ -120,7 +122,12 @@ def dispatch_func(func, *args, **kwargs):
             raise RuntimeError("DispatchContext is not initialized")
         workload = func(*args, **kwargs)
         cfg = context.query(tgt, workload)
-        return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
+        if cfg.template_key:
+            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
+        else:
+            assert dispatch_dict, "No func registered for this dispatcher"
+            for v in dispatch_dict.values():
+                return v(cfg, *args, **kwargs)
 
     fdecorate = decorate(fworkload, dispatch_func)
     fdecorate.register = register
@@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext):
         Otherwise, it is an iterator.
     default: ConfigEntity, optional
         The default config to return when no history records
+    allow_fallback: bool
+        Whether to allow using a fallback configuration when no tuned
+        result can be found.
     """
-    def __init__(self, records, default=None):
+    def __init__(self, records, default=None, allow_fallback=False):
         super(ApplyHistoryBest, self).__init__()
 
         self.best_by_targetkey = {}
         self.best_by_model = {}
         self._default = default
+        self._allow_fallback = allow_fallback
+        self.fallback = {}
 
         if records:
             self.load(records)
@@ -244,5 +256,18 @@ def query(self, target, workload):
 
         if self._default:
             return self._default
+
+        if self._allow_fallback:
+            key = (target, workload)
+            if key in self.fallback:
+                return self.fallback[key]
+            logger.warning(
+                "Cannot find config for target=%s, workload=%s. A fallback configuration "
+                "is used, which may bring great performance regression.", target, workload)
+            cfg = ConfigSpace()
+            self.fallback[key] = cfg
+            return cfg
+
         raise RuntimeError(
-            "Cannot find config for target=%s, workload=%s" % (target, workload))
+            "Cannot find config for target=%s, workload=%s. You need to do tuning "
+            "for this workload to get the config." % (target, workload))
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index a16527f9cb01..338b46784a75 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -53,12 +53,14 @@ def __init__(self):
         import nnvm
 
         self.symbol2topi = {
-            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw]
+            nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
+            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
         }
 
         self.topi_to_task = {
             topi.nn.conv2d: "topi_nn_conv2d",
             topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
+            topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
         }
 
         self._register_dummy()
@@ -110,6 +112,15 @@ def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs):
             s = topi.generic.schedule_depthwise_conv2d_nchw([C])
             return s, [A, W, C]
 
+        @register("topi_nn_conv2d_transpose_nchw")
+        def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            A, W = args[:2]
+            C = topi.nn.conv2d_transpose_nchw(*args, **kwargs)
+            s = topi.generic.schedule_conv2d_transpose_nchw([C])
+            return s, [A, W, C]
+
     def reset(self):
         """Reset task collections"""
         self.task_collection = []
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 94ff011f4f28..e11bb7a4fc92 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -9,6 +9,7 @@
 import logging
 import os
 import json
+import sys
 
 from .task import ApplyHistoryBest
 from .. import target as _target
@@ -27,7 +28,7 @@ def _alias(name):
     return table.get(name, name)
 
 
-def context(target, extra_files=None):
+def context(target, extra_files=None, allow_fallback=False):
     """Return the dispatch context with pre-tuned parameters.
     The corresponding downloaded *.log files under tophub root path will be loaded.
     Users can also add their own files in argument `extra_files`.
@@ -38,9 +39,12 @@ def context(target, extra_files=None):
         The compilation target
     extra_files: list of str, optional
         Extra log files to load
+    allow_fallback: bool
+        Whether to allow a fallback configuration when no tuned
+        result is found.
     """
     rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-    best_context = ApplyHistoryBest([])
+    best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
 
     if isinstance(target, str):
         target = _target.create(target)
@@ -99,7 +103,15 @@ def check_package(backend):
 
     if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
         return
-    download_package(backend)
+
+    if sys.version_info >= (3,):
+        import urllib.request as urllib2
+    else:
+        import urllib2
+    try:
+        download_package(backend)
+    except urllib2.URLError:
+        logging.warning("Failed to download tophub package for %s", backend)
 
 
 def list_packages():
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index a777f9c7ceb8..15d5ac1c9689 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -118,8 +118,8 @@ def _callback(tuner, inputs, results):
             ctx.cur_flops = flops
             ctx.best_flops = tuner.best_flops
 
-            sys.stdout.write('%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
-                             '| %.2f s\r' %
+            sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+                             '| %.2f s' %
                              (prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total,
                               time.time() - tic))
             sys.stdout.flush()
diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py
index bb79769c1adc..8d78f67ac0b6 100644
--- a/topi/python/topi/arm_cpu/__init__.py
+++ b/topi/python/topi/arm_cpu/__init__.py
@@ -2,4 +2,5 @@
 
 from . import conv2d
 from . import depthwise_conv2d
+from . import conv2d_transpose
 from . import bitserial_conv2d
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index f5dbec8e552b..48bb4fb022c7 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs):
 
     def _callback(op):
         # schedule conv2d
-        if 'spatial_conv_output' in op.tag:
+        if 'spatial_conv2d_output' in op.tag:
             output = op.output(0)
             conv = op.input_tensors[0]
 
@@ -60,7 +60,7 @@ def _callback(op):
 
             _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
 
-        if 'winograd_conv_output' in op.tag:
+        if 'winograd_conv2d_output' in op.tag:
             output = op.output(0)
             _schedule_winograd(cfg, s, output, outs[0])
 
@@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     assert layout == "NCHW", "Only support NCHW"
     out_dtype = out_dtype or data.dtype
 
-    _, CI, IH, IW = get_const_tuple(data.shape)
+    N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_packed = False
         CO, _, KH, KW = get_const_tuple(kernel.shape)
@@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
         CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
         CO = CO * VC
 
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW))
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
     HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
 
-    N = 1
-    OH = (IH + pad_top + pad_down - KH) // HSTR + 1
+    OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1
     OW = (IW + pad_left + pad_right - KW) // WSTR + 1
-    data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right])
+    data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])
 
     # ==================== define configuration space ====================
     n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
@@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
 
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
-                         name='output_unpack', tag='spatial_conv_output',
+                         name='output_unpack', tag='spatial_conv2d_output',
                          attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
                                                                   layout, out_dtype)})
     return output
@@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
     if kernel_vec.op.name == 'kernel_vec':
         co, _, _, _, _ = s[kernel_vec].op.axis
         if autotvm.GLOBAL_SCOPE.in_tuning:
-            # kernel packing will be pre-computed during compliation, so we skip
+            # kernel packing will be pre-computed during compilation, so we skip
             # this part to make tuning records correct
             s[kernel_vec].pragma(co, 'debug_skip_region')
         else:
             s[kernel_vec].parallel(co)
+    elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose':  # for conv2d transpose
+        co, _, _, _, _ = s[kernel_vec].op.axis
+        s[kernel_vec].parallel(co)
 
     return s
 
@@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
     # unpack output
     output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                          Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
-                         name='output', tag='winograd_conv_output',
+                         name='output', tag='winograd_conv2d_output',
                          attrs={'workload': _winograd_conv_arg_to_workload(
                              data, kernel, strides, padding, layout, out_dtype, tile_size)})
 
@@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs):
     s = tvm.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if 'winograd_conv_output' in op.tag:
+        if 'winograd_conv2d_output' in op.tag:
             output = op.output(0)
             _schedule_winograd(cfg, s, output, outs[0])
 
diff --git a/topi/python/topi/arm_cpu/conv2d_transpose.py b/topi/python/topi/arm_cpu/conv2d_transpose.py
new file mode 100644
index 000000000000..99f199782ed2
--- /dev/null
+++ b/topi/python/topi/arm_cpu/conv2d_transpose.py
@@ -0,0 +1,144 @@
+# pylint: disable=invalid-name, unused-variable
+"""Transposed 2D convolution operators (sometimes called Deconvolution)."""
+from __future__ import absolute_import as _abs
+
+import tvm
+from tvm import autotvm
+
+from ..generic import schedule_conv2d_transpose_nchw
+from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple
+from ..util import get_const_tuple, traverse_inline
+from .conv2d import _schedule_spatial_pack
+
+@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct")
+def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype):
+    """Transposed 2D convolution nchw forward operator.
+
+    Parameters
+    ----------
+    Input : tvm.Tensor
+        4-D with shape [batch, in_channel, in_height, in_width]
+
+    Filter : tvm.Tensor
+        4-D with shape [in_channel, num_filter, filter_height, filter_width]
+
+    strides : tuple of two ints
+        The spatial stride along height and width
+
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    out_dtype: str
+        The output data type. This is used for mixed precision.
+
+    Returns
+    -------
+    Output : tvm.Tensor
+        4-D with shape [batch, out_channel, out_height, out_width]
+    """
+    return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2)
+
+def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
+    assert layout == "NCHW", "Only support NCHW"
+    out_dtype = out_dtype or data.dtype
+
+    N, CI, IH, IW = get_const_tuple(data.shape)
+    _, CO, KH, KW = get_const_tuple(kernel.shape)
+
+    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW))
+    bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom
+    bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right
+    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
+
+    OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH
+    OW = (IW - 1) * WSTR - pad_left - pad_right + KW
+
+    dilated_input = dilate(data, [1, 1, HSTR, WSTR])
+    data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right])
+
+    # ==================== define configuration space ====================
+    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
+    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)
+
+    if num_tile == 2:     # for arm cpu
+        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
+        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
+        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
+    elif num_tile == 3:   # for mali gpu
+        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
+        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
+        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
+    else:
+        raise RuntimeError("Invalid num_tile")
+
+    cfg.define_reorder("reorder_0",
+                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
+                       policy='candidate', candidate=[
+                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
+                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])
+
+    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
+    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
+    # ====================================================================
+
+    VC = cfg["tile_co"].size[-1]
+    VH = cfg["tile_oh"].size[-1]
+    VW = cfg["tile_ow"].size[-1]
+
+    dvshape = (N, OH // VH, OW // VW, CI, VH + KH-1, VW + KW-1)
+    kvshape = (CO // VC, CI, KH, KW, VC)
+    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
+    oshape = (N, CO, OH, OW)
+
+    data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw:
+                           data_pad[n][ci][h*VH + vh][w*VW + vw],
+                           name='data_vec')
+
+    kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc:
+                             kernel[ci][co*VC+vc][kh][kw],
+                             name='kernel_vec_conv2d_transpose')
+
+    ci = tvm.reduce_axis((0, CI), name='ci')
+    kh = tvm.reduce_axis((0, KH), name='kh')
+    kw = tvm.reduce_axis((0, KW), name='kw')
+
+    conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
+        tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) *
+                kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype),
+                axis=[ci, kh, kw]), name='conv')
+
+    output = tvm.compute(oshape, lambda n, co, h, w:
+                         conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
+                         name='output_unpack', tag='spatial_conv2d_transpose_output')
+    return output
+
+
+# register customized schedule for arm cpu.
+@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct")
+def schedule_conv2d_transpose_arm(cfg, outs):
+    """Schedule conv2d transpose for arm cpu"""
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if 'spatial_conv2d_transpose_output' in op.tag:
+            output = op.output(0)
+            conv = op.input_tensors[0]
+
+            data_vec = conv.op.input_tensors[0]
+            data_pad = data_vec.op.input_tensors[0]
+            dilated_input = data_pad.op.input_tensors[0]
+            s[data_pad].compute_inline()
+            s[dilated_input].compute_inline()
+
+            kernel_vec = conv.op.input_tensors[1]
+            if kernel_vec.op.name == 'kernel_vec':
+                kernel = kernel_vec.op.input_tensors[0]
+            else:
+                kernel = kernel_vec
+            if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+                s[kernel].compute_inline()
+
+            _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0])
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index c4dbeae38eeb..8aafc436319f 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -15,7 +15,16 @@
 # register customized schedule for arm cpu.
 @autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
 def schedule_depthwise_conv2d_nchw_(cfg, outs):
-    """Schedule depthwise conv2d"""
+    """Schedule depthwise conv2d
+
+    Parameters
+    ----------
+    cfg: ConfigEntity
+        The configuration of this template
+    outs: Array of Tensor
+        The computation graph description of depthwise convolution2d
+        in the format of an array of tensors.
+    """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
 
@@ -79,10 +88,8 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
 
         return s
 
-    scheduled_ops = []
-
     def _callback(op):
-        if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops:
+        if op.tag == 'depthwise_conv2d_nchw':
             output = op.output(0)
             kernel = op.input_tensors[1]
             data = op.input_tensors[0]
@@ -92,7 +99,5 @@ def _callback(op):
                 data = data_pad.op.input_tensors[0]
             _schedule(cfg, s, data, data_pad, kernel, output)
 
-        scheduled_ops.append(op)
-
     traverse_inline(s, outs[0].op, _callback)
     return s
diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py
index 270409ad099b..f9fdd3f1b2dc 100644
--- a/topi/python/topi/nn/conv2d_transpose.py
+++ b/topi/python/topi/nn/conv2d_transpose.py
@@ -10,7 +10,7 @@
 
 
 @tvm.target.generic_func
-def conv2d_transpose_nchw(Input, Filter, strides, padding):
+def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype):
     """Transposed 2D convolution nchw forward operator.
 
     Parameters
@@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
     padding : int or str
         Padding size, or ['VALID', 'SAME']
 
+    out_dtype : str
+        The output data type. This is used for mixed precision.
+
     Returns
     -------
     Output : tvm.Tensor
@@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding):
     Output = tvm.compute(
         (batch, out_c, out_h, out_w),
         lambda b, c, h, w: tvm.sum(
-            PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw],
+            PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) *
+            Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype),
             axis=[dc, dh, dw]), tag="conv2d_transpose_nchw")
 
     return Output
diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py
index 124c98c65c7a..365fdf551c4f 100644
--- a/topi/tests/python/test_topi_conv2d.py
+++ b/topi/tests/python/test_topi_conv2d.py
@@ -40,7 +40,7 @@ def get_ref_data():
     np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
 def test_conv2d():
-    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')):
+    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
         verify_conv2d(1, 56, 64, 64, 3, 1, 1)
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
index 674fe9fd8953..0c985400031a 100644
--- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
@@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
-    B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding)
+    B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
     C = topi.nn.relu(B)
 
     a_shape = get_const_tuple(A.shape)
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index 37779403fd0a..d11823f204e1 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -62,7 +62,7 @@
 
 def get_network(name, batch_size):
     """Get the symbol definition and random weight of a network"""
-    shape = {"data": (batch_size, 3, 224, 224)}
+    input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
     if name =='resnet-18':
@@ -90,7 +90,7 @@ def get_network(name, batch_size):
     else:
         raise ValueError("Unsupported network: " + name)
 
-    return net, params, shape, output_shape
+    return net, params, input_shape, output_shape
 
 #################################################################
 # Start RPC Tracker
@@ -226,8 +226,8 @@ def get_network(name, batch_size):
 def tune_tasks(tasks,
                measure_option,
                tuner='xgb',
-               n_trial=500,
-               early_stopping=200,
+               n_trial=1000,
+               early_stopping=None,
                log_filename='tuning.log',
                use_transfer_learning=True,
                try_winograd=True):
@@ -283,10 +283,10 @@ def tune_tasks(tasks,
 def tune_and_evaluate():
     # extract workloads from nnvm graph
     print("Extract tasks...")
-    net, params, shape, out_shape = get_network(network, batch_size=1)
-    tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype,
-                                            symbols=(nnvm.sym.conv2d,),
-                                            target=target)
+    net, params, input_shape, out_shape = get_network(network, batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
 
     # run tuning tasks
     print("Tuning...")
@@ -298,7 +298,7 @@ def tune_and_evaluate():
         with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
             graph, lib, params = nnvm.compiler.build(
                 net, target=target,
-                shape=shape, params=params, dtype=dtype)
+                shape={'data': input_shape}, params=params, dtype=dtype)
 
         # export library
         tmp = tempdir()
@@ -319,7 +319,7 @@ def tune_and_evaluate():
         # upload parameters to device
         ctx = remote.context(str(target), 0)
         rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
-        data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype))
+        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
         module = runtime.create(graph, rlib, ctx)
         module.set_input('data', data_tvm)
         module.set_input(**rparams)
@@ -341,35 +341,33 @@ def tune_and_evaluate():
 # -------------
 # The tuning needs to train xgboost models and use them for prediction.
 # So a high performance CPU is recommended.
-# It takes about 1.5 hour on a 32T AMD Ryzen CPU.
+# It takes about 2 hours on a 32T AMD Ryzen CPU.
 # One sample output is
 #
 # .. code-block:: bash
 #
 #    Extract tasks...
 #    Tuning...
-#    [Task  1/16]  Current/Best:   13.15/  20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done.
-#    [Task  2/16]  Current/Best:   16.66/  22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done.
-#    [Task  3/16]  Current/Best:   10.33/  14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done.
-#    [Task  4/16]  Current/Best:   13.29/  20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done.
-#    [Task  5/16]  Current/Best:   13.28/  15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done.
-#    [Task  6/16]  Current/Best:   20.16/  23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done.
-#    [Task  7/16]  Current/Best:    9.22/  22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done.
-#    [Task  8/16]  Current/Best:   14.12/  17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done.
-#    [Task  9/16]  Current/Best:   14.59/  24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done.
-#    [Task 10/16]  Current/Best:    9.86/  21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done.
-#    [Task 11/16]  Current/Best:    5.01/  18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done.
-#    [Task 12/16]  Current/Best:    8.61/  25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done.
-#    [Task 13/16]  Current/Best:   10.87/  25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done.
-#    [Task 14/16]  Current/Best:   15.33/  29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done.
-#    [Task 15/16]  Current/Best:   12.09/  38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done.
-#    [Task 16/16]  Current/Best:   16.77/  47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done.
+#    [Task  1/16]  Current/Best:   18.85/  19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
+#    [Task  2/16]  Current/Best:   16.10/  23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
+#    [Task  3/16]  Current/Best:    5.49/  13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
+#    [Task  4/16]  Current/Best:   10.07/  20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
+#    [Task  5/16]  Current/Best:   11.50/  15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
+#    [Task  6/16]  Current/Best:   10.76/  23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
+#    [Task  7/16]  Current/Best:   12.71/  22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
+#    [Task  8/16]  Current/Best:    8.60/  17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
+#    [Task  9/16]  Current/Best:   15.37/  23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
+#    [Task 10/16]  Current/Best:    6.62/  23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
+#    [Task 11/16]  Current/Best:    1.85/  21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
+#    [Task 12/16]  Current/Best:   15.41/  24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
+#    [Task 13/16]  Current/Best:   17.96/  25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
+#    [Task 14/16]  Current/Best:   14.81/  31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
+#    [Task 15/16]  Current/Best:   24.39/  40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
+#    [Task 16/16]  Current/Best:    9.42/  49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
 #    Compile...
 #    Upload...
 #    Evaluate inference time cost...
-#    Mean inference time (std dev): 156.51 ms (0.89 ms)
-#
-
+#    Mean inference time (std dev): 157.29 ms (1.74 ms)
 
 ######################################################################
 #
diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py
index c9f6c33591d0..c171823604cd 100644
--- a/tutorials/nnvm_quick_start.py
+++ b/tutorials/nnvm_quick_start.py
@@ -109,7 +109,7 @@
 # Save and Load Compiled Module
 # -----------------------------
 # We can also save the graph, lib and parameters into files and load them
-# back in development environment.
+# back in the deployment environment.
 
 ####################################################
 

From 764516a6b3375d611a1dfa7f8735ac3f185a2f18 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 9 Aug 2018 14:21:05 -0700
Subject: [PATCH 10/77] [TEST] force openblas threads to be 1 (#1580)

---
 tests/scripts/task_python_nnvm.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh
index 2fc41980fb3d..790073a2fe8b 100755
--- a/tests/scripts/task_python_nnvm.sh
+++ b/tests/scripts/task_python_nnvm.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 
 export PYTHONPATH=nnvm/python:python:topi/python
+# to avoid openblas threading error
+export OMP_NUM_THREADS=1
 
 echo "Running unittest..."
 python -m nose -v nnvm/tests/python/unittest || exit -1

From 2afe024809abd22a76ac8072fb0a0310df7360f2 Mon Sep 17 00:00:00 2001
From: Dayananda V <dayanandasiet@gmail.com>
Date: Fri, 10 Aug 2018 07:11:49 +0530
Subject: [PATCH 11/77] Vulkan TVM Android Support (#1571)

---
 apps/android_rpc/README.md                    | 25 +++--
 .../app/src/main/jni/Application.mk           | 16 ++--
 apps/android_rpc/tests/android_rpc_test.py    | 91 +++++++++++++------
 .../src/main/java/ml/dmlc/tvm/TVMContext.java | 15 +++
 .../main/java/ml/dmlc/tvm/rpc/RPCSession.java | 18 ++++
 python/tvm/rpc/client.py                      |  4 +
 web/tvm_runtime.js                            |  2 +
 7 files changed, 125 insertions(+), 46 deletions(-)

diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index 41d361c823ed..eef22f3c7010 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++
 python android_rpc_test.py
 ```
 
-This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results.
+This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. To verify the compiled shared libraries on the OpenCL target, set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25); to verify them on the Vulkan target, set [`'test_vulkan = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py). By default, only the CPU target is tested.
+On my test device, it gives the following results.
 
 ```bash
-TVM: Initializing cython mode...
-[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple'
-[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500
-[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64
-Run GPU test ...
-0.000155807 secs/op
 Run CPU test ...
-0.00139824 secs/op
+0.000962932 secs/op
+
+Run GPU(OpenCL Flavor) test ...
+0.000155807 secs/op
+
+[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER
+
+Stack trace returned 10 entries:
+[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3]
+.........
+
+You can still compile vulkan module but cannot run locally
+Run GPU(Vulkan Flavor) test ...
+0.000225198 secs/op
 ```
 
 You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk
index 5bf52bdaffc0..f142e2995777 100644
--- a/apps/android_rpc/app/src/main/jni/Application.mk
+++ b/apps/android_rpc/app/src/main/jni/Application.mk
@@ -1,9 +1,9 @@
 ifndef config
-	ifneq ("$(wildcard ./config.mk)","")
-	  config ?= config.mk
-	else
-	  config ?= make/config.mk
-	endif
+    ifneq ("$(wildcard ./config.mk)","")
+        config ?= config.mk
+    else
+        config ?= make/config.mk
+    endif
 endif
 
 include $(config)
@@ -16,10 +16,10 @@ APP_STL := c++_static
 
 APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti
 ifeq ($(USE_OPENCL), 1)
-	APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
+    APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1
 endif
 
 ifeq ($(USE_VULKAN), 1)
-	APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
-	APP_LDFLAGS += -lvulkan
+    APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1
+    APP_LDFLAGS += -lvulkan
 endif
diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py
index cfb04c1ca9a9..44618efd45c1 100644
--- a/apps/android_rpc/tests/android_rpc_test.py
+++ b/apps/android_rpc/tests/android_rpc_test.py
@@ -21,59 +21,92 @@
 arch = "arm64"
 target = "llvm -target=%s-linux-android" % arch
 
+# whether to run the test on the OpenCL target
+test_opencl = False
+# whether to run the test on the Vulkan target
+test_vulkan = False
+
 def test_rpc_module():
     # graph
     n = tvm.convert(1024)
     A = tvm.placeholder((n,), name='A')
     B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B')
+    a_np = np.random.uniform(size=1024).astype(A.dtype)
     temp = util.tempdir()
-    s = tvm.create_schedule(B.op)
-    xo, xi = s[B].split(B.op.axis[0], factor=64)
-    s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
-    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
-    # Build the dynamic lib.
-    # If we don't want to do metal and only use cpu, just set target to be target
-    f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
-    path_dso1 = temp.relpath("dev_lib2.so")
-    f.export_library(path_dso1, ndk.create_shared)
 
+    # Establish remote connection with target hardware
+    tracker = rpc.connect_tracker(tracker_host, tracker_port)
+    remote = tracker.request(key, priority=0,
+                             session_timeout=60)
+
+    # Compile the Graph for CPU target
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=64)
     s[B].parallel(xi)
     s[B].pragma(xo, "parallel_launch_point")
     s[B].pragma(xi, "parallel_barrier_when_finish")
     f = tvm.build(s, [A, B], target, name="myadd_cpu")
-    path_dso2 = temp.relpath("cpu_lib.so")
-    f.export_library(path_dso2, ndk.create_shared)
-
-    tracker = rpc.connect_tracker(tracker_host, tracker_port)
-    remote = tracker.request(key, priority=0,
-                             session_timeout=60)
+    path_dso_cpu = temp.relpath("cpu_lib.so")
+    f.export_library(path_dso_cpu, ndk.create_shared)
 
+    # Execute the portable graph on cpu target
     print('Run CPU test ...')
     ctx = remote.cpu(0)
-    remote.upload(path_dso2)
+    remote.upload(path_dso_cpu)
     f2 = remote.load_module("cpu_lib.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
     a = tvm.nd.array(a_np, ctx)
     b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
     time_f = f2.time_evaluator(f2.entry_name, ctx, number=10)
     cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
+    print('%g secs/op\n' % cost)
     np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    # Compile the Graph for OpenCL target
+    if test_opencl:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # Kernels are built for OpenCL; host code is built for `target` (target_host).
+        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
+        path_dso_cl = temp.relpath("dev_lib_cl.so")
+        f.export_library(path_dso_cl, ndk.create_shared)
+
+        print('Run GPU(OpenCL Flavor) test ...')
+        ctx = remote.cl(0)
+        remote.upload(path_dso_cl)
+        f1 = remote.load_module("dev_lib_cl.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
+
+    # Compile the Graph for Vulkan target
+    if test_vulkan:
+        s = tvm.create_schedule(B.op)
+        xo, xi = s[B].split(B.op.axis[0], factor=64)
+        s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
+        s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
+        # Build the dynamic lib.
+        # Kernels are built for Vulkan; host code is built for `target` (target_host).
+        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
+        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
+        f.export_library(path_dso_vulkan, ndk.create_shared)
+
+        print('Run GPU(Vulkan Flavor) test ...')
+        ctx = remote.vulkan(0)
+        remote.upload(path_dso_vulkan)
+        f1 = remote.load_module("dev_lib_vulkan.so")
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
+        cost = time_f(a, b).mean
+        print('%g secs/op\n' % cost)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    print('Run GPU test ...')
-    ctx = remote.cl(0)
-    remote.upload(path_dso1)
-    f1 = remote.load_module("dev_lib2.so")
-    a_np = np.random.uniform(size=1024).astype(A.dtype)
-    a = tvm.nd.array(a_np, ctx)
-    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
-    time_f = f1.time_evaluator(f1.entry_name, ctx, number=10)
-    cost = time_f(a, b).mean
-    print('%g secs/op' % cost)
-    np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
 if __name__ == "__main__":
     test_rpc_module()
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
index 0d108e0a2943..d9051f0d9d4d 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java
@@ -30,6 +30,7 @@ public class TVMContext {
     MASK2STR.put(1, "cpu");
     MASK2STR.put(2, "gpu");
     MASK2STR.put(4, "opencl");
+    MASK2STR.put(7, "vulkan");
     MASK2STR.put(8, "metal");
     MASK2STR.put(9, "vpi");
 
@@ -38,6 +39,7 @@ public class TVMContext {
     STR2MASK.put("cuda", 2);
     STR2MASK.put("cl", 4);
     STR2MASK.put("opencl", 4);
+    STR2MASK.put("vulkan", 7);
     STR2MASK.put("metal", 8);
     STR2MASK.put("vpi", 9);
   }
@@ -81,6 +83,19 @@ public static TVMContext opencl() {
     return opencl(0);
   }
 
+  /**
+   * Construct a Vulkan device.
+   * @param devId The device id
+   * @return The created context
+   */
+  public static TVMContext vulkan(int devId) {
+    return new TVMContext(7, devId);
+  }
+
+  public static TVMContext vulkan() {
+    return vulkan(0);
+  }
+
   /**
    * Construct a metal device.
    * @param devId The device id
diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
index 0eec9224a40c..8ebf188b0667 100644
--- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
+++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java
@@ -143,6 +143,24 @@ public TVMContext cl() {
     return cl(0);
   }
 
+  /**
+   * Construct remote Vulkan device.
+   * @param devId device id.
+   * @return Remote Vulkan context.
+   */
+  public TVMContext vulkan(int devId) {
+    return context(7, devId);
+  }
+
+  /**
+   * Construct remote Vulkan device.
+   * @return Remote Vulkan context.
+   */
+  public TVMContext vulkan() {
+    return vulkan(0);
+  }
+
+
   /**
    * Construct remote Metal device.
    * @param devId device id.
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index 57f368b0e660..ffbe6eeab6ee 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -130,6 +130,10 @@ def cl(self, dev_id=0):
         """Construct OpenCL device."""
         return self.context(4, dev_id)
 
+    def vulkan(self, dev_id=0):
+        """Construct Vulkan device."""
+        return self.context(7, dev_id)
+
     def metal(self, dev_id=0):
         """Construct Metal device."""
         return self.context(8, dev_id)
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index ef594e9433fb..786745d3ce88 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {};
       1 : "cpu",
       2 : "gpu",
       4 : "opencl",
+      7 : "vulkan",
       8 : "metal",
       9 : "vpi",
       11 : "opengl",
@@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {};
       "cuda": 2,
       "cl": 4,
       "opencl": 4,
+      "vulkan": 7,
       "metal": 8,
       "vpi": 9,
       "opengl": 11,

From 48fc410e0102e22ab9fdd5ec8879e10c6a3f1580 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 9 Aug 2018 18:55:48 -0700
Subject: [PATCH 12/77] [TEAM] merrymercy->code owner (#1581)

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6e3cf55b94b0..2d571ba668ea 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -18,6 +18,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Yuwei Hu](https://github.com/Huyuwei) TOPI
 - [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend
 - [Nick Hynes](https://github.com/nhynes) SGX and secured computing
+- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM
 
 ## Reviewers
 - [Masahiro Masuda](https://github.com/masahi)
@@ -27,7 +28,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Alex Weaver](https://github.com/alex-weaver)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
-- [Lianmin Zheng](https://github.com/merrymercy)
 
 ## List of Contributors
 - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors)

From 48ff777aa347036af5c22d25aa9096b61a9519ad Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 10 Aug 2018 11:45:09 -0700
Subject: [PATCH 13/77] [AUTOTVM] API change (#1583)

---
 python/tvm/autotvm/__init__.py                |  2 +-
 python/tvm/autotvm/measure/__init__.py        |  2 +-
 python/tvm/autotvm/measure/measure.py         | 10 ++++----
 python/tvm/autotvm/measure/measure_methods.py | 24 +++++++++----------
 python/tvm/autotvm/tuner/tuner.py             |  4 ++--
 tutorials/autotvm/tune_conv2d_cuda.py         |  2 +-
 tutorials/autotvm/tune_nnvm_arm.py            |  6 ++---
 7 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 20426be84aa1..5b312d93d288 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -22,7 +22,7 @@
 from . import tophub
 
 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
     ApplyHistoryBest as apply_history_best
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index b9bd3c37b01d..880dfd1ffe29 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -1,7 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""
 
 from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
-from .measure_methods import request_remote, check_remote, create_measure_batch, use_rpc
+from .measure_methods import request_remote, check_remote, create_measure_batch, rpc
 
 from .local_executor import LocalExecutor
 from .executor import Future, Executor
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 6a05e1a6a349..2325a970bc45 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -49,7 +49,7 @@ def measure_option(measure_func,
                    number=1,
                    repeat=1,
                    timeout=60,
-                   parallel_num=1,
+                   n_parallel=1,
                    do_fork=True,
                    build_func='default',
                    check_correctness=False,
@@ -63,7 +63,7 @@ def measure_option(measure_func,
         and a RPC server silently for the user.
 
         callable: It is a callable function for measurement.
-                  See the return value of measure/measure_methods.py::use_rpc for example.
+                  See the return value of measure/measure_methods.py::rpc for example.
     number : int, optional
         Number of times to do the measurement for average
     repeat : int, optional
@@ -74,7 +74,7 @@ def measure_option(measure_func,
     timeout: int, optional
         Timeout for a whole batch. TimeoutError will be returned as the result if a
         task timeouts.
-    parallel_num: int, optional
+    n_parallel: int, optional
         The number of measurement task that can run in parallel.
         Set this according to the number of cpu cores (for compilation) and
         the number of devices you have (for measuring generate code).
@@ -106,7 +106,7 @@ def measure_option(measure_func,
     and handle the logic of measurement.
 
     Signature:
-    * measure_func (see the return value of measure/measure_methods.py::use_rpc for example)
+    * measure_func (see the return value of measure/measure_methods.py::rpc for example)
     def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
         return measure_results
 
@@ -119,7 +119,7 @@ def build_func(inp, tmp_dir, **kwargs):
         'number': number,
         'repeat': repeat,
         'timeout': timeout,
-        'parallel_num': parallel_num,
+        'n_parallel': n_parallel,
         'do_fork': do_fork,
         'build_func': build_func,
         'check_correctness': check_correctness,
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 6e95a6e435d0..e192ee26ee3e 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -13,8 +13,8 @@
 
 import numpy as np
 
-from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \
-    target as _target
+from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \
+    target as _target, rpc as _rpc
 from ...contrib import nvcc, util, ndk
 
 from ..util import get_const_tuple
@@ -60,7 +60,7 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
         host = os.environ['TVM_TRACKER_HOST']
         port = int(os.environ['TVM_TRACKER_PORT'])
 
-    tracker = rpc.connect_tracker(host, port)
+    tracker = _rpc.connect_tracker(host, port)
     remote = tracker.request(device_key, priority=priority,
                              session_timeout=timeout)
     return remote
@@ -113,7 +113,7 @@ def create_measure_batch(task, option):
 
     measure_func = option['measure_func']
     number, repeat = option['number'], option['repeat']
-    timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork']
+    timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork']
     build_func = option['build_func']
     check_correctness = option['check_correctness']
     replay_db = option['replay_db']
@@ -134,7 +134,7 @@ def create_measure_batch(task, option):
                         use_popen=True, silent=True,
                         tracker_addr=(tracker.host, tracker.port))
 
-        measure_func = use_rpc(device_key, tracker.host, tracker.port)
+        measure_func = rpc(device_key, tracker.host, tracker.port)
         attach_objects = (server, tracker)
 
     build_kwargs = {}
@@ -218,18 +218,18 @@ def measure_batch(measure_inputs):
             return partial_results
         return results
 
-    measure_batch.parallel_num = parallel_num
+    measure_batch.n_parallel = n_parallel
     # attach server and tracker object to avoid them of being garbage-collected
     measure_batch.attach_objects = attach_objects
     return measure_batch
 
 
-def use_rpc(key,
-            host=None,
-            port=None,
-            priority=1,
-            session_timeout=60,
-            pack_size=1):
+def rpc(key,
+        host=None,
+        port=None,
+        priority=1,
+        session_timeout=60,
+        pack_size=1):
     """
     Create a standard measure_func which uses RPC Tracker for measurement.
     This measure_func will request a device from the RPC Tracker and
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 5d1fc1507e58..91004cba4603 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -85,7 +85,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             every measurement pair. See autotvm/tuner/callback.py for some examples.
         """
         measure_batch = create_measure_batch(self.task, measure_option)
-        parallel_num = getattr(measure_batch, 'parallel_num', 1)
+        n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
         old_level = logger.level
 
@@ -95,7 +95,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             if not self.has_next():
                 break
 
-            configs = self.next_batch(min(parallel_num, n_trial - i))
+            configs = self.next_batch(min(n_parallel, n_trial - i))
 
             inputs = [MeasureInput(self.task.target, self.task, config) for config in configs]
             results = measure_batch(inputs)
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 179ac811ab70..375d1a9b755e 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -168,7 +168,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 # run 8 parallel threads for compilation
 measure_option = autotvm.measure_option('local',
                                         number=5,
-                                        parallel_num=8,
+                                        n_parallel=8,
                                         timeout=20)
 
 # begin tuning, log records to file `conv2d.log`
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index d11823f204e1..f3d1c62bdaf2 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -191,9 +191,9 @@ def get_network(name, batch_size):
    'early_stopping': 250,
 
    'measure_option': autotvm.measure_option(
-       autotvm.use_rpc(device_key, host='localhost', port=9190),
+       autotvm.measure.rpc(device_key, host='localhost', port=9190),
        number=4,
-       parallel_num=1,
+       n_parallel=1,
        timeout=10,
        build_func='ndk' if use_android else 'default',
    ),
@@ -205,7 +205,7 @@ def get_network(name, batch_size):
 #
 #   In general, the default value provided here works well. It is the same
 #   value that we used to generate pre-tuned parameters.
-#   If you have multiple devices, you can set :code:`parallel_num` to
+#   If you have multiple devices, you can set :code:`n_parallel` to
 #   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
 #   boards to the tracker).
 #   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,

From b0368e338f7e1df63dfc5a697c8c9694e97d10a0 Mon Sep 17 00:00:00 2001
From: Hao Jin <haojin2@users.noreply.github.com>
Date: Fri, 10 Aug 2018 18:01:55 -0400
Subject: [PATCH 14/77] update dmlc-core for security reason (#1584)

---
 dmlc-core | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dmlc-core b/dmlc-core
index e864aa6757cd..4f0564ec7694 160000
--- a/dmlc-core
+++ b/dmlc-core
@@ -1 +1 @@
-Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3
+Subproject commit 4f0564ec769477c66d480dd966088f172050c874

From f52255b9b50457e8a6b747a6a2352d3b0d444236 Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Fri, 10 Aug 2018 15:02:10 -0700
Subject: [PATCH 15/77] DLPack Conversion API (#1573)

---
 include/tvm/runtime/c_runtime_api.h | 26 ++++++++++++
 include/tvm/runtime/ndarray.h       |  2 +-
 python/tvm/_ffi/ndarray.py          | 62 ++++++++++++++++++++++++++++-
 python/tvm/contrib/dlpack.py        | 43 ++++++++++++++++++++
 python/tvm/ndarray.py               |  2 +-
 src/runtime/ndarray.cc              | 36 +++++++++++++----
 tests/python/contrib/test_dlpack.py | 44 ++++++++++++++++++++
 7 files changed, 205 insertions(+), 10 deletions(-)
 create mode 100644 python/tvm/contrib/dlpack.py
 create mode 100644 tests/python/contrib/test_dlpack.py

diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 17d00bf479aa..dca0d5ed4a30 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -445,6 +445,32 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from,
                                TVMArrayHandle to,
                                TVMStreamHandle stream);
 
+/*!
+ * \brief Produce an array from the DLManagedTensor that shares data memory
+ * with the DLManagedTensor.
+ * \param from The source DLManagedTensor.
+ * \param out The output array handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayFromDLPack(DLManagedTensor* from,
+                               TVMArrayHandle* out);
+
+/*!
+ * \brief Produce a DLManagedTensor from the array that shares data memory with
+ * the array.
+ * \param from The source array.
+ * \param out The DLManagedTensor handle.
+ * \return 0 when success, -1 when failure happens
+ */
+TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from,
+                             DLManagedTensor** out);
+
+/*!
+ * \brief Delete (free) a DLManagedTensor's data.
+ * \param dltensor Pointer to the DLManagedTensor. 
+ */
+TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
+
 /*!
  * \brief Create a new runtime stream.
  *
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index 2b51b2e0fcfe..d3ecce8ba9d0 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -155,7 +155,7 @@ class NDArray {
    * that is DLPack compatible.
    *
    * The memory is retained until the NDArray went out of scope.
-   *
+   * \param tensor The DLPack tensor to create the view from.
    * \return The created NDArray view.
    */
   TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor);
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index 3788c07ac440..d994d7c2e4a5 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -5,7 +5,7 @@
 import sys
 import ctypes
 import numpy as np
-from .base import _LIB, check_call, c_array, string_types, _FFI_MODE
+from .base import _LIB, check_call, c_array, string_types, _FFI_MODE, c_str
 from .runtime_ctypes import TVMType, TVMContext, TVMArray, TVMArrayHandle
 from .runtime_ctypes import TypeCode, tvm_shape_index_t
 
@@ -28,6 +28,17 @@
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
 
 
+TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+_c_str_dltensor = c_str('dltensor')
+
+
+# used for PyCapsule manipulation
+if hasattr(ctypes, 'pythonapi'):
+    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
+
+
 def context(dev_type, dev_id=0):
     """Construct a TVM context with given device type and id.
 
@@ -62,6 +73,7 @@ def context(dev_type, dev_id=0):
         dev_type = TVMContext.STR2MASK[dev_type]
     return TVMContext(dev_type, dev_id)
 
+
 def numpyasarray(np_data):
     """Return a TVMArray representation of a numpy array.
     """
@@ -112,6 +124,42 @@ def empty(shape, dtype="float32", ctx=context(1, 0)):
         ctypes.byref(handle)))
     return _make_array(handle, False)
 
+
+def from_dlpack(dltensor):
+    """Produce an array from a DLPack tensor without memory copy.
+    Retrieves the underlying DLPack tensor's pointer to create an array from the
+    data. Removes the original DLPack tensor's destructor as now the array is
+    responsible for destruction.
+
+    Parameters
+    ----------
+    dltensor : DLPack tensor
+
+    Returns
+    -------
+    arr: tvm.nd.NDArray
+        The array view of the tensor data.
+    """
+    dltensor = ctypes.py_object(dltensor)
+    name = ctypes.pythonapi.PyCapsule_GetName(dltensor)
+    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name)
+    handle = TVMArrayHandle()
+    check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
+    ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None)
+    return _make_array(handle, False)
+
+
+def _dlpack_deleter(pycapsule):  # PyCapsule destructor for capsules made by to_dlpack
+    pycapsule = ctypes.py_object(pycapsule)
+    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        _LIB.TVMDLManagedTensorCallDeleter(ptr)
+        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, TVMPyCapsuleDestructor(0))
+
+
+_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+
+
 class NDArrayBase(_NDArrayBase):
     """A simple Device/CPU Array object in runtime."""
     @property
@@ -260,6 +308,18 @@ def copyto(self, target):
             raise ValueError("Unsupported target type %s" % str(type(target)))
         return target
 
+    def to_dlpack(self):
+        """Produce a DLPack tensor from the array without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        handle = ctypes.c_void_p()
+        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
+        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
+
+
 def free_extension_handle(handle, type_code):
     """Free c++ extension type handle
 
diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py
new file mode 100644
index 000000000000..11db29f98b3e
--- /dev/null
+++ b/python/tvm/contrib/dlpack.py
@@ -0,0 +1,43 @@
+"""Wrapping functions to bridge frameworks with DLPack support to TVM"""
+from .. import ndarray
+
+def convert_func(tvm_func, tensor_type, to_dlpack_func):
+    """Convert a tvm function into one that accepts a tensor from another
+       framework, provided the other framework supports DLPACK
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    tensor_type: Type
+        Type of the tensors of the target framework
+
+    to_dlpack_func: Function
+        Function to convert the source tensors to DLPACK
+    """
+    assert callable(tvm_func)
+
+    def _wrapper(*args):
+        args = tuple(ndarray.from_dlpack(to_dlpack_func(arg))\
+            if isinstance(arg, tensor_type) else arg for arg in args)
+        return tvm_func(*args)
+
+    return _wrapper
+
+def to_pytorch_func(tvm_func):
+    """Convert a tvm function into one that accepts PyTorch tensors
+
+    Parameters
+    ----------
+    tvm_func: Function
+        Built tvm function operating on arrays
+
+    Returns
+    -------
+    wrapped_func: Function
+        Wrapped tvm function that operates on PyTorch tensors
+    """
+    import torch
+    import torch.utils.dlpack
+    return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack)
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index 18e958973d94..448e5f6d8bdb 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -8,7 +8,7 @@
 import numpy as _np
 
 from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase
-from ._ffi.ndarray import context, empty
+from ._ffi.ndarray import context, empty, from_dlpack
 from ._ffi.ndarray import _set_class_ndarray
 from ._ffi.ndarray import register_extension, free_extension_handle
 
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index f862f32f6e99..424a2b09cb15 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -93,6 +93,16 @@ struct NDArray::Internal {
     arr.data_ = nullptr;
     return tensor;
   }
+  // Container to DLManagedTensor
+  static DLManagedTensor* ToDLPack(NDArray::Container* from) {
+    CHECK(from != nullptr);
+    DLManagedTensor* ret = new DLManagedTensor();
+    ret->dl_tensor = from->dl_tensor;
+    ret->manager_ctx = from;
+    from->IncRef();
+    ret->deleter = NDArrayDLPackDeleter;
+    return ret;
+  }
 };
 
 NDArray NDArray::CreateView(std::vector<int64_t> shape,
@@ -115,13 +125,7 @@ NDArray NDArray::CreateView(std::vector<int64_t> shape,
 }
 
 DLManagedTensor* NDArray::ToDLPack() const {
-  CHECK(data_ != nullptr);
-  DLManagedTensor* ret = new DLManagedTensor();
-  ret->dl_tensor = data_->dl_tensor;
-  ret->manager_ctx = const_cast<NDArray*>(this);
-  data_->IncRef();
-  ret->deleter = NDArrayDLPackDeleter;
-  return ret;
+  return Internal::ToDLPack(data_);
 }
 
 NDArray NDArray::Empty(std::vector<int64_t> shape,
@@ -213,6 +217,24 @@ int TVMArrayCopyFromTo(TVMArrayHandle from,
   API_END();
 }
 
+int TVMArrayFromDLPack(DLManagedTensor* from,
+                       TVMArrayHandle* out) {
+  API_BEGIN();
+  *out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from));
+  API_END();
+}
+
+int TVMArrayToDLPack(TVMArrayHandle from,
+                     DLManagedTensor** out) {
+  API_BEGIN();
+  *out = NDArray::Internal::ToDLPack(reinterpret_cast<NDArray::Container*>(from));
+  API_END();
+}
+
+void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) {
+  (*(dltensor->deleter))(dltensor);
+}
+
 int TVMArrayCopyFromBytes(TVMArrayHandle handle,
                           void* data,
                           size_t nbytes) {
diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py
new file mode 100644
index 000000000000..9a8ea34e69d5
--- /dev/null
+++ b/tests/python/contrib/test_dlpack.py
@@ -0,0 +1,44 @@
+import tvm
+import numpy as np
+from tvm.contrib.dlpack import to_pytorch_func
+
+def test():
+    a = np.random.randn(1337)
+    tvm_a = tvm.nd.array(a)
+    np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).asnumpy(), a)
+
+    try:
+        import torch
+        import torch.utils.dlpack
+
+        x = torch.rand(56, 56)
+        tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x))
+        np.testing.assert_equal(x.numpy(), tvm_x.asnumpy())
+        y = tvm.nd.from_dlpack(tvm_x.to_dlpack())
+        np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy())
+        np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), tvm_x.asnumpy())
+
+        n = tvm.convert(137)
+        xx = torch.rand(137,137)
+        yy = torch.rand(137,137)
+        zz2 = torch.empty(137,137)
+        zz = xx.mm(yy)
+        XX = tvm.placeholder((n,n), name='X')
+        YY = tvm.placeholder((n,n), name='Y')
+
+        k = tvm.reduce_axis((0, n), name='k')
+        ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k))
+        s = tvm.create_schedule(ZZ.op)
+        f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f')
+
+        f_pytorch = to_pytorch_func(f)
+        zz2 = torch.empty(137,137)
+        f_pytorch(xx, yy, zz2)
+        np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6)
+
+    except ImportError:
+        pass
+
+
+if __name__ ==  '__main__':
+    test()

From edda6cc1106798d42eeeac0a56bab3853bba413e Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Fri, 10 Aug 2018 19:04:46 -0700
Subject: [PATCH 16/77] use phone EditText for numerical fields (#1587)

---
 apps/android_rpc/app/src/main/res/layout/content_main.xml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 0f2564833ecd..82be44d98451 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -20,6 +20,7 @@
             android:hint="@string/input_address"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 
@@ -37,6 +38,7 @@
             android:minWidth="100dip"
             android:layout_width="wrap_content"
             android:layout_height="wrap_content"
+            android:inputType="phone"
             android:background="@android:drawable/editbox_background"/>
     </LinearLayout>
 

From ec3f09b30ec1c2203a119b5fccfeb48166fcc7b3 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 11 Aug 2018 09:15:05 -0700
Subject: [PATCH 17/77] [RUNTIME] Refactor to enable stackvm in runtime.
 (#1588)

---
 CMakeLists.txt                                |  12 +-
 Jenkinsfile                                   |   1 +
 cmake/config.cmake                            |   3 +
 include/tvm/ir.h                              |  20 +--
 include/tvm/runtime/util.h                    |  28 +++-
 python/tvm/module.py                          |   9 +-
 src/codegen/codegen.cc                        |   1 -
 src/codegen/stack_vm/stack_vm_module.cc       |  71 ----------
 .../codegen_stackvm.cc}                       |  23 +++-
 .../codegen_stackvm.h}                        |  10 +-
 .../stackvm/stackvm.cc}                       |  71 +++++++---
 .../stack_vm.h => runtime/stackvm/stackvm.h}  |  55 +++++---
 src/runtime/stackvm/stackvm_module.cc         | 128 ++++++++++++++++++
 src/runtime/stackvm/stackvm_module.h          |  27 ++++
 tests/python/unittest/test_module_load.py     |  22 ++-
 15 files changed, 337 insertions(+), 144 deletions(-)
 delete mode 100644 src/codegen/stack_vm/stack_vm_module.cc
 rename src/codegen/{stack_vm/codegen_stack_vm.cc => stackvm/codegen_stackvm.cc} (95%)
 rename src/codegen/{stack_vm/codegen_stack_vm.h => stackvm/codegen_stackvm.h} (95%)
 rename src/{codegen/stack_vm/stack_vm.cc => runtime/stackvm/stackvm.cc} (90%)
 rename src/{codegen/stack_vm/stack_vm.h => runtime/stackvm/stackvm.h} (89%)
 create mode 100644 src/runtime/stackvm/stackvm_module.cc
 create mode 100644 src/runtime/stackvm/stackvm_module.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 39776d53d1f1..572f4aef1432 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ tvm_option(USE_ROCM "Build with ROCM" OFF)
 tvm_option(ROCM_PATH "The path to rocm" /opt/rocm)
 tvm_option(USE_RPC "Build with RPC" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
+tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
 tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
@@ -97,7 +98,6 @@ file(GLOB COMPILER_SRCS
     src/arithmetic/*.cc
     src/autotvm/*.cc
     src/codegen/*.cc
-    src/codegen/stack_vm/*.cc
     src/lang/*.cc
     src/pass/*.cc
     src/op/*.cc
@@ -135,6 +135,16 @@ if(USE_RPC)
   list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS})
 endif(USE_RPC)
 
+file(GLOB STACKVM_RUNTIME_SRCS src/runtime/stackvm/*.cc)
+file(GLOB STACKVM_CODEGEN_SRCS src/codegen/stackvm/*.cc)
+list(APPEND COMPILER_SRCS ${STACKVM_CODEGEN_SRCS})
+if(USE_STACKVM_RUNTIME)
+  message(STATUS "Build with stackvm support in runtime...")
+  list(APPEND RUNTIME_SRCS ${STACKVM_RUNTIME_SRCS})
+else()
+  list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS})
+endif(USE_STACKVM_RUNTIME)
+
 if(USE_GRAPH_RUNTIME)
   message(STATUS "Build with Graph runtime support...")
   file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
diff --git a/Jenkinsfile b/Jenkinsfile
index bec0d2be5df8..2ecf3c59f8aa 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -96,6 +96,7 @@ stage('Build') {
            echo set\\(USE_RPC ON\\) >> config.cmake
            echo set\\(USE_SORT ON\\) >> config.cmake
            echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake
+           echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake
            echo set\\(USE_BLAS openblas\\) >> config.cmake
            echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake
            echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 85c5102169a9..c364a88cce11 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -65,6 +65,9 @@ set(USE_OPENGL OFF)
 # Whether enable RPC runtime
 set(USE_RPC ON)
 
+# Whether embed stackvm into the runtime
+set(USE_STACKVM_RUNTIME OFF)
+
 # Whether enable tiny embedded graph runtime.
 set(USE_GRAPH_RUNTIME ON)
 
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 9ea16131188d..646824332902 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -12,6 +12,7 @@
 #include <string>
 #include "./base.h"
 #include "./expr.h"
+#include "./runtime/util.h"
 
 namespace tvm {
 namespace ir {
@@ -449,25 +450,6 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit";
  */
 constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce";
 
-/*! \brief The kind of structure field info */
-enum TVMStructFieldKind : int {
-  // array head address
-  kArrAddr,
-  kArrData,
-  kArrShape,
-  kArrStrides,
-  kArrNDim,
-  kArrTypeCode,
-  kArrTypeBits,
-  kArrTypeLanes,
-  kArrByteOffset,
-  kArrDeviceId,
-  kArrDeviceType,
-  kArrKindBound_,
-  // TVMValue field
-  kTVMValueContent,
-  kTVMValueKindBound_
-};
 }   // namespace intrinsic
 
 // Reuse IR node defintiion from HalideIR
diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 160642ffcc85..7fa62be912be 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -21,7 +21,33 @@ namespace runtime {
 inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) {
   return t.code == code && t.bits == bits && t.lanes == lanes;
 }
-
 }  // namespace runtime
 }  // namespace tvm
+// Forward declare the intrinsic id we need
+// in structure fetch to enable stackvm in runtime
+namespace tvm {
+namespace ir {
+namespace intrinsic {
+/*! \brief The kind of structure field info used in intrinsic */
+enum TVMStructFieldKind : int {
+  // array head address
+  kArrAddr,
+  kArrData,
+  kArrShape,
+  kArrStrides,
+  kArrNDim,
+  kArrTypeCode,
+  kArrTypeBits,
+  kArrTypeLanes,
+  kArrByteOffset,
+  kArrDeviceId,
+  kArrDeviceType,
+  kArrKindBound_,
+  // TVMValue field
+  kTVMValueContent,
+  kTVMValueKindBound_
+};
+}  // namespace intrinsic
+}  // namespace ir
+}  // namespace tvm
 #endif  // TVM_RUNTIME_UTIL_H_
diff --git a/python/tvm/module.py b/python/tvm/module.py
index 1b83c9b26243..6cca6fb0f722 100644
--- a/python/tvm/module.py
+++ b/python/tvm/module.py
@@ -90,9 +90,12 @@ def export_library(self,
         kwargs : dict, optiona;
             Additional arguments passed to fcompile
         """
-        if self.type_key == "stacktvm":
-            raise ValueError("Module[%s]: export_library requires llvm module,"
-                             " did you build with LLVM enabled?" % self.type_key)
+        if self.type_key == "stackvm":
+            if not file_name.endswith(".stackvm"):
+                raise ValueError("Module[%s]: can only be saved as stackvm format."
+                                 "did you build with LLVM enabled?" % self.type_key)
+            self.save(file_name)
+            return
 
         if self.type_key != "llvm":
             raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key)
diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc
index 8bc7d238a866..12570e5881a9 100644
--- a/src/codegen/codegen.cc
+++ b/src/codegen/codegen.cc
@@ -40,7 +40,6 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) {
     CHECK_EQ(im->imports().size(), 0U)
         << "Only support simply one-level hierarchy";
     std::string tkey = im->type_key();
-    std::string bin;
     stream->Write(tkey);
     im->SaveToBinary(stream);
   }
diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc
deleted file mode 100644
index 731663deb448..000000000000
--- a/src/codegen/stack_vm/stack_vm_module.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/*!
- *  Copyright (c) 2017 by Contributors
- * \file stack_vm_module.cc
- */
-#include <tvm/runtime/registry.h>
-#include <tvm/runtime/module.h>
-#include <tvm/codegen.h>
-#include "./codegen_stack_vm.h"
-
-namespace tvm {
-namespace codegen {
-
-class StackVMModuleNode : public runtime::ModuleNode {
- public:
-  const char* type_key() const {
-    return "stackvm";
-  }
-
-  PackedFunc GetFunction(
-      const std::string& name,
-      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
-    if (name == runtime::symbol::tvm_module_main) {
-      return GetFunction(entry_func_, sptr_to_self);
-    }
-    auto it = fmap_.find(name);
-    if (it == fmap_.end()) return PackedFunc();
-    const StackVM& vm = it->second;
-    // capture sptr_to_self to keep module node alive.
-    return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) {
-        vm(args);
-      });
-  }
-
-  std::string GetSource(const std::string& format) final {
-    std::ostringstream os;
-    for (const auto& kv : fmap_) {
-      os << "Function: " << kv.first << '\n';
-      os << kv.second;
-    }
-    return os.str();
-  }
-
-  static runtime::Module Build(const Array<LoweredFunc>& funcs) {
-    CHECK_NE(funcs.size(), 0U);
-    std::shared_ptr<StackVMModuleNode> n =
-        std::make_shared<StackVMModuleNode>();
-    for (LoweredFunc f : funcs) {
-      StackVM vm = codegen::CodeGenStackVM().Compile(f);
-      CHECK(!n->fmap_.count(f->name))
-          << "Function name " << f->name << "already exist in list";
-      vm.mod_ctx = n.get();
-      n->fmap_[f->name] = std::move(vm);
-    }
-    n->entry_func_ = funcs[0]->name;
-    return runtime::Module(n);
-  }
-
- private:
-  // entry function.
-  std::string entry_func_;
-  // internal function map
-  std::unordered_map<std::string, StackVM> fmap_;
-};
-
-TVM_REGISTER_API("codegen.build_stackvm")
-.set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = StackVMModuleNode::Build(args[0]);
-  });
-
-}  // namespace codegen
-}  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stackvm/codegen_stackvm.cc
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.cc
rename to src/codegen/stackvm/codegen_stackvm.cc
index 168e411fa6e2..517793ff14a3 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.cc
+++ b/src/codegen/stackvm/codegen_stackvm.cc
@@ -1,11 +1,12 @@
 /*!
  *  Copyright (c) 2017 by Contributors
- * \file codegen_stack_vm.cc
+ * \file codegen_stackvm.cc
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/packed_func_ext.h>
 #include <limits>
-#include "./codegen_stack_vm.h"
+#include "./codegen_stackvm.h"
+#include "../../runtime/stackvm/stackvm_module.h"
 
 namespace tvm {
 namespace codegen {
@@ -19,6 +20,7 @@ StackVM CodeGenStackVM::Compile(LoweredFunc f) {
     CHECK_EQ(static_cast<size_t>(vid), i);
   }
   this->Push(f->body);
+  vm_.InitCache();
   return std::move(vm_);
 }
 
@@ -486,5 +488,22 @@ void CodeGenStackVM::VisitExpr_(const Let *op) {
   this->PushOp(StackVM::STORE_HEAP, static_cast<int>(vid));
   this->Push(op->body);
 }
+
+runtime::Module BuildStackVM(const Array<LoweredFunc>& funcs) {
+  CHECK_NE(funcs.size(), 0U);  // funcs[0] below names the entry function
+  std::unordered_map<std::string, StackVM> fmap;
+  for (LoweredFunc f : funcs) {
+    StackVM vm = codegen::CodeGenStackVM().Compile(f);
+    CHECK(!fmap.count(f->name))
+        << "Function name " << f->name << " already exists in list";
+    fmap[f->name] = std::move(vm);
+  }
+  return runtime::StackVMModuleCreate(std::move(fmap), funcs[0]->name);
+}
+
+TVM_REGISTER_API("codegen.build_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildStackVM(args[0]);
+  });
 }  // namespace codegen
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/codegen_stack_vm.h b/src/codegen/stackvm/codegen_stackvm.h
similarity index 95%
rename from src/codegen/stack_vm/codegen_stack_vm.h
rename to src/codegen/stackvm/codegen_stackvm.h
index 089284529242..23bd61dcb4c2 100644
--- a/src/codegen/stack_vm/codegen_stack_vm.h
+++ b/src/codegen/stackvm/codegen_stackvm.h
@@ -3,8 +3,8 @@
  * \file codegen_stack_vm.h
  * \brief Codegen into Simple Stack VM.
  */
-#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#ifndef TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
+#define TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
 
 #include <tvm/ir.h>
 #include <tvm/ir_functor_ext.h>
@@ -14,12 +14,14 @@
 #include <vector>
 #include <unordered_map>
 
-#include "./stack_vm.h"
+#include "../../runtime/stackvm/stackvm.h"
 
 namespace tvm {
 namespace codegen {
 
 using namespace ir;
+using runtime::StackVM;
+
 /*!
  * \brief A base class to generate a stack VM.
  *  This module is used to generate host wrapper
@@ -145,4 +147,4 @@ class CodeGenStackVM
 
 }  // namespace codegen
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_
+#endif  // TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_
diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/runtime/stackvm/stackvm.cc
similarity index 90%
rename from src/codegen/stack_vm/stack_vm.cc
rename to src/runtime/stackvm/stackvm.cc
index 95feeae3679e..f86bfec087e4 100644
--- a/src/codegen/stack_vm/stack_vm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -1,15 +1,16 @@
 /*!
  *  Copyright (c) 2017 by Contributors
  * Implementation stack VM.
- * \file stack_vm.cc
+ * \file stackvm.cc
  */
 #include <dmlc/thread_local.h>
-#include <tvm/ir.h>
+#include <tvm/runtime/util.h>
 #include <tvm/runtime/c_backend_api.h>
-#include "./stack_vm.h"
+#include <algorithm>
+#include "./stackvm.h"
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 typedef dmlc::ThreadLocalStore<StackVM::State> StackVMStateStore;
 
@@ -172,28 +173,64 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) {  // NOLINT(*)
   return os;
 }
 
-void StackVM::operator()(const runtime::TVMArgs& args) const {
+void StackVM::Run(const runtime::TVMArgs& args,
+                  runtime::ModuleNode* mod_ctx) const {
   StackVM::State* s = StackVM::ThreadLocalState();
+  if (s->heap.size() < heap_size) {
+    s->heap.resize(heap_size);
+  }
   s->sp = 0;
   s->pc = 0;
-  if (s->heap.size() < this->heap_size) {
-    s->heap.resize(this->heap_size);
-  }
-
+  s->mod_ctx = mod_ctx;
   s->heap[0].v_handle = (void*)args.values;  // NOLINT(*)
   s->heap[1].v_handle = (void*)args.type_codes;  // NOLINT(*)
   s->heap[2].v_int64 = args.num_args;
   this->Run(s);
 }
 
+void StackVM::InitCache() {
+  extern_func_cache_.clear();
+  extern_func_cache_.resize(
+      extern_func_name.size(), PackedFunc(nullptr));
+}
+
+void StackVM::Save(dmlc::Stream* strm) const {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy(code.size());
+  std::transform(code.begin(), code.end(), code_copy.begin(), [](Code c) {
+      return c.v_int;
+    });
+  strm->Write(code_copy);
+  strm->Write(str_data);
+  strm->Write(extern_func_name);
+  strm->Write(heap_id_name);
+  strm->Write(heap_size);
+  strm->Write(stack_size);
+}
+
+bool StackVM::Load(dmlc::Stream* strm)  {
+  // to be endian invariant.
+  std::vector<int32_t> code_copy;
+  if (!strm->Read(&code_copy)) return false;
+  code.resize(code_copy.size());
+  std::transform(code_copy.begin(), code_copy.end(), code.begin(), [](int v) {
+      Code code; code.v_int = v; return code;
+    });
+  if (!strm->Read(&str_data)) return false;
+  if (!strm->Read(&extern_func_name)) return false;
+  if (!strm->Read(&heap_id_name)) return false;
+  if (!strm->Read(&heap_size)) return false;
+  if (!strm->Read(&stack_size)) return false;
+  this->InitCache();
+  return true;
+}
+
 void StackVM::Run(State* s) const {
   int64_t sp = s->sp;
   int64_t pc = s->pc;
   int64_t alloca_sp = s->sp;
   std::vector<TVMValue>& stack = s->stack;
   std::vector<TVMValue>& heap = s->heap;
-  s->extern_func.clear();
-  s->extern_func.resize(extern_func_name.size());
   if (stack.size() < stack_size) {
     stack.resize(stack_size);
   }
@@ -488,17 +525,19 @@ void StackVM::Run(State* s) const {
 }
 
 const PackedFunc& StackVM::GetExtern(State* s, int fid) const {
-  PackedFunc& f = s->extern_func[fid];
+  CHECK_LT(static_cast<size_t>(fid), extern_func_cache_.size());
+  // allow race write in this, since write is idempotent
+  PackedFunc& f = extern_func_cache_[fid];
   if (f == nullptr) {
-    CHECK(mod_ctx != nullptr)
+    CHECK(s->mod_ctx != nullptr)
         << "No local context is set in stackvm";
-    const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
+    CHECK(s->mod_ctx != nullptr);
+    const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]);
     CHECK(pf != nullptr);
     f = *pf;
-    CHECK(f != nullptr);
   }
   return f;
 }
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
diff --git a/src/codegen/stack_vm/stack_vm.h b/src/runtime/stackvm/stackvm.h
similarity index 89%
rename from src/codegen/stack_vm/stack_vm.h
rename to src/runtime/stackvm/stackvm.h
index 54972d39a5df..b2ce975b2c73 100644
--- a/src/codegen/stack_vm/stack_vm.h
+++ b/src/runtime/stackvm/stackvm.h
@@ -1,36 +1,36 @@
 /*!
  *  Copyright (c) 2016 by Contributors
- * \file stack_vm.h
+ * \file stackvm.h
  * \brief A simple stack-based virtual machine.
  *
  *  This can be used to interepret host side code
  *  to setup calls into device functions
  *  when only Runtime compilation for device is available(via NVRTC or OpenCL).
  */
-#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_
-#define TVM_CODEGEN_STACK_VM_STACK_VM_H_
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_H_
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/packed_func.h>
 #include <tvm/runtime/module.h>
-#include <tvm/packed_func_ext.h>
 #include <string>
 #include <vector>
 
 namespace tvm {
-namespace codegen {
+namespace runtime {
 
 using runtime::operator<<;
 /*!
- * \brief A simple stack-based virtual machine.
+ * \brief A simple stack-based virtual machine program.
  */
 class StackVM {
  public:
   /*!
-   * \brief Invoke the StackVM as PackedFunc
+   * \brief Invoke the StackVM program.
    * \param args The arguments to the StackVM.
+   * \param mod_ctx The module context used in running.
    */
-  void operator()(const TVMArgs& args) const;
+  void Run(const TVMArgs& args, runtime::ModuleNode* mod_ctx) const;
   /*!
    * \brief The opcode of stack vm
    * \note Notation
@@ -276,21 +276,25 @@ class StackVM {
     std::vector<TVMValue> stack;
     /*! \brief The global heap space */
     std::vector<TVMValue> heap;
-    /*! \brief extern functions */
-    std::vector<PackedFunc> extern_func;
     /*! \brief stack pointer  */
     int64_t sp{0};
     /*! \brief program counter */
     int64_t pc{0};
+    /*! \brief The current module context of stackvm */
+    runtime::ModuleNode* mod_ctx{nullptr};
   };
-  /*! \brief The external function entries. */
-  struct ExternFuncEntry {
-    std::string name;
-    runtime::PackedFunc func;
-  };
-
-  /*! \brief execute the stack vm with given state */
-  void Run(State* state) const;
+  /*! \brief Initialize the local extern function cache */
+  void InitCache();
+  /*!
+   * \brief Save stackvm program to an output stream
+   * \param strm The output stream
+   */
+  void Save(dmlc::Stream* strm) const;
+  /*!
+   * \brief Load stackvm program from input stream
+   * \param strm The input stream
+   */
+  bool Load(dmlc::Stream* strm);
   /*!
    * \brief Print instruction at location pc
    * \param os The ostream
@@ -300,12 +304,11 @@ class StackVM {
   int64_t PrintCode(std::ostream&os, int64_t pc) const;  // NOLINT(*)
   /*! \brief Get thread local state of the stack VM */
   static State* ThreadLocalState();
+  // The code below are programs
   /*! \brief The instructions */
   std::vector<Code> code;
   /*! \brief constant error messages */
   std::vector<std::string> str_data;
-  /*! \brief The current module context of stackvm */
-  runtime::ModuleNode* mod_ctx{nullptr};
   /*! \brief Extern functions */
   std::vector<std::string> extern_func_name;
   /*! \brief name of each heap id */
@@ -385,10 +388,18 @@ class StackVM {
   friend std::ostream& operator<<(std::ostream& os, const StackVM& vm);  // NOLINT(*)
 
  private:
+  //  execute the stack vm with given state
+  void Run(State* state) const;
   // get extern function.
   const PackedFunc& GetExtern(State* s, int fid) const;
+  // cached extern function
+  mutable std::vector<PackedFunc> extern_func_cache_;
 };
 
-}  // namespace codegen
+}  // namespace runtime
 }  // namespace tvm
-#endif  // TVM_CODEGEN_STACK_VM_STACK_VM_H_
+
+namespace dmlc {
+DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::StackVM, true);
+}
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_H_
diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc
new file mode 100644
index 000000000000..71ca9ba6c09a
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.cc
@@ -0,0 +1,128 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file stackvm_module.cc
+ */
+#include <tvm/runtime/registry.h>
+#include <tvm/runtime/module.h>
+#include <dmlc/memory_io.h>
+#include "./stackvm_module.h"
+#include "../file_util.h"
+#include "../module_util.h"
+
+namespace tvm {
+namespace runtime {
+
+class StackVMModuleNode : public runtime::ModuleNode {
+ public:
+  const char* type_key() const {
+    return "stackvm";
+  }
+
+  PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final {
+    if (name == runtime::symbol::tvm_module_main) {
+      return GetFunction(entry_func_, sptr_to_self);
+    }
+    auto it = fmap_.find(name);
+    if (it == fmap_.end()) return PackedFunc();
+    const StackVM& vm = it->second;
+    // capture sptr_to_self to keep module node alive.
+    return PackedFunc([vm, sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        vm.Run(args, this);
+      });
+  }
+
+  std::string GetSource(const std::string& format) final {
+    std::ostringstream os;
+    for (const auto& kv : fmap_) {
+      os << "Function: " << kv.first << '\n';
+      os << kv.second;
+    }
+    return os.str();
+  }
+
+  void SaveToFile(const std::string& file_name,
+                  const std::string& format) final {
+    std::string data, mblob;
+    dmlc::MemoryStringStream writer(&data);
+    dmlc::Stream* strm = &writer;
+    strm->Write(fmap_);
+    strm->Write(entry_func_);
+    // also save imports
+    uint64_t num_imports = static_cast<uint64_t>(imports_.size());
+    strm->Write(num_imports);
+
+    for (runtime::Module im : imports_) {
+      CHECK_EQ(im->imports().size(), 0U)
+          << "Only support simply one-level hierarchy";
+      std::string tkey = im->type_key();
+      strm->Write(tkey);
+      LOG(INFO) << "save " << tkey;
+      im->SaveToBinary(strm);
+      LOG(INFO) << "FInish save " << tkey;
+    }
+    SaveBinaryToFile(file_name, data);
+  }
+
+  static Module Create(std::unordered_map<std::string, StackVM> fmap,
+                       std::string entry_func) {
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    return Module(n);
+  }
+
+  static Module Load(dmlc::Stream* strm) {  // mirrors SaveToFile layout
+    std::unordered_map<std::string, StackVM> fmap;
+    std::string entry_func;
+    CHECK(strm->Read(&fmap)) << "Cannot read stackvm function map";
+    CHECK(strm->Read(&entry_func)) << "Cannot read stackvm entry function";
+    std::shared_ptr<StackVMModuleNode> n =
+        std::make_shared<StackVMModuleNode>();
+    n->fmap_ = std::move(fmap);
+    n->entry_func_ = std::move(entry_func);
+    uint64_t num_imports;  // only valid once the checked read below succeeds
+    CHECK(strm->Read(&num_imports)) << "Cannot read stackvm import count";
+    for (uint64_t i = 0; i < num_imports; ++i) {
+      std::string tkey;
+      CHECK(strm->Read(&tkey));
+      std::string fkey = "module.loadbinary_" + tkey;
+      const PackedFunc* f = Registry::Get(fkey);
+      CHECK(f != nullptr)
+          << "Loader of " << tkey << "("
+          << fkey << ") is not presented.";
+      Module m = (*f)(static_cast<void*>(strm));
+      n->imports_.emplace_back(std::move(m));
+    }
+    return Module(n);
+  }
+
+  static Module LoadFromFile(std::string file_name,
+                             std::string format) {
+    std::string data;
+    LoadBinaryFromFile(file_name, &data);
+    dmlc::MemoryStringStream reader(&data);
+    return Load(&reader);
+  }
+
+ private:
+  // internal function map
+  std::unordered_map<std::string, StackVM> fmap_;
+  // entry function.
+  std::string entry_func_;
+};
+
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func) {  // by-value sinks
+  return StackVMModuleNode::Create(std::move(fmap), std::move(entry_func));
+}
+
+TVM_REGISTER_GLOBAL("module.loadfile_stackvm")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]);
+  });
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h
new file mode 100644
index 000000000000..fcd51a64f870
--- /dev/null
+++ b/src/runtime/stackvm/stackvm_module.h
@@ -0,0 +1,27 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file stackvm_module.h
+ * \brief StackVM module
+ */
+#ifndef TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+#define TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
+
+#include <tvm/runtime/packed_func.h>
+#include <string>
+#include "./stackvm.h"
+
+namespace tvm {
+namespace runtime {
+/*!
+ * \brief create a stackvm module
+ *
+ * \param fmap The map from name to function
+ * \param entry_func The entry function name.
+ * \return The created module
+ */
+Module StackVMModuleCreate(std::unordered_map<std::string, StackVM> fmap,
+                           std::string entry_func);
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_
diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py
index 1b239a357f66..8ee3ea5e06c0 100644
--- a/tests/python/unittest/test_module_load.py
+++ b/tests/python/unittest/test_module_load.py
@@ -109,11 +109,25 @@ def check_device(device):
             f2[name](a, b)
             np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
-    check_device("cuda")
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("metal")
+    def check_stackvm(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        temp = util.tempdir()
+        name = "myadd_%s" % device
+        f = tvm.build(s, [A, B], device, "stackvm", name=name)
+        path_dso = temp.relpath("dev_lib.stackvm")
+        #f.export_library(path_dso)
+        #f1 = tvm.module.load(path_dso)
+        a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx)
+        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx)
+        f(a, b)
+        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
 
+    for device in ["cuda", "vulkan", "opencl", "metal"]:
+        check_device(device)
+        check_stackvm(device)
 
 def test_combine_module_llvm():
     """Test combine multiple module into one shared lib."""

From 19cf5c66ada5ed9be027eaafae07099cfaf93164 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 11 Aug 2018 15:31:15 -0700
Subject: [PATCH 18/77] [DLPACK] Enable cython support (#1589)

---
 HalideIR                            |  2 +-
 include/tvm/runtime/c_runtime_api.h |  2 +-
 python/tvm/_ffi/_ctypes/ndarray.py  | 49 ++++++++++++++++++++++++++++-
 python/tvm/_ffi/_cython/base.pxi    | 13 +++++++-
 python/tvm/_ffi/_cython/ndarray.pxi | 38 ++++++++++++++++++++++
 python/tvm/_ffi/ndarray.py          | 48 +++-------------------------
 tests/scripts/task_python_nnvm.sh   |  4 +++
 tests/scripts/task_python_topi.sh   |  4 +++
 8 files changed, 113 insertions(+), 47 deletions(-)

diff --git a/HalideIR b/HalideIR
index a5a80bdc8232..a0b9563f4571 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit a5a80bdc8232c9dbfe508bb5c46e8f58cdf7ec20
+Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index dca0d5ed4a30..32d574340052 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -467,7 +467,7 @@ TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from,
 
 /*!
  * \brief Delete (free) a DLManagedTensor's data.
- * \param dltensor Pointer to the DLManagedTensor. 
+ * \param dltensor Pointer to the DLManagedTensor.
  */
 TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor);
 
diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py
index df877679fc7d..8b88e7dc98ea 100644
--- a/python/tvm/_ffi/_ctypes/ndarray.py
+++ b/python/tvm/_ffi/_ctypes/ndarray.py
@@ -1,11 +1,47 @@
+# pylint: disable=invalid-name
 """Runtime NDArray api"""
 from __future__ import absolute_import
 
 import ctypes
-from ..base import _LIB, check_call
+from ..base import _LIB, check_call, c_str
 from ..runtime_ctypes import TVMArrayHandle
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle
 
+
+TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
+_c_str_dltensor = c_str('dltensor')
+_c_str_used_dltensor = c_str('used_dltensor')
+
+
+# used for PyCapsule manipulation
+if hasattr(ctypes, 'pythonapi'):
+    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
+    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
+    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
+
+
+def _from_dlpack(dltensor):
+    dltensor = ctypes.py_object(dltensor)
+    if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        handle = TVMArrayHandle()
+        check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
+        ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
+        return _make_array(handle, False)
+    raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once")
+
+
+def _dlpack_deleter(pycapsule):  # PyCapsule destructor callback
+    pycapsule = ctypes.cast(pycapsule, ctypes.py_object)
+    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):  # not yet consumed
+        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
+        _LIB.TVMDLManagedTensorCallDeleter(ptr)
+        ctypes.pythonapi.PyCapsule_SetDestructor(pycapsule, TVMPyCapsuleDestructor(0))
+
+_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+
+
 class NDArrayBase(object):
     """A simple Device/CPU Array object in runtime."""
     __slots__ = ["handle", "is_view"]
@@ -29,6 +65,17 @@ def __del__(self):
     def _tvm_handle(self):
         return ctypes.cast(self.handle, ctypes.c_void_p).value
 
+    def to_dlpack(self):
+        """Produce a DLPack tensor from the array without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        handle = ctypes.c_void_p()
+        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
+        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
+
 
 def _make_array(handle, is_view):
     handle = ctypes.cast(handle, TVMArrayHandle)
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index 50a99245f793..00173c431bb7 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -1,6 +1,7 @@
 from ..base import TVMError
 from libcpp.vector cimport vector
 from cpython.version cimport PY_MAJOR_VERSION
+from cpython cimport pycapsule
 from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t
 import ctypes
 
@@ -40,6 +41,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         int64_t* strides
         uint64_t byte_offset
 
+    ctypedef struct DLManagedTensor:
+        DLTensor dl_tensor
+        void* manager_ctx
+        void (*deleter)(DLManagedTensor* self)
+
     ctypedef struct TVMValue:
         int64_t v_int64
         double v_float64
@@ -49,7 +55,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
         DLContext v_ctx
 
 ctypedef int64_t tvm_index_t
-ctypedef void* DLTensorHandle
+ctypedef DLTensor* DLTensorHandle
 ctypedef void* TVMStreamHandle
 ctypedef void* TVMRetValueHandle
 ctypedef void* TVMFunctionHandle
@@ -92,6 +98,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
     int TVMArrayCopyFromTo(DLTensorHandle src,
                            DLTensorHandle to,
                            TVMStreamHandle stream)
+    int TVMArrayFromDLPack(DLManagedTensor* arr_from,
+                           DLTensorHandle* out)
+    int TVMArrayToDLPack(DLTensorHandle arr_from,
+                         DLManagedTensor** out)
+    void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor)
 
 cdef extern from "tvm/c_dsl_api.h":
     int TVMNodeFree(NodeHandle handle)
diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi
index 44b0a544609d..0a507affec1c 100644
--- a/python/tvm/_ffi/_cython/ndarray.pxi
+++ b/python/tvm/_ffi/_cython/ndarray.pxi
@@ -1,5 +1,29 @@
 from ..runtime_ctypes import TVMArrayHandle
 
+cdef const char* _c_str_dltensor = "dltensor"
+cdef const char* _c_str_used_dltensor = "used_dltensor"
+
+
+cdef void _c_dlpack_deleter(object pycaps):
+    cdef DLManagedTensor* dltensor
+    if pycapsule.PyCapsule_IsValid(pycaps, _c_str_dltensor):
+        dltensor = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(pycaps, _c_str_dltensor)
+        TVMDLManagedTensorCallDeleter(dltensor)
+
+
+def _from_dlpack(object dltensor):
+    cdef DLManagedTensor* ptr
+    cdef DLTensorHandle chandle
+    if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor):
+        ptr = <DLManagedTensor*>pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor)
+        CALL(TVMArrayFromDLPack(ptr, &chandle))
+        # mark the capsule as consumed: drop its destructor, then rename it
+        pycapsule.PyCapsule_SetDestructor(dltensor, NULL)
+        pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor)
+        return c_make_array(chandle, 0)
+    raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once")
+
+
 cdef class NDArrayBase:
     cdef DLTensor* chandle
     cdef int c_is_view
@@ -35,12 +59,26 @@ cdef class NDArrayBase:
         if self.c_is_view == 0:
             CALL(TVMArrayFree(self.chandle))
 
+    def to_dlpack(self):
+        """Produce a DLPack tensor from the array without copying memory
+
+        Returns
+        -------
+        dlpack : DLPack tensor view of the array data
+        """
+        cdef DLManagedTensor* dltensor
+        if self.c_is_view != 0:
+            raise ValueError("to_dlpack do not work with memory views")
+        CALL(TVMArrayToDLPack(self.chandle, &dltensor))
+        return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter)
+
 
 cdef c_make_array(void* chandle, is_view):
     ret = _CLASS_NDARRAY(None, is_view)
     (<NDArrayBase>ret).chandle = <DLTensor*>chandle
     return ret
 
+
 cdef _TVM_COMPATS = ()
 
 cdef _TVM_EXT_RET = {}
diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py
index d994d7c2e4a5..e49c3b62f473 100644
--- a/python/tvm/_ffi/ndarray.py
+++ b/python/tvm/_ffi/ndarray.py
@@ -17,28 +17,17 @@
     if _FFI_MODE == "ctypes":
         raise ImportError()
     if sys.version_info >= (3, 0):
-        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy3.core import NDArrayBase as _NDArrayBase
     else:
-        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array
+        from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
         from ._cy2.core import NDArrayBase as _NDArrayBase
 except IMPORT_EXCEPT:
     # pylint: disable=wrong-import-position
-    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array
+    from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack
     from ._ctypes.ndarray import NDArrayBase as _NDArrayBase
 
 
-TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-_c_str_dltensor = c_str('dltensor')
-
-
-# used for PyCapsule manipulation
-if hasattr(ctypes, 'pythonapi'):
-    ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p
-    ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p
-    ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object
-
-
 def context(dev_type, dev_id=0):
     """Construct a TVM context with given device type and id.
 
@@ -134,30 +123,14 @@ def from_dlpack(dltensor):
     Parameters
     ----------
     dltensor : DLPack tensor
+        Input DLManagedTensor, can only be consumed once.
 
     Returns
     -------
     arr: tvm.nd.NDArray
         The array view of the tensor data.
     """
-    dltensor = ctypes.py_object(dltensor)
-    name = ctypes.pythonapi.PyCapsule_GetName(dltensor)
-    ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name)
-    handle = TVMArrayHandle()
-    check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle)))
-    ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None)
-    return _make_array(handle, False)
-
-
-def _dlpack_deleter(pycapsule):
-    pycapsule = ctypes.py_object(pycapsule)
-    if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor):
-        ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor)
-        _LIB.TVMDLManagedTensorCallDeleter(ptr)
-        ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0))
-
-
-_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter)
+    return _from_dlpack(dltensor)
 
 
 class NDArrayBase(_NDArrayBase):
@@ -308,17 +281,6 @@ def copyto(self, target):
             raise ValueError("Unsupported target type %s" % str(type(target)))
         return target
 
-    def to_dlpack(self):
-        """Produce an array from a DLPack Tensor without copying memory
-
-        Returns
-        -------
-        dlpack : DLPack tensor view of the array data
-        """
-        handle = ctypes.c_void_p()
-        check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle)))
-        return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter)
-
 
 def free_extension_handle(handle, type_code):
     """Free c++ extension type handle
diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh
index 790073a2fe8b..cf6039d58416 100755
--- a/tests/scripts/task_python_nnvm.sh
+++ b/tests/scripts/task_python_nnvm.sh
@@ -4,6 +4,10 @@ export PYTHONPATH=nnvm/python:python:topi/python
 # to avoid openblas threading error
 export OMP_NUM_THREADS=1
 
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
+
 echo "Running unittest..."
 python -m nose -v nnvm/tests/python/unittest || exit -1
 python3 -m nose -v nnvm/tests/python/unittest || exit -1
diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh
index 13a324d79b1f..6842ddaae13a 100755
--- a/tests/scripts/task_python_topi.sh
+++ b/tests/scripts/task_python_topi.sh
@@ -1,4 +1,8 @@
 export PYTHONPATH=python:topi/python
 
+# Rebuild cython
+make cython || exit -1
+make cython3 || exit -1
+
 python -m nose -v topi/tests/python || exit -1
 python3 -m nose -v topi/tests/python || exit -1

From 60769b77f9abe29aafabda4d5d1cd625e7c61f9f Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Mon, 13 Aug 2018 09:35:46 -0700
Subject: [PATCH 19/77] Fixed bugs for SSD sorting and multbox detection
 (#1578)

---
 topi/python/topi/cuda/nms.py          | 480 ++++++++++++++++++++------
 topi/python/topi/cuda/ssd/multibox.py | 225 ++++++++----
 2 files changed, 534 insertions(+), 171 deletions(-)

diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py
index 4d4e402de5c2..361208bf1cfb 100644
--- a/topi/python/topi/cuda/nms.py
+++ b/topi/python/topi/cuda/nms.py
@@ -7,19 +7,155 @@
 from topi.vision import nms
 
 
-def sort_ir(data, index, output, axis, is_descend):
-    """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU.
+def sort_pre_ir(index, sizes_out, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 1/4 for computing segments' starting locations.
+
+    Parameters
+    ----------
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    sizes_out : Buffer
+        Output buffer of start locations of each sorting segment.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    p_index = ib.buffer_ptr(index)
+    dshape = sizes_out.shape
+    sizes = ib.buffer_ptr(sizes_out)
+    nthread_tx = max_threads
+    nthread_bx = dshape[0] // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+        sizes[tid] = p_index[tid]
+
+    # running (inclusive) sum of the copied counts, done serially by thread 0
+    with ib.if_scope(tid < 1):
+        with ib.for_range(0, axis_mul_before * axis_mul_after - 1, name="k") as k:
+            sizes[k + 1] += sizes[k]
+    body = ib.get()
+    return body
+
+
+def sort_pre_ir_data(data, index, sizes_in, data_out, index_out, \
+                     axis, axis_mul_before, axis_mul_after):
+    """Low level IR routing subfunction 2/4 for flattening data and indices into segmented format.
 
     Parameters
     ----------
     data: Buffer
-        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+        Buffer of output boxes with class and score.
 
     index : Buffer
-        Buffer of number of valid number of boxes.
+        Buffer of number of valid output boxes.
 
-    output : Buffer
-        Output buffer of indicies of sorted tensor.
+    sizes_in : Buffer
+        Buffer of start locations of each sorting segment.
+
+    data_out : Buffer
+        Buffer of flattened segmented data.
+
+    index_out : Buffer
+        Buffer of flattened segmented indices.
+
+    axis : int
+        The axis used for sorting.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    ib = tvm.ir_builder.create()
+    sizes = ib.buffer_ptr(sizes_in)
+    p_index = ib.buffer_ptr(index)
+    p_data = ib.buffer_ptr(data)
+    data_new = ib.buffer_ptr(data_out)
+    index_new = ib.buffer_ptr(index_out)
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    dshape = tvm.max(sizes_in.shape[0], p_index[0])
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            current_sort_num = p_index[tid]
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, current_sort_num, name="k") as k:
+                full_idx = base_idx + k * axis_mul_after
+                # A Python assignment under ib.if_scope is unconditional, so pick the
+                # segment offset with a select instead: segment 0 begins at offset 0.
+                start = tvm.select(tid == 0, tvm.const(0, sizes_in.dtype),
+                                   sizes[tid-1])
+                index_new[start + k] = k
+                data_new[start + k] = p_data[full_idx]
+    with ib.else_scope():
+        with ib.if_scope(tid == 0):
+            with ib.for_range(0, p_index[0], name="k") as k:
+                index_new[k] = k
+
+    body = ib.get()
+    return body
+
+def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \
+                axis_mul_after, axis, is_descend):
+    """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_data : Buffer
+        Buffer of flattened segmented data.
+
+    new_index : Buffer
+        Buffer of flattened segmented indices.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    out_index : Buffer
+        Output buffer of output box indexes sorted by score in a flattened segmented format.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
 
     axis : int
         The axis used for sorting.
@@ -32,15 +168,197 @@ def sort_ir(data, index, output, axis, is_descend):
     stmt : Stmt
         The result IR statement.
     """
-
     max_threads = int(
         tvm.target.current_target(allow_none=False).max_num_threads)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     ib = tvm.ir_builder.create()
+    dshape = loc.shape
+    fshape = data.shape[axis] * dshape[0]
+    temp_data = ib.allocate(
+        "float32", dshape, name="temp_data", scope="local")
     p_data = ib.buffer_ptr(data)
     p_index = ib.buffer_ptr(index)
+    data_new = ib.buffer_ptr(new_data)
+    index_new = ib.buffer_ptr(new_index)
+    index_out = ib.buffer_ptr(out_index)
+    sizes = ib.buffer_ptr(loc)
+    nthread_tx = max_threads
+    nthread_bx = fshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            with ib.if_scope(tid == 0):
+                start = 0
+            with ib.else_scope():
+                start = sizes[tid-1]
+            # OddEvenTransposeSort
+            with ib.for_range(0, p_index[tid], name="k") as k:
+                with ib.for_range(0, p_index[tid] - 1, name="i") as i:
+                    with ib.if_scope(i % 2 == k % 2):
+                        with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)):
+                            temp_data[tid] = data_new[i+start]
+                            data_new[i+start] = data_new[i+start+1]
+                            data_new[i+start+1] = temp_data[tid]
+                            index_out[tid] = index_new[i+start]
+                            index_new[i+start] = index_new[i+start+1]
+                            index_new[i+start+1] = index_out[tid]
+        with ib.if_scope(tid < 1):
+            with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i:
+                index_out[i] = index_new[i]
+    with ib.else_scope():
+        with ib.for_range(0, fshape, name="k", for_type="unroll") as k:
+            with ib.if_scope(tvm.all(k % 2 == tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \
+                                             == is_descend)):
+                        data_new[tid] = p_data[tid+1]
+                        index_out[tid] = index_new[tid+1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid+1]
+                        index_new[tid] = index_out[tid+1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+            with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)):
+                with ib.if_scope(k % 2 == 0):
+                    with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)):
+                        data_new[tid] = p_data[tid-1]
+                        index_out[tid] = index_new[tid-1]
+                    with ib.else_scope():
+                        data_new[tid] = p_data[tid]
+                        index_out[tid] = index_new[tid]
+                with ib.else_scope():
+                    with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \
+                                             == is_descend)):
+                        p_data[tid] = data_new[tid-1]
+                        index_new[tid] = index_out[tid-1]
+                    with ib.else_scope():
+                        p_data[tid] = data_new[tid]
+                        index_new[tid] = index_out[tid]
+        with ib.if_scope(fshape % 2 == 1):
+            with ib.if_scope(tid < 1):
+                with ib.for_range(0, fshape, name="k") as k:
+                    index_out[tid] = index_new[tid]
+    body = ib.get()
+    return body
+
+
+def sort_ir_out(data, index, new_index, loc, output, axis_mul_before, axis_mul_after, axis):
+    """Low level IR routing subfunction 4/4 for writing sorted indices to output format.
+
+    Positions along the sorting axis at or beyond a segment's valid count are
+    filled with their own position index.
+
+    Parameters
+    ----------
+    data: Buffer
+        Buffer of output boxes with class and score.
+
+    index : Buffer
+        Buffer of number of valid output boxes.
+
+    new_index : Buffer
+        Buffer of sorted indices in a flattened format.
+
+    loc : Buffer
+        Buffer of start locations of each sorting segment.
+
+    output : Buffer
+        Output buffer of output box indexes sorted by score.
+
+    axis_mul_before : int
+        The multiplication result of axis dimensions before axis.
+
+    axis_mul_after : int
+        The multiplication result of axis dimensions after axis.
+
+    axis : int
+        The axis used for sorting.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    ib = tvm.ir_builder.create()
+    dshape = tvm.max(loc.shape[0], data.shape[axis])
+    p_index = ib.buffer_ptr(index)
+    index_new = ib.buffer_ptr(new_index)
+    sizes = ib.buffer_ptr(loc)
     p_out = ib.buffer_ptr(output)
+    nthread_tx = max_threads
+    nthread_bx = dshape // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+
+    with ib.if_scope(axis_mul_before * axis_mul_after > 1):
+        with ib.if_scope(tid < axis_mul_before * axis_mul_after):
+            i = tid / axis_mul_after
+            j = tid % axis_mul_after
+            base_idx = i * data.shape[axis] * axis_mul_after + j
+            with ib.for_range(0, data.shape[axis], name="k") as k:
+                # A Python assignment under ib.if_scope is unconditional, so pick the
+                # segment offset with a select instead: segment 0 begins at offset 0.
+                start = tvm.select(tid == 0, tvm.const(0, loc.dtype),
+                                   sizes[tid-1])
+                p_out[base_idx + k * axis_mul_after] = tvm.select(
+                    k < p_index[tid], index_new[k+start], k)
+    with ib.else_scope():
+        with ib.if_scope(tid < data.shape[axis]):
+            p_out[tid] = tvm.select(tid < p_index[0], index_new[tid], tid)
+
+    body = ib.get()
+    return body
+
+
+def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend):
+    """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu.
+
+    Parameters
+    ----------
+    data: tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors, 6].
+        The last dimension should be in format of
+        [class_id, score, box_left, box_top, box_right, box_bottom].
+
+    data_buf: Buffer
+        2D Buffer of input boxes' score with shape [batch_size, num_anchors].
+
+    index : tvm.Tensor
+        1-D tensor for valid number of boxes.
+
+    index_buf : Buffer
+        Buffer of number of valid number of boxes.
+
+    output_buf : Buffer
+        Output buffer of indicies of sorted tensor.
+
+    axis : int
+        The axis used for sorting.
+
+    is_descend : bool
+        If the sorted data is in descending order.
+
+    Returns
+    -------
+    out : tvm.Tensor
+        3-D tensor with shape [batch_size, num_anchors].
+    """
+
     ndim = len(data.shape)
     assert data.dtype == "float32", "Currently only supports input dtype to be float32"
     assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim
@@ -55,89 +373,60 @@ def sort_ir(data, index, output, axis, is_descend):
         elif i > axis:
             axis_mul_after *= data.shape[i]
 
-    dshape = 0
-    for i in range(0, len(index.shape)):
-        dshape += index.shape[i]
-    dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape,
-                        axis_mul_before*axis_mul_after)
-
-    sizes_temp = ib.allocate(
-        "int32", dshape, name="sizes_temp", scope="global")
-    sizes = ib.allocate("int32", dshape, name="sizes", scope="global")
-    temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local")
-    temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local")
-    data_new = ib.allocate("float32", dshape, name="data_new", scope="global")
-    index_new = ib.allocate("int32", dshape, name="index_new", scope="global")
-    nthread_tx = max_threads
-    nthread_bx = dshape // max_threads + 1
-    ib.scope_attr(tx, "thread_extent", nthread_tx)
-    ib.scope_attr(bx, "thread_extent", nthread_bx)
-    tid = bx * max_threads + tx
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        sizes[tid] = p_index[tid]
-        sizes_temp[tid] = p_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \
-             .astype("float32"))) + 1, name="k") as k:
-            with ib.if_scope(tid - (tvm.const(1, "int32") << k) >= 0):
-                with ib.if_scope(k % 2 == 0):
-                    sizes[tid] += sizes_temp[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes_temp[tid] = sizes[tid]
-                with ib.else_scope():
-                    sizes_temp[tid] += sizes[tid - (
-                        tvm.const(1, "int32") << k)]
-                    sizes[tid] = sizes_temp[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, current_sort_num, name="k") as k:
-            full_idx = base_idx + k * axis_mul_after
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            index_new[start + k] = k
-            data_new[start + k] = p_data[full_idx]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        with ib.if_scope(tid == 0):
-            start = 0
-        with ib.else_scope():
-            start = sizes[tid-1]
-        # OddEvenTransposeSort
-        with ib.for_range(0, p_index[tid], name="k") as k:
-            with ib.for_range(0, p_index[tid] - 1, name="i") as i:
-                with ib.if_scope(i % 2 == (k & 1)):
-                    with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^
-                                      is_descend) == False):
-                        temp_data[tid] = data_new[i+start]
-                        data_new[i+start] = data_new[i+start+1]
-                        data_new[i+start+1] = temp_data[tid]
-                        temp_index[tid] = index_new[i+start]
-                        index_new[i+start] = index_new[i+start+1]
-                        index_new[i+start+1] = temp_index[tid]
-
-    with ib.if_scope(tid < axis_mul_before * axis_mul_after):
-        i = tid / axis_mul_after
-        j = tid % axis_mul_after
-        current_sort_num = p_index[tid]
-        base_idx = i * data.shape[axis] * axis_mul_after + j
-        with ib.for_range(0, data.shape[axis], name="k") as k:
-            with ib.if_scope(tid == 0):
-                start = 0
-            with ib.else_scope():
-                start = sizes[tid-1]
-            p_out[base_idx + k * axis_mul_after] = tvm.select(
-                k < current_sort_num,
-                index_new[k+start], k)
-    body = ib.get()
-    return body
+    dshape = axis_mul_before*axis_mul_after
+    fshape = data.shape[axis] * dshape
+
+    loc_buf = api.decl_buffer(dshape, index.dtype, "sizes", data_alignment=8)
+    new_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_new", data_alignment=8)
+    out_index_buf = api.decl_buffer(
+        fshape, index.dtype, "index_out", data_alignment=8)
+    new_data_buf = api.decl_buffer(
+        dshape, data.dtype, "data_new", data_alignment=8)
+
+    loc = \
+        tvm.extern([(dshape,)],
+                   [index],
+                   lambda ins, outs: sort_pre_ir(
+                       ins[0], outs[0], axis_mul_before, axis_mul_after),
+                   dtype=[index.dtype],
+                   in_buffers=index_buf,
+                   out_buffers=[loc_buf],
+                   tag="sorting_prepare")
+
+    data_new, index_new = \
+        tvm.extern([(dshape,), (fshape,)],
+                   [data, index, loc],
+                   lambda ins, outs: sort_pre_ir_data(
+                       ins[0], ins[1], ins[2], outs[0], outs[1], axis,
+                       axis_mul_before, axis_mul_after),
+                   dtype=[data.dtype, index.dtype],
+                   in_buffers=[data_buf, index_buf, loc_buf],
+                   out_buffers=[new_data_buf, new_index_buf],
+                   tag="sorting_data")
+
+    index_out = \
+        tvm.extern([(fshape,)],
+                   [data, index, data_new, index_new, loc],
+                   lambda ins, outs: sort_oet_ir(
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0],
+                       axis_mul_before, axis_mul_after, axis, is_descend),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf,
+                               new_data_buf, new_index_buf, loc_buf],
+                   out_buffers=[out_index_buf],
+                   tag="sorting_oet")
+    out = \
+        tvm.extern([data.shape],
+                   [data, index, index_out, loc],
+                   lambda ins, outs: sort_ir_out(
+                       ins[0], ins[1], ins[2], ins[3], outs[0],
+                       axis_mul_before, axis_mul_after, axis),
+                   dtype=[index.dtype],
+                   in_buffers=[data_buf, index_buf, out_index_buf, loc_buf],
+                   out_buffers=output_buf,
+                   tag="sorting_output")
+    return out
 
 
 def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk):
@@ -333,15 +622,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk
     sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype,
                                       "sort_tensor_buf", data_alignment=8)
 
-    sort_tensor = \
-        tvm.extern(score_shape,
-                   [score_tensor, valid_count],
-                   lambda ins, outs: sort_ir(
-                       ins[0], ins[1], outs[0], score_axis, True),
-                   dtype=sort_tensor_dtype,
-                   in_buffers=[score_tensor_buf, valid_count_buf],
-                   out_buffers=sort_tensor_buf,
-                   name="nms_sort")
+    sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count,
+                           valid_count_buf, sort_tensor_buf, score_axis, True)
     out = \
         tvm.extern(data.shape,
                    [data, sort_tensor, valid_count],
diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py
index c22e7a513d7d..3c013c4d1605 100644
--- a/topi/python/topi/cuda/ssd/multibox.py
+++ b/topi/python/topi/cuda/ssd/multibox.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements
+# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args
 """SSD multibox operators"""
 from __future__ import absolute_import as _abs
 import math
@@ -13,6 +13,7 @@
 from topi.vision.ssd import multibox_transform_loc
 from ..nms import nms
 
+
 def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     """Low level IR routing for multibox_prior operator.
 
@@ -41,7 +42,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
     stmt : Stmt
         The result IR statement.
     """
-    max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads))
+    max_threads = int(math.sqrt(
+        tvm.target.current_target(allow_none=False).max_num_threads))
     tx = tvm.thread_axis("threadIdx.x")
     ty = tvm.thread_axis("threadIdx.y")
     bx = tvm.thread_axis("blockIdx.x")
@@ -76,7 +78,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
             for k in range(num_sizes + num_ratios - 1):
                 w = tvm.select(k < num_sizes,
-                               size_ratio_concat[k] * in_height / in_width / 2.0,
+                               size_ratio_concat[
+                                   k] * in_height / in_width / 2.0,
                                size_ratio_concat[0] * in_height / in_width *
                                math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                 h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0,
@@ -93,7 +96,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets):
 
 
 @multibox_prior.register(["cuda", "gpu"])
-def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
+def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1),
                        offsets=(0.5, 0.5), clip=False):
     """Generate prior(anchor) boxes from data, sizes and ratios.
 
@@ -124,31 +127,114 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \
     """
     num_sizes = len(sizes)
     num_ratios = len(ratios)
-    oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
+    oshape = (
+        1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4)
     out = tvm.extern(oshape, [data], lambda ins, outs:
-                     multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets),
+                     multibox_prior_ir(
+                         ins[0], outs[0], sizes, ratios, steps, offsets),
                      tag="multibox_prior")
     if clip:
         out = topi.clip(out, 0, 1)
     return out
 
 
-def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances):
-    """Low level IR routing for transform location in multibox_detection operator.
+def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold):
+    """Low level IR routine for transform location data preparation.
+
+    Parameters
+    ----------
+    cls_prob : Buffer
+        Buffer of class probabilities, read as (batch_size, num_classes, num_anchors).
+
+    valid_count : Buffer
+        Output buffer receiving the last entry of the running sum kept in temp_flag.
+
+    temp_flag : Buffer
+        Output buffer of 0/1 flags (class id > 0), afterwards accumulated into a running sum.
+
+    temp_id : Buffer
+        Output buffer of the best-scoring class id (from 1), or 0 if its score is below threshold.
+
+    temp_score_out : Buffer
+        Output buffer of the highest probability found among classes 1..num_classes-1.
+
+    threshold : float
+        Threshold to be a positive prediction.
+
+    Returns
+    -------
+    stmt : Stmt
+        The result IR statement.
+    """
+    batch_size = cls_prob.shape[0]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
+
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
+    ib = tvm.ir_builder.create()
+    score = ib.buffer_ptr(temp_score_out)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
+    tx = tvm.thread_axis("threadIdx.x")
+    bx = tvm.thread_axis("blockIdx.x")
+    nthread_tx = max_threads
+    nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1
+    ib.scope_attr(tx, "thread_extent", nthread_tx)
+    ib.scope_attr(bx, "thread_extent", nthread_bx)
+    tid = bx * max_threads + tx
+    p_cls_prob = ib.buffer_ptr(cls_prob)
+    p_valid_count = ib.buffer_ptr(valid_count)
+
+    with ib.if_scope(tid < batch_size * num_anchors):
+        n = tid / num_anchors  # batch index
+        i = tid % num_anchors  # anchor index within the batch
+        score[i] = -1.0  # NOTE(review): indexed by i, not tid; batches appear to share entries
+        cls_id[i] = 0
+        p_valid_count[n] = 0
+        with ib.for_range(0, num_classes-1, name="k") as k:  # class k + 1; class 0 is skipped
+            temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i]
+            with ib.if_scope(temp > score[i]):
+                cls_id[i] = k + 1
+                score[i] = temp
+        with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)):
+            cls_id[i] = 0
+        with ib.if_scope(cls_id[i] > 0):
+            flag[i] = 1
+        with ib.else_scope():
+            flag[i] = 0
+
+        with ib.if_scope(tid < batch_size):  # NOTE(review): no sync with flag writers above
+            with ib.for_range(0, num_anchors, name="k") as k:  # running sum of flags
+                with ib.if_scope(k > 0):
+                    flag[tid * num_anchors +
+                         k] += flag[tid * num_anchors + k - 1]
+            p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1]
+
+    body = ib.get()
+    return body
+
+
+def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \
+                     out, clip, variances, batch_size, num_classes, num_anchors):
+    """Low level IR routing for transform location in multibox_detection operator.
+
+    Parameters
+    ----------
     loc_pred : Buffer
         Buffer of location regression predictions.
 
     anchor : Buffer
         Buffer of prior anchor boxes.
 
-    valid_count : Buffer
-        Buffer of number of valid output boxes.
+    temp_flag : Buffer
+        Intermediate result buffer.
+
+    temp_id : Buffer
+        Intermediate result buffer.
+
+    temp_score_in : Buffer
+        Input buffer which stores intermediate results.
 
     out : Buffer
         Output buffer.
@@ -156,12 +242,18 @@ def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, thresho
     clip : boolean
         Whether to clip out-of-boundary boxes.
 
-    threshold : float
-        Threshold to be a positive prediction.
-
     variances : tuple of float
         Variances to be decoded from box regression output.
 
+    batch_size : int
+        Batch size
+
+    num_classes : int
+        Number of classes
+
+    num_anchors : int
+        Number of anchors
+
     Returns
     -------
     stmt : Stmt
@@ -187,21 +279,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
         ow = tvm.exp(pw * vw) * aw / 2.0
         oh = tvm.exp(ph * vh) * ah / 2.0
         return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
-               tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
-
-    batch_size = cls_prob.shape[0]
-    num_classes = cls_prob.shape[1]
-    num_anchors = cls_prob.shape[2]
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \
+            tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh)
 
+    max_threads = int(
+        tvm.target.current_target(allow_none=False).max_num_threads)
     ib = tvm.ir_builder.create()
-    temp_score = ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \
-                 ), name="temp_score", scope="global")
-    score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local")
-    cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local")
-    flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global")
-    max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads)
+    score = ib.buffer_ptr(temp_score_in)
+    cls_id = ib.buffer_ptr(temp_id)
+    flag = ib.buffer_ptr(temp_flag)
     tx = tvm.thread_axis("threadIdx.x")
     bx = tvm.thread_axis("blockIdx.x")
     nthread_tx = max_threads
@@ -209,42 +296,13 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
     ib.scope_attr(tx, "thread_extent", nthread_tx)
     ib.scope_attr(bx, "thread_extent", nthread_bx)
     tid = bx * max_threads + tx
-    p_cls_prob = ib.buffer_ptr(cls_prob)
     p_loc_pred = ib.buffer_ptr(loc_pred)
     p_anchor = ib.buffer_ptr(anchor)
-    p_valid_count = ib.buffer_ptr(valid_count)
     p_out = ib.buffer_ptr(out)
-    with ib.if_scope(tid < batch_size * num_anchors * num_classes):
-        n = tid / (num_anchors * num_classes)
-        j = (tid % (num_anchors * num_classes)) / num_anchors
-        i = tid % num_anchors
-        with ib.if_scope(j > 0):
-            temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \
-            p_cls_prob[tid]
-        p_valid_count[n] = 0
-    with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
-        score[tid] = -1.0
-        cls_id[tid] = 0
-        with ib.for_range(0, num_classes-1, name="k") as k:
-            temp = temp_score[tid * (num_classes-1) + k]
-            cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid])
-            score[tid] = tvm.make.Max(temp, score[tid])
-        with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)):
-            cls_id[tid] = 0
-        with ib.if_scope(cls_id[tid] > 0):
-            flag[tid] = 1
-        with ib.else_scope():
-            flag[tid] = 0
-    with ib.if_scope(tid < batch_size):
-        with ib.for_range(0, num_anchors, name="k") as k:
-            with ib.if_scope(k > 0):
-                flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1]
-        p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1]
+
     with ib.if_scope(tid < batch_size * num_anchors):
-        n = tid / num_anchors
-        i = tid % num_anchors
+        n = tid / num_anchors  # batch index
+        i = tid % num_anchors  # anchor index within the batch
         with ib.if_scope(cls_id[tid] > 0):
             with ib.if_scope(tid == 0):
                 out_base_idx = n * num_anchors * 6
@@ -253,17 +311,17 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw,
             p_out[out_base_idx] = cls_id[tid] - 1.0
             p_out[out_base_idx + 1] = score[tid]
             p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \
-            p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4,
-                                                    clip, variances[0], variances[1],
-                                                    variances[2], variances[3])
+                p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4,
+                                                        p_anchor, i*4, clip, variances[0],
+                                                        variances[1], variances[2], variances[3])
 
     body = ib.get()
     return body
 
 
 @multibox_transform_loc.register(["cuda", "gpu"])
-def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01,
-                               variances=(0.1, 0.1, 0.2, 0.2)):
+def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \
+                               threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)):
     """Location transformation for multibox detection
 
     Parameters
@@ -297,20 +355,42 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=
         1-D tensor with shape (batch_size,), number of valid anchor boxes.
     """
     batch_size = cls_prob.shape[0]
-    num_anchors = anchor.shape[1]
+    num_classes = cls_prob.shape[1]
+    num_anchors = cls_prob.shape[2]
     oshape = (batch_size, num_anchors, 6)
     # Define data alignment for intermediate buffer
     valid_count_dtype = "int32"
     valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype,
                                       "valid_count_buf", data_alignment=4)
-    out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8)
-    valid_count, out = \
-        tvm.extern([(batch_size,), oshape],
-                   [cls_prob, loc_pred, anchor],
+    out_buf = api.decl_buffer(
+        oshape, cls_prob.dtype, "out_buf", data_alignment=8)
+    size = num_anchors
+    temp_flag_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "flag", data_alignment=8)
+    temp_id_buf = api.decl_buffer(
+        (size,), valid_count_dtype, "cls_id", data_alignment=8)
+    temp_score_buf = api.decl_buffer(
+        (size,), cls_prob.dtype, "score", data_alignment=8)
+
+    valid_count, temp_flag, temp_id, temp_score = \
+        tvm.extern([(batch_size,), (size,), (size,), (size,)],
+                   [cls_prob],
+                   lambda ins, outs: transform_loc_pre(
+                       ins[0], outs[0], outs[1], outs[2], outs[3], threshold),
+                   dtype=[valid_count_dtype,
+                          valid_count_dtype, valid_count_dtype, cls_prob.dtype],
+                   out_buffers=[valid_count_buf,
+                                temp_flag_buf, temp_id_buf, temp_score_buf],
+                   tag="multibox_transform_loc_first_step")
+
+    out = \
+        tvm.extern([oshape],
+                   [loc_pred, anchor, temp_flag, temp_id, temp_score],
                    lambda ins, outs: transform_loc_ir(
-                       ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances),
-                   dtype=[valid_count_dtype, cls_prob.dtype],
-                   out_buffers=[valid_count_buf, out_buf],
+                       ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \
+                       variances, batch_size, num_classes, num_anchors),
+                   dtype=[cls_prob.dtype],
+                   out_buffers=[out_buf],
                    tag="multibox_transform_loc")
     return [out, valid_count]
 
@@ -356,5 +436,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01
     """
     inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor,
                                        clip, threshold, variances)
-    out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
+    out = nms(
+        inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk)
     return out

From 81b42bc3716b80e821651d3dcd6942a653e57e9f Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Tue, 14 Aug 2018 01:43:09 +0530
Subject: [PATCH 20/77] Split_indices negative axis added (#1595)

---
 topi/include/topi/transform.h                | 5 +++++
 topi/tests/python_cpp/test_topi_transform.py | 1 +
 2 files changed, 6 insertions(+)

diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 09af612b957b..245b38cfb63d 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -475,6 +475,11 @@ inline Array<Tensor> split_sections(const Tensor& x,
                            int axis,
                            std::string name = "tensor",
                            std::string tag = kInjective) {
+  if (axis < 0) {
+    axis += static_cast<int>(x->shape.size());
+  }
+  CHECK_LT(axis, x->shape.size()) << "axis out of bounds";
+
   auto src_axis_size = static_cast<int>(GetConstInt(x->shape[axis]));
 
   CHECK_GT(num_sections, 0) << "Slice count must be > 0";
diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py
index c8b7c3906caa..3f7bdbfdd499 100644
--- a/topi/tests/python_cpp/test_topi_transform.py
+++ b/topi/tests/python_cpp/test_topi_transform.py
@@ -340,6 +340,7 @@ def test_concatenate():
 
 def test_split():
     verify_split((2, 12, 3), 3, 1)
+    verify_split((2, 12, 3), 3, -1)
     verify_split((2, 12, 3), [2, 4], 1)
     verify_split((10, 12, 24), [5, 7, 9], -1)
 

From 84296f96327d190529220596e2cd2de5ff8ed39b Mon Sep 17 00:00:00 2001
From: Albin Joy <albin.joy@huawei.com>
Date: Tue, 14 Aug 2018 02:44:26 +0530
Subject: [PATCH 21/77] [FRONTEND][TENSORFLOW] Optimized tensorflow testcases
 (#1546)

* [NNVM][TENSORFLOW] Optimized tensorflow testcases

* Replace Constants with Placeholder

* Review comment fix
---
 .../frontend/tensorflow/test_forward.py       | 457 ++++++------------
 1 file changed, 136 insertions(+), 321 deletions(-)

diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 495852f9e5d6..64c57c126f8d 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -83,6 +83,34 @@ def run_tf_graph(sess, input_data, input_node, output_node):
     output_data = sess.run(tensor, input_dict)
     return output_data
 
+
+def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False):
+    """Run the session graph through both TensorFlow and TVM and assert the outputs match."""
+
+    out_node = out_name.split(':')[0] if ":" in out_name else out_name  # drop ':<index>' suffix
+
+    if isinstance(in_name, list):
+        in_node = [0]*len(in_name)
+        for i in range(len(in_name)):
+            in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i]
+    else:
+        in_node = in_name.split(':')[0] if ":" in in_name else in_name
+
+    with tf.Session() as sess:
+        if init_global_variables:
+            sess.run(variables.global_variables_initializer())
+        final_graph_def = tf.graph_util.convert_variables_to_constants(
+            sess,
+            sess.graph.as_graph_def(add_shapes=True),
+            [out_node],
+            )
+
+        tf_output = run_tf_graph(sess, in_data, in_name, out_name)
+        tvm_output = run_tvm_graph(final_graph_def, in_data,
+                                   in_node, tf_output.shape, tf_output.dtype)
+        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+        sess.close()  # NOTE(review): likely redundant inside the `with` block; confirm
+
+
 #######################################################################
 # Pooling
 # -------
@@ -93,31 +121,15 @@ def _test_pooling(input_shape, **kwargs):
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
-        # pylint: disable=unused-variable
-        pool = nn_ops.pool(in_data, **kwargs)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
+        nn_ops.pool(in_data, **kwargs)
 
         if kwargs['pooling_type'] == 'MAX':
-            out_node = 'max_pool'
             out_name = 'max_pool:0'
         else:
-            out_node = 'avg_pool'
             out_name = 'avg_pool:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
-
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-
-            sess.close()
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
 def test_forward_pooling():
     """ Pooling """
@@ -195,35 +207,19 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes,
     filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)]
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data_array, shape=tensor_in_sizes, dtype='float32')
+        in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32')
         in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32')
         strides = [1] + strides + [1]
         dilations = [1] + dilations + [1]
 
-        # pylint: disable=unused-variable
-        conv = nn_ops.conv2d(in_data,
-                             in_filter,
-                             strides=strides,
-                             padding=padding,
-                             data_format=data_format)
-        # pylint: enable=unused-variable
+        nn_ops.conv2d(in_data,
+                      in_filter,
+                      strides=strides,
+                      padding=padding,
+                      data_format=data_format)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Conv2D'],
-                )
-
-            tf_output = run_tf_graph(sess, np.reshape(data_array, tensor_in_sizes),
-                                     'Const:0', 'Conv2D:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       np.reshape(data_array, tensor_in_sizes).astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-
-            sess.close()
+        compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'),
+                            'Placeholder:0', 'Conv2D:0')
 
 def test_forward_convolution():
     _test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC')
@@ -239,28 +235,10 @@ def _test_reshape(data, out_shape):
     """ One iteration of reshape operation with given data and out shape """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
-        reshape_out = array_ops.reshape(in_data, out_shape)
-        # pylint: enable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
+        array_ops.reshape(in_data, out_shape)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Reshape'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Reshape:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Reshape:0')
 
 def test_forward_reshape():
     _test_reshape(np.arange(6.0), [2, 3])
@@ -279,31 +257,14 @@ def _test_squeeze(data, squeeze_dims=None):
         squeeze_dims = []
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
 
-        # pylint: disable=unused-variable
         if squeeze_dims:
-            squeeze_out = array_ops.squeeze(in_data, squeeze_dims)
+            array_ops.squeeze(in_data, squeeze_dims)
         else:
-            squeeze_out = array_ops.squeeze(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Squeeze'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Squeeze:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
+            array_ops.squeeze(in_data)
 
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Squeeze:0')
 
 def test_forward_squeeze():
     """ Squeeze """
@@ -336,28 +297,10 @@ def _test_concat_v2(data, dim):
     """ One iteration of ConcatV2 """
 
     with tf.Graph().as_default():
+        gen_array_ops._concat_v2(data, dim)
 
-        # pylint: disable=unused-variable
-        concat_out = gen_array_ops._concat_v2(data, dim)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ConcatV2'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], 'ConcatV2:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       ["ConcatV2/values_0", 'ConcatV2/values_1'],
-                                       tf_output.shape, tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm(data, ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'],
+                            'ConcatV2:0')
 
 def _test_forward_concat_v2():
     t1 = np.array([])
@@ -377,28 +320,10 @@ def _test_sigmoid(data):
     """ One iteration of sigmoid """
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
-
-        # pylint: disable=unused-variable
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         sigmoid_out = math_ops.sigmoid(in_data)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['Sigmoid'],
-                )
-
-            tf_output = run_tf_graph(sess, data,
-                                     'Const:0', 'Sigmoid:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-
-            sess.close()
+        compare_tf_with_tvm(data, 'Placeholder:0', 'Sigmoid:0')
 
 def test_forward_sigmoid():
     """ Sigmoid """
@@ -412,24 +337,10 @@ def test_forward_sigmoid():
 def _test_argx(func, data, **kwargs):
 
     with tf.Graph().as_default():
-        inp = constant_op.constant(data, shape=data.shape, dtype=data.dtype, name="c0")
-
-        # pylint: disable=unused-variable
-        out = func(inp, name="argx0", **kwargs)
-        # pylint: enable=unused-variable
-
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess=sess,
-                input_graph_def=sess.graph.as_graph_def(add_shapes=True),
-                output_node_names=["argx0"])
-
-            tf_output = run_tf_graph(sess, data, input_node="c0:0", output_node="argx0:0")
-            tvm_output = run_tvm_graph(graph_def, data, "c0", tf_output.shape, output_dtype='int32')
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="argx0", **kwargs, output_type=tf.int32)
 
-            sess.close()
+        compare_tf_with_tvm(data, 'c0:0', 'argx0:0')
 
 def test_argmin_argmax():
     for axis in [None,0,1,2]:
@@ -442,6 +353,8 @@ def test_argmin_argmax():
 # --------
 
 def _test_variable(data):
+    """ One iteration of a variable """
+
     tf.reset_default_graph()
     input_op = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
     input_tensor = array_ops.reshape(input_op, data.shape)
@@ -450,84 +363,15 @@ def _test_variable(data):
     with variable_scope.variable_scope("linear", reuse=None):
         w = variable_scope.get_variable(
             "w", shape=[size, size], dtype=input_tensor.dtype)
-    # pylint: disable=unused-variable
-    output_op = math_ops.matmul(input_tensor, w)
-    # pylint: enable=unused-variable
-
-    with tf.Session() as sess:
-        sess.run(variables.global_variables_initializer())
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['MatMul'],
-            )
-
-        tf_output = run_tf_graph(sess, data, 'Placeholder:0', 'MatMul:0')
-        tvm_output = run_tvm_graph(final_graph_def, data,
-                                   "Placeholder", tf_output.shape, data.dtype)
+    math_ops.matmul(input_tensor, w)
 
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(data, 'Placeholder:0', 'MatMul:0', init_global_variables=True)
 
 def test_forward_variable():
     """Variable type op test"""
     _test_variable(np.random.uniform(size=(32, 100)).astype('float32'))
 
 
-#######################################################################
-# LSTM
-# ----
-def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
-    tf.reset_default_graph()
-    input_size = num_hidden
-    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
-    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
-
-    def _get_tensorflow_output():
-        with tf.Session() as sess:
-            with variable_scope.variable_scope(
-                "root", initializer=init_ops.constant_initializer(0.5)):
-                m0 = array_ops.zeros([batch_size, num_hidden])
-                m1 = array_ops.zeros([batch_size, num_hidden])
-                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
-                g, ((out_m0, out_m1)) = \
-                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
-                                                  forget_bias=forget_bias)(x, ((m0, m1)))
-                sess.run([variables.global_variables_initializer()])
-                res = sess.run([g, out_m0, out_m1], {
-                    x.name: np.array([[1., 1.]]),
-                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
-                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
-                })
-            graph_def = sess.graph.as_graph_def(add_shapes=True)
-            final_graph_def = graph_util.convert_variables_to_constants(
-                sess,
-                graph_def,
-                ['root/lstm_cell/LSTMBlockCell'])
-            return final_graph_def, res
-
-    graph_def, tf_out = _get_tensorflow_output()
-    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
-                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
-                                'root/lstm_cell/LSTMBlockCell_h'],
-                               [tf_out[0].shape, (2, batch_size, num_hidden)],
-                               [tf_out[0].dtype, tf_out[1].dtype])
-
-    if isinstance(tvm_output, list):
-        out = tvm_output[0]
-        out_state = tvm_output[1]
-        out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
-        out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
-        out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
-        tvm_out = [out, out_state_c, out_state_h]
-        np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
-
-def test_forward_lstm():
-    '''test LSTM block cell'''
-    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
-
-
 #######################################################################
 # StridedSlice
 # ------------
@@ -535,6 +379,8 @@ def test_forward_lstm():
 def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                              begin_mask=0, end_mask=0, new_axis_mask=0,
                              shrink_axis_mask=0, ellipsis_mask=0):
+    """ One iteration of a Stridedslice """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     tf.strided_slice(in_data, begin, end, stride, begin_mask=begin_mask,
@@ -543,17 +389,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype,
                          ellipsis_mask=ellipsis_mask, name="strided_slice")
     np_data = np.random.uniform(size=ip_shape).astype(dtype)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['strided_slice'])
-        tf_output = run_tf_graph(sess, np_data,
-                                 'in_data:0', 'strided_slice:0')
-        tvm_output = run_tvm_graph(final_graph_def, np_data,
-                                   "in_data", tf_output.shape, np_data.dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm(np_data, 'in_data:0', 'strided_slice:0')
 
 def test_forward_stridedslice():
     '''test StridedSlice'''
@@ -586,6 +422,8 @@ def test_forward_stridedslice():
 # ------
 
 def _test_gather(ip_shape, indice_shape, indice_value, axis, dtype):
+    """ One iteration of a Gather """
+
     tf.reset_default_graph()
     in_data = tf.placeholder(dtype, ip_shape, name="in_data")
     indices = tf.placeholder("int32", indice_shape, name="indices")
@@ -601,17 +439,7 @@ def _fill_indices(indice_value):
         return indices
     np_indices = _fill_indices(indice_value)
 
-    with tf.Session() as sess:
-        final_graph_def = tf.graph_util.convert_variables_to_constants(
-            sess,
-            sess.graph.as_graph_def(add_shapes=True),
-            ['GatherV2'])
-        tf_output = run_tf_graph(sess, [np_data, np_indices], ['in_data:0',
-                                 'indices:0'], 'GatherV2:0')
-        tvm_output = run_tvm_graph(final_graph_def, [np_data, np_indices],
-                                   ['in_data', 'indices'], tf_output.shape, dtype)
-        np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5)
-        sess.close()
+    compare_tf_with_tvm([np_data, np_indices], ['in_data:0', 'indices:0'], 'GatherV2:0')
 
 def test_forward_gather():
     '''test gather layer'''
@@ -640,28 +468,11 @@ def test_forward_multi_input():
 
         out1 = tf.add(in1, in2, name='out1')
         out2 = tf.subtract(in3, in4, name='out2')
-
         out = tf.multiply(out1, out2, name='out')
+        in_data = np.arange(9, dtype='int32').reshape([3, 3])
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['out'],
-                )
-
-            in_data = np.arange(9, dtype='int32').reshape([3, 3])
-
-            tf_output = run_tf_graph(sess, [in_data, in_data, in_data, in_data ],
-                                     ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       [in_data, in_data, in_data, in_data ],
-                                       ['in1', 'in2', 'in3', 'in4'],
-                                       tf_output.shape, tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output)
-
-            sess.close()
+        compare_tf_with_tvm([in_data, in_data, in_data, in_data],
+                            ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0')
 
 #######################################################################
 # Resize Bilinear
@@ -674,36 +485,75 @@ def _test_resize_bilinear(in_shape, to_shape, align_corners):
     shape_data = np.array(to_shape).astype('int32')
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype)
+        in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
         shape_data = constant_op.constant(shape_data, shape=shape_data.shape, dtype=shape_data.dtype)
+        tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
 
-        # pylint: disable=unused-variable
-        resize_out = tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners)
-        # pylint: enable=unused-variable
+        compare_tf_with_tvm(data, 'Placeholder:0', 'ResizeBilinear:0')
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['ResizeBilinear'],
-                )
+def test_forward_resize_bilinear():
+    """ Resize Bilinear """
 
-            tf_output = run_tf_graph(sess, data,
-                    'Const:0', 'ResizeBilinear:0')
+    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
+    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
 
-            tvm_output = run_tvm_graph(graph_def,
-                                       data,
-                                       "Const", tf_output.shape, data.dtype)
 
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
+#######################################################################
+# LSTM
+# ----
 
-            sess.close()
+def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype):
+    """ One iteration of a LSTM cell """
 
-def test_forward_resize_bilinear():
-    """ Resize Bilinear """
+    tf.reset_default_graph()
+    input_size = num_hidden
+    input_data = np.full((batch_size, input_size), 1., dtype=dtype)
+    in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
+    in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype)
 
-    _test_resize_bilinear((4, 16, 32, 32), [50, 50], False)
-    _test_resize_bilinear((6, 32, 64, 64), [20, 20], True)
+    def _get_tensorflow_output():
+        with tf.Session() as sess:
+            with variable_scope.variable_scope(
+                "root", initializer=init_ops.constant_initializer(0.5)):
+                m0 = array_ops.zeros([batch_size, num_hidden])
+                m1 = array_ops.zeros([batch_size, num_hidden])
+                x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype)
+                g, ((out_m0, out_m1)) = \
+                     tf.contrib.rnn.LSTMBlockCell(num_hidden,
+                                                  forget_bias=forget_bias)(x, ((m0, m1)))
+                sess.run([variables.global_variables_initializer()])
+                res = sess.run([g, out_m0, out_m1], {
+                    x.name: np.array([[1., 1.]]),
+                    m0.name: 0.1 * np.ones([batch_size, num_hidden]),
+                    m1.name: 0.1 * np.ones([batch_size, num_hidden]),
+                })
+            graph_def = sess.graph.as_graph_def(add_shapes=True)
+            final_graph_def = graph_util.convert_variables_to_constants(
+                sess,
+                graph_def,
+                ['root/lstm_cell/LSTMBlockCell'])
+            return final_graph_def, res
+
+    graph_def, tf_out = _get_tensorflow_output()
+    tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h],
+                               ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c',
+                                'root/lstm_cell/LSTMBlockCell_h'],
+                               [tf_out[0].shape, (2, batch_size, num_hidden)],
+                               [tf_out[0].dtype, tf_out[1].dtype])
+    assert isinstance(tvm_output, list)
+
+    out = tvm_output[0]
+    out_state = tvm_output[1]
+    out_state_tup = np.split(out_state, indices_or_sections=2, axis=0)
+    out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden))
+    out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden))
+    tvm_out = [out, out_state_c, out_state_h]
+    np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3)
+
+def test_forward_lstm():
+    '''test LSTM block cell'''
+
+    _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
 #######################################################################
 # Pad
@@ -714,30 +564,17 @@ def _test_pad(input_shape, paddings, mode, **kwargs):
     x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape)
 
     with tf.Graph().as_default():
-        in_data = constant_op.constant(x, shape=input_shape, dtype='float32')
+        in_data = array_ops.placeholder(shape=input_shape, dtype='float32')
         pad_values = constant_op.constant(paddings)
         pad = tf.pad(in_data, paddings=pad_values, mode=mode, **kwargs)
 
         if mode == 'CONSTANT':
             if 'constant_values' in kwargs:
-                out_node = 'PadV2'
                 out_name = 'PadV2:0'
             else:
-                out_node = 'Pad'
                 out_name = 'Pad:0'
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                [out_node],
-                )
-
-            tf_output = run_tf_graph(sess, x, 'Const:0', out_name)
-            tvm_output = run_tvm_graph(graph_def, x.astype('float32'),
-                                       "Const", tf_output.shape, 'float32')
-            np.testing.assert_allclose(tf_output, tvm_output)
-            sess.close()
+        compare_tf_with_tvm(x, 'Placeholder:0', out_name)
 
 def test_forward_pad():
     """ Pad """
@@ -944,17 +781,7 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta):
                                             alpha=alpha,
                                             beta=beta)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['lrn'],)
-            tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "lrn0_data", tf_output.shape, tf_output.dtype)
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'lrn0_data:0', 'lrn:0')
 
 def test_forward_lrn():
     _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -962,38 +789,26 @@ def test_forward_lrn():
 #######################################################################
 # l2_normalize
 # ------------
+
 def _test_l2_normalize(ishape, eps, axis):
     """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)"""
 
     inp_array = np.random.uniform(size=ishape).astype(np.float32)
-    inp_array.fill(1)
 
     with tf.Graph().as_default():
-        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder")
+        in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype)
         nn.l2_normalize(in1,
                         axis=axis,
                         epsilon=eps,
                         name=None,
                         dim=None)
 
-        with tf.Session() as sess:
-            graph_def = tf.graph_util.convert_variables_to_constants(
-                sess,
-                sess.graph.as_graph_def(add_shapes=True),
-                ['l2_normalize'],
-                )
-            tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0')
-            tvm_output = run_tvm_graph(graph_def,
-                                       inp_array,
-                                       "Placeholder",
-                                       tf_output.shape,
-                                       tf_output.dtype)
-
-            np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3)
-            sess.close()
+        compare_tf_with_tvm(inp_array, 'Placeholder:0', 'l2_normalize:0')
+
 def test_forward_l2_normalize():
     _test_l2_normalize((1, 3, 20, 20), 0.001, (0,))
 
+
 #######################################################################
 # Main
 # ----
@@ -1011,7 +826,7 @@ def test_forward_l2_normalize():
     test_forward_mobilenet()
     test_forward_variable()
     test_forward_resize_bilinear()
-    test_forward_pad()    
+    test_forward_pad()
     test_forward_lstm()
     test_forward_stridedslice()
     test_forward_gather()

From daa2569999d1afd5fec4501288e055f683ed5312 Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 14 Aug 2018 21:28:34 +0530
Subject: [PATCH 22/77] [NNVM][DOC] Update NNVM symbol documentation to latest.
 Ref. 1591 (#1599)

---
 docs/nnvm_top.rst | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 96a37b779e1e..927990647a69 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -29,6 +29,7 @@ This level enables fully connected multi-layer perceptron.
 
    nnvm.symbol.dense
    nnvm.symbol.relu
+   nnvm.symbol.prelu
    nnvm.symbol.tanh
    nnvm.symbol.sigmoid
    nnvm.symbol.exp
@@ -39,6 +40,8 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.elemwise_mul
    nnvm.symbol.elemwise_div
    nnvm.symbol.elemwise_sum
+   nnvm.symbol.elemwise_mod
+   nnvm.symbol.elemwise_pow
    nnvm.symbol.flatten
    nnvm.symbol.concatenate
    nnvm.symbol.expand_dims
@@ -50,6 +53,14 @@ This level enables fully connected multi-layer perceptron.
    nnvm.symbol.log_softmax
    nnvm.symbol.pad
    nnvm.symbol.block_grad
+   nnvm.symbol.matmul
+   nnvm.symbol.resize
+   nnvm.symbol.upsampling
+   nnvm.symbol.take
+   nnvm.symbol.l2_normalize
+   nnvm.symbol.flip
+   nnvm.symbol.lrn
+   nnvm.symbol.where
 
 
 **Level 2: Convolutions**
@@ -92,6 +103,7 @@ This level enables typical convnet models.
    nnvm.symbol.__lshift_scalar__
    nnvm.symbol.__rshift_scalar__
 
+
 **Level 4: Broadcast and Reductions**
 
 .. autosummary::
@@ -117,11 +129,43 @@ This level enables typical convnet models.
    nnvm.symbol.ones_like
    nnvm.symbol.zeros
    nnvm.symbol.zeros_like
+   nnvm.symbol.slice_like
+   nnvm.symbol.strided_slice
+   nnvm.symbol.argmax
+   nnvm.symbol.argmin
+   nnvm.symbol.collapse_sum
+   nnvm.symbol.broadcast_equal
+   nnvm.symbol.broadcast_greater_equal
+   nnvm.symbol.broadcast_greater_equal
+   nnvm.symbol.broadcast_greater
+   nnvm.symbol.broadcast_left_shift
+   nnvm.symbol.broadcast_less_equal
+   nnvm.symbol.broadcast_less_equal
+   nnvm.symbol.broadcast_less
+   nnvm.symbol.broadcast_max
+   nnvm.symbol.broadcast_min
+   nnvm.symbol.broadcast_mod
+   nnvm.symbol.broadcast_not_equal
+   nnvm.symbol.broadcast_pow
+   nnvm.symbol.broadcast_right_shift
+
+
+**Level 5: Vision Operators**
+
+.. autosummary::
+   :nosignatures:
+
+   nnvm.symbol.multibox_prior
+   nnvm.symbol.multibox_transform_loc
+   nnvm.symbol.nms
+   nnvm.symbol.yolo_region
+   nnvm.symbol.yolo_reorg
 
 Detailed Definitions
 --------------------
 .. autofunction:: nnvm.symbol.dense
 .. autofunction:: nnvm.symbol.relu
+.. autofunction:: nnvm.symbol.prelu
 .. autofunction:: nnvm.symbol.tanh
 .. autofunction:: nnvm.symbol.sigmoid
 .. autofunction:: nnvm.symbol.exp
@@ -132,6 +176,8 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.elemwise_mul
 .. autofunction:: nnvm.symbol.elemwise_div
 .. autofunction:: nnvm.symbol.elemwise_sum
+.. autofunction:: nnvm.symbol.elemwise_mod
+.. autofunction:: nnvm.symbol.elemwise_pow
 .. autofunction:: nnvm.symbol.flatten
 .. autofunction:: nnvm.symbol.concatenate
 .. autofunction:: nnvm.symbol.expand_dims
@@ -143,6 +189,14 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.log_softmax
 .. autofunction:: nnvm.symbol.pad
 .. autofunction:: nnvm.symbol.block_grad
+.. autofunction:: nnvm.symbol.matmul
+.. autofunction:: nnvm.symbol.resize
+.. autofunction:: nnvm.symbol.upsampling
+.. autofunction:: nnvm.symbol.take
+.. autofunction:: nnvm.symbol.l2_normalize
+.. autofunction:: nnvm.symbol.flip
+.. autofunction:: nnvm.symbol.lrn
+.. autofunction:: nnvm.symbol.where
 
 .. autofunction:: nnvm.symbol.conv2d
 .. autofunction:: nnvm.symbol.conv2d_transpose
@@ -191,3 +245,28 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.ones_like
 .. autofunction:: nnvm.symbol.zeros
 .. autofunction:: nnvm.symbol.zeros_like
+.. autofunction:: nnvm.symbol.slice_like
+.. autofunction:: nnvm.symbol.strided_slice
+.. autofunction:: nnvm.symbol.argmax
+.. autofunction:: nnvm.symbol.argmin
+.. autofunction:: nnvm.symbol.collapse_sum
+.. autofunction:: nnvm.symbol.broadcast_equal
+.. autofunction:: nnvm.symbol.broadcast_greater_equal
+.. autofunction:: nnvm.symbol.broadcast_greater_equal
+.. autofunction:: nnvm.symbol.broadcast_greater
+.. autofunction:: nnvm.symbol.broadcast_left_shift
+.. autofunction:: nnvm.symbol.broadcast_less_equal
+.. autofunction:: nnvm.symbol.broadcast_less_equal
+.. autofunction:: nnvm.symbol.broadcast_less
+.. autofunction:: nnvm.symbol.broadcast_max
+.. autofunction:: nnvm.symbol.broadcast_min
+.. autofunction:: nnvm.symbol.broadcast_mod
+.. autofunction:: nnvm.symbol.broadcast_not_equal
+.. autofunction:: nnvm.symbol.broadcast_pow
+.. autofunction:: nnvm.symbol.broadcast_right_shift
+
+.. autofunction:: nnvm.symbol.multibox_prior
+.. autofunction:: nnvm.symbol.multibox_transform_loc
+.. autofunction:: nnvm.symbol.nms
+.. autofunction:: nnvm.symbol.yolo_region
+.. autofunction:: nnvm.symbol.yolo_reorg

From bc7431bda67e49a6fbfb06a2acfc1899c456e2ba Mon Sep 17 00:00:00 2001
From: Siva <sivar.b@huawei.com>
Date: Tue, 14 Aug 2018 22:14:33 +0530
Subject: [PATCH 23/77] [NNVM][POOL] bug fix. Remove the hardcode. (#1600)

---
 nnvm/src/top/nn/pooling.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc
index cccd5b1c710b..8b9b7a64aa0d 100644
--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -77,7 +77,7 @@ inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs,
   } else {
     oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] +
                     param.strides[0] - 1) / param.strides[0]) + 1;
-    oshape[widx] = ((dshape[3] + pad_w - param.pool_size[1] +
+    oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] +
                     param.strides[1] - 1) / param.strides[1]) + 1;
   }
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape);

From 54a115ef14fb6dabbf6ea8eb9e6dd85846030c72 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Tue, 14 Aug 2018 23:18:02 +0530
Subject: [PATCH 24/77] [FRONTEND][DARKNET]LSTM and GRU support (#1576)

---
 nnvm/python/nnvm/frontend/darknet.py          | 140 ++++++++++++++++--
 nnvm/python/nnvm/testing/darknet.py           |   3 +
 .../python/frontend/darknet/test_forward.py   |  44 +++++-
 3 files changed, 175 insertions(+), 12 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 3a197a416219..3aa36b7e7ef9 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -412,7 +412,12 @@ def __init__(self, net, dtype='float32'):
         self._sym_array = {}
         self._tvmparams = {}
         self._outs = []
-        self._rnn_state_ctr = 0
+        self._state_ctr = {}
+        self._state_ctr['rnn'] = 0
+        self._state_ctr['crnn'] = 0
+        self._state_ctr['lstm'] = 0
+        self._state_ctr['cell_state'] = 0
+        self._state_ctr['gru'] = 0
 
     def _read_memory_buffer(self, shape, data):
         length = 1
@@ -623,16 +628,16 @@ def _get_opname(self, layer):
         """Returs the layer name."""
         return layer.type
 
-    def _new_rnn_state_sym(self, state=None):
+    def _new_rnn_state_sym(self, state=None, name='rnn'):
         """Returs a symbol for state"""
-        name = "rnn%d_state" % (self._rnn_state_ctr)
-        self._rnn_state_ctr += 1
-        return _sym.Variable(name=name, init=state)
+        sym_name = name + "%d_state" % self._state_ctr[name]
+        self._state_ctr[name] += 1
+        return _sym.Variable(name=sym_name, init=state)
 
-    def _get_rnn_state_buffer(self, layer):
+    def _get_rnn_state_buffer(self, layer, name):
         """Get the state buffer for rnn."""
         buffer = np.zeros((1, layer.outputs), self.dtype)
-        return self._new_rnn_state_sym(buffer)
+        return self._new_rnn_state_sym(buffer, name)
 
     def _get_darknet_rnn_attrs(self, layer, sym):
         """Get the rnn converted symbol from attributes."""
@@ -653,7 +658,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'rnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -678,7 +683,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             attr.update({'batch' : layer.batch})
             attr.update({'num_hidden' : str(layer.outputs)})
 
-            state = self._get_rnn_state_buffer(layer)
+            state = self._get_rnn_state_buffer(layer, 'crnn')
 
             for _ in range(layer.steps):
                 input_layer = layer.input_layer
@@ -698,6 +703,123 @@ def _handle_darknet_rnn_layers(self, layer_num, sym):
             self._sym_array[layer_num] = sym
             processed = True
 
+        elif LAYERTYPE.LSTM == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step LSTM")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            h_state = self._get_rnn_state_buffer(layer, 'lstm')
+            c_state = self._get_rnn_state_buffer(layer, 'cell_state')
+            for _ in range(layer.steps):
+                sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state)
+                sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state)
+                sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state)
+                sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state)
+
+                input_sym = sym
+                sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym)
+                sym_ui = self._get_darknet_rnn_attrs(layer.ui, input_sym)
+                sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym)
+                sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym)
+
+                new_inputs = _as_list([sym_wf, sym_uf])
+                add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wi, sym_ui])
+                add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wg, sym_ug])
+                add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_wo, sym_uo])
+                add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_f, _ = _darknet_activations(_as_list(add_f), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_i, _ = _darknet_activations(_as_list(add_i), act_attr)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                act_g, _ = _darknet_activations(_as_list(add_g), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_o, _ = _darknet_activations(_as_list(add_o), act_attr)
+
+                new_inputs = _as_list([act_i, act_g])
+                mul_t = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([act_f, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([mul_t, c_state])
+                c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.TANH
+                h_state, _ = _darknet_activations(_as_list(c_state), act_attr)
+
+                new_inputs = _as_list([act_o, h_state])
+                h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+                self._outs = self._outs + [c_state, h_state]
+                sym = h_state
+            self._sym_array[layer_num] = sym
+            processed = True
+
+        elif LAYERTYPE.GRU == layer.type:
+            if layer.steps > 1:
+                raise NotImplementedError("Currently support only single step GRU")
+
+            op_name_add = 'elemwise_add'
+            op_name_mul = 'elemwise_mul'
+            attrs = {}
+            act_attr = {}
+
+            state = self._get_rnn_state_buffer(layer, "gru")
+            for _ in range(layer.steps):
+                sym_wz = self._get_darknet_rnn_attrs(layer.wz, state)
+                sym_wr = self._get_darknet_rnn_attrs(layer.wr, state)
+
+                input_sym = sym
+                sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym)
+                sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym)
+                sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym)
+
+                new_inputs = _as_list([sym_uz, sym_wz])
+                add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                new_inputs = _as_list([sym_ur, sym_wr])
+                add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_z, _ = _darknet_activations(_as_list(add_z), act_attr)
+
+                act_attr['activation'] = ACTIVATION.LOGISTIC
+                act_r, _ = _darknet_activations(_as_list(add_r), act_attr)
+
+                new_inputs = _as_list([act_r, state])
+                forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs)
+
+                sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot)
+
+                new_inputs = _as_list([sym_uh, sym_wh])
+                h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs)
+
+                if layer.tanh == 1:
+                    act_attr['activation'] = ACTIVATION.TANH
+                else:
+                    act_attr['activation'] = ACTIVATION.LOGISTIC
+                h_state, _ = _darknet_activations(_as_list(h_state), act_attr)
+
+                sym = act_z * state + (1 - act_z) * h_state
+
+                self._outs = self._outs + [sym]
+            self._sym_array[layer_num] = sym
+            processed = True
+
         return processed, sym
 
     def from_darknet(self):
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index 362fd3058954..e3d110e9605e 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -491,6 +491,9 @@ class ACTIVATION(object):
 layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
 layer make_softmax_layer(int batch, int inputs, int groups);
 layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
+layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 void free_network(network *net);
 """
                    )
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index e68aed085664..5fc71a86211e 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -306,7 +306,7 @@ def test_forward_softmax_temperature():
     LIB.free_network(net)
 
 def test_forward_rnn():
-    '''test softmax layer'''
+    '''test RNN layer'''
     net = LIB.make_network(1)
     batch = 1
     inputs = 256
@@ -325,7 +325,7 @@ def test_forward_rnn():
     LIB.free_network(net)
 
 def test_forward_crnn():
-    '''test softmax layer'''
+    '''test CRNN layer'''
     net = LIB.make_network(1)
     batch = 1
     c = 3
@@ -349,6 +349,42 @@ def test_forward_crnn():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_lstm():
+    '''test LSTM layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
+def test_forward_gru():
+    '''test GRU layer'''
+    net = LIB.make_network(1)
+    batch = 1
+    inputs = 256
+    outputs = 256
+    steps = 1
+    batch_normalize = 0
+    adam = 0
+    layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam)
+    net.layers[0] = layer_1
+    net.inputs = inputs
+    net.outputs = outputs
+    net.w = net.h = 0
+    LIB.resize_network(net, net.w, net.h)
+    test_rnn_forward(net)
+    LIB.free_network(net)
+
 def test_forward_activation_logistic():
     '''test logistic activation layer'''
     net = LIB.make_network(1)
@@ -395,4 +431,6 @@ def test_forward_activation_logistic():
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()
-    test_forward_activation_logistic()
\ No newline at end of file
+    test_forward_lstm()
+    test_forward_gru()
+    test_forward_activation_logistic()

From 7751a6babc1e0bcd6d94560530a78fb0579fc9f1 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 15 Aug 2018 15:34:05 -0700
Subject: [PATCH 25/77] [AUTOTVM] Fix GATuner and improve error message (#1605)

---
 include/tvm/operation.h                       |  2 ++
 python/tvm/autotvm/measure/measure_methods.py |  2 ++
 python/tvm/autotvm/task/nnvm_integration.py   | 15 ++++++++++++++-
 python/tvm/autotvm/task/task.py               |  2 +-
 python/tvm/autotvm/task/topi_integration.py   |  2 +-
 python/tvm/autotvm/tuner/callback.py          |  8 +++++++-
 python/tvm/autotvm/tuner/ga_tuner.py          | 14 ++++++++++----
 7 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index d13680531af9..ed8be6e4a7c0 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -366,6 +366,8 @@ class ExternOpNode : public OperationNode {
     v->Visit("tag", &tag);
     v->Visit("attrs", &attrs);
     v->Visit("inputs", &inputs);
+    v->Visit("input_placeholders", &input_placeholders);
+    v->Visit("output_placeholders", &output_placeholders);
     v->Visit("body", &body);
   }
   EXPORT static Operation make(std::string name,
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index e192ee26ee3e..d845cc1f88fd 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -394,6 +394,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
             msg = str(exc)
             if "Stack trace returned" in msg:
                 msg = msg[:msg.index("Stack trace returned")]
+            if "CUDA Source" in msg:
+                msg = msg[:msg.index("CUDA Source")]
             costs = (RuntimeError(msg),)
             errno = MeasureErrorNo.RUNTIME_DEVICE
         tstamp = time.time()
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 338b46784a75..1b50869fc378 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -4,12 +4,16 @@
 
 """
 import warnings
+import logging
+
 
 from ... import tensor, placeholder, target as _target
 
 from ..util import get_const_tuple
 from .task import create, register
+from .dispatcher import ApplyHistoryBest
 
+logger = logging.getLogger('autotvm')
 
 def serialize_args(args):
     """serialize arguments of a topi function to a hashable tuple.
@@ -176,8 +180,17 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
 
     # run compiler to collect all TOPI calls during compilation
     env.reset()
+
+    # disable logger temporarily
+    old_state = logger.disabled
+    logger.disabled = True
+
+    # use a dummy target to do a fake compile for collecting topi calls
     dummy_target = _target.create("opencl -device=dummy")
-    nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+    with ApplyHistoryBest([], allow_fallback=True):
+        nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+
+    logger.disabled = old_state
 
     tasks = []
     for task_name, args in env.get_tasks():
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index 7a386f1f9e67..f8923fca56e3 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -368,7 +368,7 @@ def traverse(ops):
                 pass
             else:
                 raise FlopCalculationError("Only support tvm.compute currently. "
-                                           "Other ops like tvm.scan is not supported")
+                                           "Other ops like tvm.scan/tvm.extern is not supported")
         return ret
 
     try:
diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 012ca4a214e9..18f45f8d6708 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -62,7 +62,7 @@ def _decorator(f):
         for target_key in targets:
             if target_key not in _REGISTED_DISPATHCER:
                 _REGISTED_DISPATHCER[target_key] = {}
-            if topi_compute not in _REGISTED_DISPATHCER:
+            if topi_compute not in _REGISTED_DISPATHCER[target_key]:
                 @topi_compute.register(target_key)
                 @dispatcher
                 def config_dispatcher(*args, **kwargs):
diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py
index 15d5ac1c9689..6f66871f671c 100644
--- a/python/tvm/autotvm/tuner/callback.py
+++ b/python/tvm/autotvm/tuner/callback.py
@@ -101,11 +101,17 @@ def __init__(self):
             self.total = total
 
         def __del__(self):
-            sys.stdout.write(' Done.\n')
+            if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+                sys.stdout.write(' Done.\n')
 
     ctx = _Context()
     tic = time.time()
 
+    if logger.level < logging.DEBUG:  # only print progress bar in non-debug mode
+        sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) '
+                         '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic))
+        sys.stdout.flush()
+
     def _callback(tuner, inputs, results):
         ctx.ct += len(inputs)
 
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index 916bd4ee68c6..b92737ed5317 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -47,6 +47,7 @@ def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
 
         # random initialization
         self.pop_size = min(self.pop_size, len(self.space))
+        self.elite_num = min(self.pop_size, self.elite_num)
         for _ in range(self.pop_size):
             tmp_gene = point2knob(np.random.randint(len(self.space)), self.dims)
             while knob2point(tmp_gene, self.dims) in self.visited:
@@ -70,9 +71,9 @@ def update(self, inputs, results):
                 y = inp.task.flop / np.mean(res.costs)
                 self.scores.append(y)
             else:
-                self.scores.append(0)
+                self.scores.append(0.0)
 
-        if len(self.scores) >= len(self.genes):
+        if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space):
             genes = self.genes + self.elites
             scores = np.array(self.scores[:len(self.genes)] + self.elite_scores)
 
@@ -85,8 +86,13 @@ def update(self, inputs, results):
 
             # cross over
             indices = np.arange(len(genes))
-            scores /= np.max(scores)
-            probs = scores / np.sum(scores)
+            max_score = np.max(scores)
+            if max_score < 1e-8:
+                probs = np.empty_like(scores)
+                probs[:] = 1.0 / len(scores)
+            else:
+                scores /= max_score
+                probs = scores / np.sum(scores)
             tmp_genes = []
             for _ in range(self.pop_size):
                 p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)

From a80356bbe0e857d674d3dbe0f0cfbec43ba5c950 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 15 Aug 2018 15:42:24 -0700
Subject: [PATCH 26/77] [NNVM] Add symbol for inception v3 (#1604)

---
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/inception_v3.py      | 255 ++++++++++++++++++
 nnvm/python/nnvm/testing/squeezenet.py        |   2 +-
 nnvm/src/compiler/graph_hash.cc               |   2 +-
 .../frontend/mxnet/model_zoo/__init__.py      |   9 +-
 .../frontend/mxnet/model_zoo/inception_v3.py  | 170 ++++++++++++
 .../tests/python/frontend/mxnet/test_graph.py |  11 +-
 7 files changed, 442 insertions(+), 8 deletions(-)
 create mode 100644 nnvm/python/nnvm/testing/inception_v3.py
 create mode 100644 nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py

diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index bff828d68280..4a879047ec7e 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -8,6 +8,7 @@
 from . import resnet
 from . import vgg
 from . import squeezenet
+from . import inception_v3
 from . import dcgan
 from . import dqn
 from . import yolo2_detection
diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py
new file mode 100644
index 000000000000..f14daa1ae656
--- /dev/null
+++ b/nnvm/python/nnvm/testing/inception_v3.py
@@ -0,0 +1,255 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision."
+arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+# pylint: disable=invalid-name,missing-docstring,unused-argument
+from .. import symbol as sym
+from .utils import create_workload
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel,
+                      strides=stride, padding=pad, use_bias=False,
+                      name='%s%s_conv2d' % (name, suffix))
+    bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5)
+    act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix))
+    return act
+
+def Pooling(data, kernel, stride, pad, pool_type, name):
+    if pool_type == 'max':
+        return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name)
+    elif pool_type == 'avg':
+        return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name,
+                              count_include_pad=True)
+    else:
+        raise ValueError("Invalid pooling type: " + pool_type)
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name),
+                     suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+
+    cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv')
+    concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1),
+                      name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                      name=('%s_tower' % name), suffix='_conv_2')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max",
+                      name=('max_pool_%s_pool' % name))
+    concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0),
+                    name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3),
+                    name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1),
+                 name=('%s_tower_2' % name), suffix='_conv')
+    # concat
+    concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name),
+                     suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2),
+                     name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0),
+                        name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2),
+                        name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0),
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling],
+                             name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1),
+                      name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0),
+                      name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name),
+                        suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1),
+                        name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0),
+                          name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool,
+                      name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name),
+                 suffix='_conv')
+    # concat
+    concat = sym.concatenate(
+        *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj],
+        name='ch_concat_%s_chconcat' % name)
+    return concat
+
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                   name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0),
+                    name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0),
+                   name="global_pool")
+    flatten = sym.flatten(data=pool, name="flatten")
+    fc1 = sym.dense(data=flatten, units=num_classes, name='fc1')
+    softmax = sym.softmax(data=fc1, name='softmax')
+    return softmax
+
+def get_workload(batch_size=1, num_classes=1000,
+                 image_shape=(3, 299, 299), dtype="float32", **kwargs):
+    """Get benchmark workload for InceptionV3
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    image_shape : tuple, optional
+        The input image shape
+
+    dtype : str, optional
+        The data type
+
+    kwargs : dict
+        Extra arguments
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    net = get_symbol(num_classes=num_classes, **kwargs)
+    return create_workload(net, batch_size, image_shape, dtype)
diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py
index a445e8cfb7da..eab2cf06fee6 100644
--- a/nnvm/python/nnvm/testing/squeezenet.py
+++ b/nnvm/python/nnvm/testing/squeezenet.py
@@ -98,7 +98,7 @@ def get_symbol(num_classes, version, **kwargs):
 
 def get_workload(batch_size=1, num_classes=1000, version='1.0',
                  image_shape=(3, 224, 224), dtype="float32", **kwargs):
-    """Get benchmark workload for resnet
+    """Get benchmark workload for SqueezeNet
 
     Parameters
     ----------
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index d881130f72cc..ccd2e3ce433f 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -125,7 +125,7 @@ std::string GraphDeepCompare(const Graph& a,
   const IndexedGraph& idxb = b.indexed_graph();
   std::ostringstream err;
   if (idxa.num_nodes() != idxb.num_nodes()) {
-    err << "Number of nodes mismatch";
+    err << "Number of nodes mismatch (" <<  idxa.num_nodes() << " v.s " << idxb.num_nodes() << ")";
     return err.str();
   }
   if (idxa.num_node_entries() != idxb.num_node_entries()) {
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
index e3c9acdf23ef..66e743ad9c33 100644
--- a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py
@@ -1,11 +1,8 @@
 """MXNet and NNVM model zoo."""
 from __future__ import absolute_import
-from . import mlp, resnet, vgg, dqn, dcgan, squeezenet
+from . import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3
 import nnvm.testing
 
-__all__ = ['mx_mlp', 'nnvm_mlp', 'mx_resnet', 'nnvm_resnet', 'mx_vgg', 'nnvm_vgg',
-           'mx_squeezenet', 'nnvm_squeezenet']
-
 _num_class = 1000
 
 # mlp fc
@@ -35,6 +32,10 @@
     mx_squeezenet[version] = squeezenet.get_symbol(version=version)
     nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0]
 
+# inception
+mx_inception_v3 = inception_v3.get_symbol()
+nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0]
+
 # dqn
 mx_dqn = dqn.get_symbol()
 nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0]
diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
new file mode 100644
index 000000000000..b8585bf05037
--- /dev/null
+++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py
@@ -0,0 +1,170 @@
+"""
+Inception V3, suitable for images with around 299 x 299
+
+Reference:
+Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015).
+
+Adopted from https://github.com/apache/incubator-mxnet/blob/
+             master/example/image-classification/symbols/inception-v3.py
+"""
+import mxnet as mx
+import numpy as np
+
+def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''):
+    conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix))
+    bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix))
+    act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix))
+    return act
+
+
+def Inception7A(data,
+                num_1x1,
+                num_3x3_red, num_3x3_1, num_3x3_2,
+                num_5x5_red, num_5x5,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name))
+    tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv')
+    tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(pooling, proj, name=('%s_tower_2' %  name), suffix='_conv')
+    concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+# First Downsample
+def Inception7B(data,
+                num_3x3,
+                num_d3x3_red, num_d3x3_1, num_d3x3_2,
+                pool,
+                name):
+    tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name))
+    tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name))
+    concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7C(data,
+                num_1x1,
+                num_d7_red, num_d7_1, num_d7_2,
+                num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3')
+    tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7D(data,
+                num_3x3_red, num_3x3,
+                num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3,
+                pool,
+                name):
+    tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2')
+    tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    # concat
+    concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def Inception7E(data,
+                num_1x1,
+                num_d3_red, num_d3_1, num_d3_2,
+                num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2,
+                pool, proj,
+                name):
+    tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name))
+    tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv')
+    tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv')
+    tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1')
+    tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv')
+    tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1')
+    tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv')
+    tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1')
+    pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name)))
+    cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' %  name), suffix='_conv')
+    # concat
+    concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name)
+    return concat
+
+def get_symbol(num_classes=1000, **kwargs):
+    data = mx.sym.Variable(name="data")
+    # stage 1
+    conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv")
+    conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1")
+    conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2")
+    pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool")
+    # stage 2
+    conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3")
+    conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4")
+    pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1")
+
+    # stage 3
+    in3a = Inception7A(pool1, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 32, "mixed")
+    in3b = Inception7A(in3a, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_1")
+    in3c = Inception7A(in3b, 64,
+                       64, 96, 96,
+                       48, 64,
+                       "avg", 64, "mixed_2")
+    in3d = Inception7B(in3c, 384,
+                       64, 96, 96,
+                       "max", "mixed_3")
+    # stage 4
+    in4a = Inception7C(in3d, 192,
+                       128, 128, 192,
+                       128, 128, 128, 128, 192,
+                       "avg", 192, "mixed_4")
+    in4b = Inception7C(in4a, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_5")
+    in4c = Inception7C(in4b, 192,
+                       160, 160, 192,
+                       160, 160, 160, 160, 192,
+                       "avg", 192, "mixed_6")
+    in4d = Inception7C(in4c, 192,
+                       192, 192, 192,
+                       192, 192, 192, 192, 192,
+                       "avg", 192, "mixed_7")
+    in4e = Inception7D(in4d, 192, 320,
+                       192, 192, 192, 192,
+                       "max", "mixed_8")
+    # stage 5
+    in5a = Inception7E(in4e, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "avg", 192, "mixed_9")
+    in5b = Inception7E(in5a, 320,
+                       384, 384, 384,
+                       448, 384, 384, 384,
+                       "max", 192, "mixed_10")
+    # pool
+    pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool")
+    flatten = mx.sym.Flatten(data=pool, name="flatten")
+    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False)
+    softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax')
+    return softmax
diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py
index 18e124ad6ffc..e89224cd969e 100644
--- a/nnvm/tests/python/frontend/mxnet/test_graph.py
+++ b/nnvm/tests/python/frontend/mxnet/test_graph.py
@@ -39,17 +39,23 @@ def test_squeezenet():
         nnvm_sym = model_zoo.nnvm_squeezenet[version]
         compare_graph(from_mx_sym, nnvm_sym)
 
+def test_inception_v3():
+    mx_sym = model_zoo.mx_inception_v3
+    from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
+    nnvm_sym = model_zoo.nnvm_inception_v3
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299))
+
 def test_dqn():
     mx_sym = model_zoo.mx_dqn
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dqn
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84))
 
 def test_dcgan():
     mx_sym = model_zoo.mx_dcgan
     from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym)
     nnvm_sym = model_zoo.nnvm_dcgan
-    compare_graph(from_mx_sym, nnvm_sym)
+    compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100))
 
 def test_multi_outputs():
     def compose(F, **kwargs):
@@ -70,3 +76,4 @@ def compose(F, **kwargs):
     test_dqn()
     test_dcgan()
     test_squeezenet()
+    test_inception_v3()

From 9bcc317371f709e0f455b8a147511a203d38c71e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 15 Aug 2018 23:21:05 -0700
Subject: [PATCH 27/77] [TEAM] New reviewer: kevinthesun (#1606)

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 2d571ba668ea..9db50b02b11a 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -26,6 +26,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
+- [Yao Wang](https://github.com/kevinthesun)
 - [Eddie Yan](https://github.com/eqy)
 - [Joshua Z. Zhang](https://github.com/zhreshold)
 

From 11dd933f71e0da53169b89d59abbccf0b73f4f0f Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 16 Aug 2018 14:05:08 -0700
Subject: [PATCH 28/77] [RUNTIME] Enable return NDArray in RPC (#1610)

---
 include/tvm/runtime/ndarray.h             |  1 +
 src/api/api_base.cc                       |  8 +++
 src/runtime/rpc/rpc_module.cc             | 61 +++++++++++++++++++---
 src/runtime/rpc/rpc_session.cc            | 63 +++++++++++++++++++----
 src/runtime/rpc/rpc_session.h             |  1 +
 tests/python/unittest/test_runtime_rpc.py | 44 +++++++++++++++-
 6 files changed, 160 insertions(+), 18 deletions(-)

diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index d3ecce8ba9d0..c288ce5f3adb 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -246,6 +246,7 @@ struct NDArray::Container {
 
  private:
   friend class NDArray;
+  friend class RPCWrappedFunc;
   /*!
    * \brief The shape container,
    *  can be used used for shape data.
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 37970e69e24f..70301993ad3a 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -37,6 +37,14 @@ TVM_REGISTER_API("_nop")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
   });
 
+// internal function used for debug and testing purposes
+TVM_REGISTER_API("_ndarray_use_count")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    runtime::NDArray nd = args[0];
+    // subtract the current one
+    *ret = (nd.use_count() - 1);
+  });
+
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index 251871bf0cc1..d6c56e1b7cf4 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -12,13 +12,13 @@ namespace tvm {
 namespace runtime {
 
 // Wrapped remote function to packed func.
-struct RPCWrappedFunc {
+class RPCWrappedFunc {
  public:
   RPCWrappedFunc(void* handle,
                  std::shared_ptr<RPCSession> sess)
       : handle_(handle), sess_(sess) {
     fwrap_ = PackedFunc([sess](TVMArgs args, TVMRetValue* rv) {
-        WrapRemote(sess, args.values[0].v_handle, args.type_codes[0], rv);
+        WrapRemote(sess, args, rv);
       });
   }
 
@@ -34,10 +34,47 @@ struct RPCWrappedFunc {
   }
 
   static void WrapRemote(std::shared_ptr<RPCSession> sess,
-                         void* handle,
-                         int tcode,
+                         TVMArgs args,
                          TVMRetValue* rv);
 
+  // deleter of RPC remote array
+  static void RemoteNDArrayDeleter(NDArray::Container* ptr) {
+    RemoteSpace* space = static_cast<RemoteSpace*>(ptr->dl_tensor.data);
+    space->sess->CallRemote(RPCCode::kNDArrayFree, ptr->manager_ctx);
+    delete space;
+    delete ptr;
+  }
+  // wrap return value as remote NDArray.
+  static NDArray WrapRemoteNDArray(std::shared_ptr<RPCSession> sess,
+                                   DLTensor* tensor,
+                                   void* nd_handle) {
+    NDArray::Container* data = new NDArray::Container();
+    data->manager_ctx = nd_handle;
+    data->deleter = RemoteNDArrayDeleter;
+    RemoteSpace* space = new RemoteSpace();
+    space->sess = sess;
+    space->data = tensor->data;
+    data->dl_tensor.data = space;
+    NDArray ret(data);
+    // RAII now in effect
+    data->shape_ = std::vector<int64_t>(
+        tensor->shape, tensor->shape + tensor->ndim);
+    data->dl_tensor.shape = dmlc::BeginPtr(data->shape_);
+    data->dl_tensor.ndim = static_cast<int>(data->shape_.size());
+    // setup dtype
+    data->dl_tensor.dtype = tensor->dtype;
+    // setup ctx, encode as remote session
+    data->dl_tensor.ctx.device_id = tensor->ctx.device_id;
+    data->dl_tensor.ctx.device_type = static_cast<DLDeviceType>(
+        static_cast<int>(tensor->ctx.device_type) +
+        kRPCSessMask * (sess->table_index() + 1));
+    // check strides.
+    CHECK(tensor->strides == nullptr);
+    // setup byteoffset
+    data->dl_tensor.byte_offset = tensor->byte_offset;
+    return ret;
+  }
+
  private:
   PackedFunc fwrap_;
   void* handle_{nullptr};
@@ -126,20 +163,28 @@ class RPCModuleNode final : public ModuleNode {
 };
 
 void RPCWrappedFunc::WrapRemote(std::shared_ptr<RPCSession> sess,
-                                void* handle,
-                                int tcode,
+                                TVMArgs args,
                                 TVMRetValue *rv) {
+  void* handle = args.values[0].v_handle;
+  int tcode = args.type_codes[0];
+
   if (handle == nullptr) return;
   if (tcode == kFuncHandle) {
     auto wf = std::make_shared<RPCWrappedFunc>(handle, sess);
     *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) {
         return wf->operator()(args, rv);
       });
-  } else {
-    CHECK_EQ(tcode, kModuleHandle);
+  } else if (tcode == kModuleHandle) {
     std::shared_ptr<RPCModuleNode> n =
         std::make_shared<RPCModuleNode>(handle, sess);
     *rv = Module(n);
+  } else if (tcode == kArrayHandle || tcode == kNDArrayContainer) {
+    CHECK_EQ(args.size(), 2);
+    DLTensor* tensor = args[0];
+    void* nd_handle = args[1];
+    *rv = WrapRemoteNDArray(sess, tensor, nd_handle);
+  } else {
+    LOG(FATAL) << "Cannot wrap tcode=" << tcode;
   }
 }
 
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 21fff7b29882..6bb01b9bd459 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -130,19 +130,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
           break;
         }
         case kReturnReceived: {
-          CHECK_EQ(arg_buf_->value.size(), 1U);
+          CHECK_GE(arg_buf_->value.size(), 1U);
+
           TVMArgValue argv = arg_buf_->AsTVMArgs()[0];
           if (argv.type_code() == kFuncHandle ||
-              argv.type_code() == kModuleHandle) {
+              argv.type_code() == kModuleHandle ||
+              argv.type_code() == kArrayHandle) {
             CHECK(fwrap != nullptr) << "function/module wrapper not available";
             fwrap->CallPacked(arg_buf_->AsTVMArgs(), rv);
           } else {
+            CHECK_EQ(arg_buf_->value.size(), 1U);
             *rv = argv;
           }
           arg_buf_.reset();
           this->SwitchToState(kRecvCode);
           std::swap(client_mode_, client_mode);
-          return  RPCCode::kReturn;
+          return RPCCode::kReturn;
         }
         case kCopyAckReceived: {
           std::swap(client_mode_, client_mode);
@@ -172,15 +175,22 @@ class RPCSession::EventHandler : public dmlc::Stream {
     ctx.device_type = static_cast<DLDeviceType>(dev_type % kRPCSessMask);
     return ctx;
   }
-  // send Packed sequence to writer.
-  void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) {
+  // Send Packed sequence to writer.
+  // return_ndarray is a special flag to handle returning of ndarray
+  //    In this case, we return the shape, context and data of the array,
+  //    as well as a customized PackedFunc that handles deletion of
+  //    the array in the remote.
+  void SendPackedSeq(const TVMValue* arg_values,
+                     const int* type_codes,
+                     int n,
+                     bool return_ndarray = false) {
     this->Write(n);
-    // only handles .
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
       if (tcode == kNDArrayContainer) tcode = kArrayHandle;
       this->Write(tcode);
     }
+
     // Argument packing.
     for (int i = 0; i < n; ++i) {
       int tcode = type_codes[i];
@@ -215,9 +225,23 @@ class RPCSession::EventHandler : public dmlc::Stream {
         case kNDArrayContainer:
         case kArrayHandle: {
           DLTensor* arr = static_cast<DLTensor*>(value.v_handle);
-          TVMContext ctx = StripSessMask(arr->ctx);
-          uint64_t data = reinterpret_cast<uint64_t>(
-              static_cast<RemoteSpace*>(arr->data)->data);
+          TVMContext ctx;
+          uint64_t data;
+          if (!return_ndarray) {
+            // in the client mode
+            // ctx contains the remote table index
+            // the space is wrapped by a RemoteSpace
+            // that holds reference to the session.
+            ctx = StripSessMask(arr->ctx);
+            data = reinterpret_cast<uint64_t>(
+                static_cast<RemoteSpace*>(arr->data)->data);
+          } else {
+            // When we return NDArray, we directly return
+            // the space and the context
+            // The client will be further wrapping
+            ctx = arr->ctx;
+            data = reinterpret_cast<uint64_t>(arr->data);
+          }
           this->Write(data);
           this->Write(ctx);
           this->Write(arr->ndim);
@@ -701,6 +725,21 @@ class RPCSession::EventHandler : public dmlc::Stream {
               << "Only server can send function and module handle back.";
         rv.MoveToCHost(&ret_value, &ret_tcode);
         SendPackedSeq(&ret_value, &ret_tcode, 1);
+      } else if (rv.type_code() == kNDArrayContainer) {
+        // always send handle in 64 bit.
+        CHECK(!client_mode_)
+            << "Only server can send NDArray back";
+        // We follow a special protocol to return NDArray to client side
+        // The first pack value is the NDArray handle as DLTensor
+        // The second pack value is the container handle the client passes to kNDArrayFree.
+        TVMValue ret_value_pack[2];
+        int ret_tcode_pack[2];
+        rv.MoveToCHost(&ret_value_pack[0], &ret_tcode_pack[0]);
+
+        NDArray::Container* nd = static_cast<NDArray::Container*>(ret_value_pack[0].v_handle);
+        ret_value_pack[1].v_handle = nd;
+        ret_tcode_pack[1] = kHandle;
+        SendPackedSeq(ret_value_pack, ret_tcode_pack, 2, true);
       } else {
         ret_value = rv.value();
         ret_tcode = rv.type_code();
@@ -1090,6 +1129,11 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) {
   *rv = (*static_cast<Module*>(mhandle))->GetSource(fmt);
 }
 
+void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) {
+  void* handle = args[0];
+  static_cast<NDArray::Container*>(handle)->DecRef();
+}
+
 void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) {
   PackedFunc *pf = static_cast<PackedFunc*>(args[0].operator void*());
   void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3]));
@@ -1138,6 +1182,7 @@ void RPCSession::EventHandler::HandlePackedCall() {
     case RPCCode::kModuleFree: CallHandler(RPCModuleFree); break;
     case RPCCode::kModuleGetFunc: CallHandler(RPCModuleGetFunc); break;
     case RPCCode::kModuleGetSource: CallHandler(RPCModuleGetSource); break;
+    case RPCCode::kNDArrayFree: CallHandler(RPCNDArrayFree); break;
     default: LOG(FATAL) << "Unknown event " << static_cast<int>(code_);
   }
   CHECK_EQ(state_, kRecvCode);
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 68f6763ae6db..4b736de0e041 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -48,6 +48,7 @@ enum class RPCCode : int {
   kModuleFree,
   kModuleGetFunc,
   kModuleGetSource,
+  kNDArrayFree
 };
 
 /*!
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index e7c0cc1bbabd..0de788068b6b 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -175,6 +175,7 @@ def test_rpc_return_func():
     @tvm.register_func("rpc.test.remote_func")
     def addone(x):
         return lambda y: x+y
+
     server = rpc.Server("localhost", key="x1")
     client = rpc.connect(server.host, server.port, key="x1")
     f1 = client.get_function("rpc.test.remote_func")
@@ -182,6 +183,46 @@ def addone(x):
     assert fadd(12) == 22
 
 
+def test_rpc_return_ndarray():
+    # Use closure to check the ref counter correctness
+    nd = tvm.nd.array(np.zeros(10).astype("float32"))
+    @tvm.register_func("rpc.test.remote_return_nd")
+    def my_module(name):
+        if name == "get_arr":
+            return lambda : nd
+        elif name == "ref_count":
+            return lambda : tvm._api_internal._ndarray_use_count(nd)
+        elif name == "get_elem":
+            return lambda idx: nd.asnumpy()[idx]
+        elif name == "get_arr_elem":
+            return lambda arr, idx: arr.asnumpy()[idx]
+
+    # start server
+    server = rpc.Server("localhost", key="x1")
+    client = rpc.connect(server.host, server.port, key="x1")
+    m = client.get_function("rpc.test.remote_return_nd")
+    get_arr = m("get_arr")
+    ref_count = m("ref_count")
+    get_elem = m("get_elem")
+    get_arr_elem = m("get_arr_elem")
+    # array test
+    def run_arr_test():
+        arr = get_arr()
+        assert ref_count() == 2
+        arr2 = get_arr()
+        assert ref_count() == 3
+        assert arr.context == client.cpu(0)
+        arr.copyfrom(np.ones(10).astype(arr.dtype))
+        assert arr2.asnumpy()[0] == 1.0
+        assert get_elem(0) == 1.0
+        assert get_arr_elem(arr2, 0) == 1.0
+
+    assert ref_count() == 1
+    run_arr_test()
+    # check recycle correctness
+    assert ref_count() == 1
+
+
 def test_local_func():
     @tvm.register_func("rpc.test.remote_func2")
     def addone(x):
@@ -199,9 +240,10 @@ def addone(x):
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
+    test_rpc_return_ndarray()
+    test_rpc_return_func()
     test_bigendian_rpc()
     test_rpc_remote_module()
-    test_rpc_return_func()
     test_rpc_file_exchange()
     test_rpc_array()
     test_rpc_simple()

From 093dc741366df633e841a38f3e015bd8464eb1c0 Mon Sep 17 00:00:00 2001
From: Keren Zhou <robinho364@gmail.com>
Date: Thu, 16 Aug 2018 16:35:23 -0700
Subject: [PATCH 29/77] [NNVM] Add ONNX upsample converter (#1591)

---
 nnvm/python/nnvm/frontend/onnx.py             | 19 ++++++++
 .../python/frontend/onnx/test_forward.py      | 47 +++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index fa26648b293a..f62202a37dff 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -406,6 +406,24 @@ def _impl(inputs, attr, params):
     return _impl
 
 
+class Upsample(OnnxOpConverter):
+    """ Operator converter for Upsample (nearest mode).
+    """
+
+    @classmethod
+    def _impl_v7(cls, inputs, attr, params):
+        scales = attr.get('scales')
+        assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3]
+        mode = attr.get('mode')
+        if mode == b'nearest':
+            method = "NEAREST_NEIGHBOR"
+        elif mode == b'linear':
+            method = "BILINEAR"
+        else:
+            raise ValueError("Invalid ONNX upsample mode: {}".format(mode))
+        return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW')
+
+
 class Shape(OnnxOpConverter):
     """ Operator converter for Shape.
     """
@@ -540,6 +558,7 @@ def _get_convert_map(opset):
         # 'Crop'
         # 'Embedding'
         # 'Upsample'
+        'Upsample' : Upsample.get_converter(opset),
         'SpatialBN': BatchNorm.get_converter(opset),
 
         # defs/generator
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 9fb3aed2da10..3f2fbb144289 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -1,6 +1,8 @@
 import numpy as np
 import math
 import nnvm
+import topi
+import topi.testing
 import tvm
 from tvm.contrib import graph_runtime
 from nnvm.testing.config import ctx_list
@@ -380,6 +382,50 @@ def test_lrn():
     verify_lrn((5, 5, 5, 5), 3, 'float32')
     verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0)
 
+def _test_upsample_nearest():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0])
+    
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.upsampling_python(in_array, scale, "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_nearest_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_nearest_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        np.testing.assert_allclose(out_array, tvm_out)
+
+def _test_upsample_bilinear():
+    scale = 2
+    in_shape = (1, 1, 3, 3)
+    out_shape = (1, 1, 3*scale, 3*scale)
+    y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0])
+    
+    in_array = np.random.uniform(size=in_shape).astype(np.float32)
+    out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW")
+
+    graph = helper.make_graph([y],
+                              'upsample_bilinear_test',
+                              inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))],
+                              outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))])
+
+    model = helper.make_model(graph, producer_name='upsample_bilinear_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32')
+        np.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_upsample():
+    _test_upsample_nearest()
+    _test_upsample_bilinear()
+
 
 if __name__ == '__main__':
     # verify_super_resolution_example()
@@ -398,3 +444,4 @@ def test_lrn():
     test_matmul()
     test_gather()
     test_lrn()
+    test_upsample()

From e282915a9c76964551d600bc01602dc53d7a4da1 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 16 Aug 2018 16:37:03 -0700
Subject: [PATCH 30/77] add -mattr=+neon for all arm cpu target (#1612)

---
 apps/benchmark/README.md               | 11 +++++++----
 python/tvm/target.py                   | 16 ++++++++--------
 tutorials/nnvm/deploy_model_on_rasp.py |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index e83e47c46eb7..ee22f90dc435 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -63,8 +63,11 @@ python3 -m tvm.exec.rpc_tracker
   python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro  
   ```
 
-  If your device has a same SoC of the above device, you can reuse these parameters
-  (e.g. use `llvm -device=arm_cpu -mode=rk3399 -target=aarch64-linux-gnu` as target).
-  Otherwise, you need to tune for your own device, please follow this 
-  [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
+  If your device has the same or a similar SoC as one of the above devices, you can reuse these parameters.
+  For example, if your SoC is similar to rasp3b, use
+  ```bash
+  python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key your_custom_key
+  ```
+  For other devices, to get the best performance, it is recommended that you tune your network by yourself. 
+  Please follow this [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html).
 
diff --git a/python/tvm/target.py b/python/tvm/target.py
index fed20c3914c6..e2d780f75264 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -427,14 +427,14 @@ def arm_cpu(model='unknown', options=None):
     from . import autotvm
 
     trans_table = {
-        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android"],
-        "mate10":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20":       ["-model=kirin970", "-target=arm64-linux-android"],
-        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android"],
-        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"],
-        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu"],
-        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi"],
+        "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "mate10pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20":       ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "p20pro":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
+        "rasp3b":    ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"],
+        "rk3399":    ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"],
+        "pynq":      ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"],
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py
index c11f202c1251..40dbdaeb00ee 100644
--- a/tutorials/nnvm/deploy_model_on_rasp.py
+++ b/tutorials/nnvm/deploy_model_on_rasp.py
@@ -154,7 +154,7 @@ def transform_image(image):
 else:
     target = tvm.target.arm_cpu('rasp3b')
     # The above line is a simple form of
-    # target = tvm.target.create('llvm -devcie=arm_cpu -target=armv7l-linux-gnueabihf')
+    # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon')
 
 with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
     graph, lib, params = nnvm.compiler.build(

From acc2151c6992cefccc1538819f726854bcc6a3c7 Mon Sep 17 00:00:00 2001
From: Ashok Emani <ashok.emani@intel.com>
Date: Thu, 16 Aug 2018 19:19:43 -0700
Subject: [PATCH 31/77] fix output_shape in conv2d_nchw (#1613)

---
 topi/include/topi/nn.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h
index 53b899796e37..4a537a646425 100644
--- a/topi/include/topi/nn.h
+++ b/topi/include/topi/nn.h
@@ -265,7 +265,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I,
   auto pW = I->shape[3];
   tvm::Array<tvm::Expr> output_shape{
       I->shape[0],                                            // B
-      W->shape[1],                                            // O
+      W->shape[0],                                            // O
       (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1,  // H
       (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1   // W
   };

From 6cd5a8f991eb8ba3cdd27a063067af733aecc9a2 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Fri, 17 Aug 2018 11:37:56 +0900
Subject: [PATCH 32/77] [NNVM] Bug fix Prevent fusing convolution with
 injective op  (#1608)

---
 nnvm/src/compiler/graph_fuse.cc              | 31 +++++++++++++++++-
 nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 52a8ae44f8ee..f65312be1a29 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -63,12 +63,16 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       // Check if we can fuse to the master.
       int chosen_master = -1;
       bool ewise = inode.source->num_outputs() == 1;
+      bool mark_as_injective = false;
       for (const auto& e : inode.inputs) {
         if (fuse_vec[e.node_id] == FuseRule::kUknown) {
           TOpPattern ipt = pattern_vec[e.node_id];
           if (ipt != kElemWise) ewise = false;
-          if (ipt <= kInjective) {
+          if (ipt <= kBroadcast) {
+            fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+          } else if (ipt == kInjective) {
             fuse_vec[e.node_id] = FuseRule::kFuseToMaster;
+            mark_as_injective = true;
           } else if (ipt == kOutEWiseFusable &&
                      chosen_master == -1 &&
                      shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) {
@@ -87,6 +91,8 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
       master_vec[nid] = chosen_master;
       if (chosen_master != -1) {
         pt = kOutEWiseFusable;
+      } else if (mark_as_injective) {
+        pt = kInjective;
       } else {
         pt = ewise ? kElemWise : kBroadcast;
       }
@@ -135,8 +141,31 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
     if (group_vec[nid] == -1) {
       group_vec[nid] = nid;
     }
+
+    // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group.
+    bool parent_out_ewise = false;
+    bool parent_injective = false;
+    for (const auto& e : inode.inputs) {
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (pt == kOutEWiseFusable) {
+        parent_out_ewise = true;
+      } else if (pt == kInjective) {
+        parent_injective = true;
+      }
+    }
+    // Change the master node from out_ewise_fusable op to itself
+    if (parent_injective && parent_out_ewise) master_vec[nid] = nid;
+
     // Propagate the group id.
     for (const auto& e : inode.inputs) {
+      TOpPattern pt = pattern_vec[e.node_id];
+      if (parent_out_ewise && parent_injective) {
+        if (pt == kOutEWiseFusable) {
+          continue;  // Do not fuse out_ewise_fusable op
+        } else if (pt == kInjective) {
+          master_vec[e.node_id] = nid;
+        }
+      }
       if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) {
         CHECK(group_vec[e.node_id] == -1||
               group_vec[e.node_id] == group_vec[nid]);
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 8d05ae02c579..5f4da3865a45 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -77,6 +77,39 @@ def test_injective_reduce_injective():
         np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
+def test_injective_conv2d():
+    channels = 16
+    data = sym.Variable(name="data")
+    pool = sym.global_avg_pool2d(data=data)
+    weight = sym.reshape(pool, shape=[1, channels, 1, 1])
+    residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1),
+                          layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv")
+    net = weight * data + residual
+    size = 56
+    dtype="float32"
+    dshape = (1, channels, size, size)
+    kshape = (channels, channels, 3, 3)
+    oshape = dshape
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, global_avg_pool, conv weight, conv op, fused elemwise add
+        assert graph.index.num_nodes == 5
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+        residual = topi.testing.conv2d_nchw_python(
+            data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME')
+        weight = np.mean(data.asnumpy(), axis=(2, 3))
+        c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + residual
+        np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
+
+
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
     with nnvm.compiler.build_config(opt_level=opt_level):
         graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
@@ -123,3 +156,4 @@ def get_sym(out_channel):
     test_ewise_injective()
     test_conv_ewise_injective()
     test_fuse_conv2d_elu()
+    test_injective_conv2d()

From 9b0e499054f204cee411afc1965c95a87d6dab1c Mon Sep 17 00:00:00 2001
From: Sergey Mironov <grrwlf@gmail.com>
Date: Sat, 18 Aug 2018 07:40:52 +0300
Subject: [PATCH 33/77] [NNVM] TF: Add Pack operation (#1570)

---
 nnvm/include/nnvm/top/tensor.h                |  2 +-
 nnvm/python/nnvm/frontend/tensorflow.py       |  9 ++++++
 nnvm/src/top/tensor/transform.cc              | 19 ++++++------
 .../frontend/tensorflow/test_forward.py       | 29 ++++++++++++++++++-
 4 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h
index 22ee9d7118e6..53ed5b3b0a22 100644
--- a/nnvm/include/nnvm/top/tensor.h
+++ b/nnvm/include/nnvm/top/tensor.h
@@ -16,7 +16,7 @@ namespace top {
 struct ConcatenateParam : public dmlc::Parameter<ConcatenateParam> {
   int axis;
   DMLC_DECLARE_PARAMETER(ConcatenateParam) {
-    DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1)
+    DMLC_DECLARE_FIELD(axis).set_default(1)
     .describe("the axis to be concated.");
   }
 };
diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index d761e34c7c59..092b8fa20219 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -339,6 +339,14 @@ def _impl(inputs, attr, params):
             extras={'axis': axis.asnumpy()[0]})(inputs, attr)
     return _impl
 
+def _pack():
+    def _impl(inputs, attr, params):
+        axis = int(attr["axis"])
+        inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs]
+        return _sym.concatenate(*inputs_reshaped, axis=axis)
+
+    return _impl
+
 def _reshape():
     def _impl(inputs, attr, params):
         try:
@@ -673,6 +681,7 @@ def _impl(inputs, attr, params):
     'Minimum'                           : _elemwise('min'),
     'Sum'                               : _sum(),
     'Square'                            : _square(),
+    'Pack'                              : _pack(),
     'Relu'                              : AttrCvt('relu'),
     'Reshape'                           : _reshape(),
     'ResizeBilinear'                    : _resize_bilinear(),
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 78255d20f040..52dca5654838 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -93,23 +93,24 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
   TShape dshape;
   dim_t size = 0;
   bool has_zero = false;
+  int axis = param.axis >= 0 ? param.axis : in_shape->at(0).ndim() + param.axis;
   for (size_t i = 0; i < in_shape->size(); ++i) {
     TShape tmp = (*in_shape)[i];
     if (tmp.ndim()) {
-      CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-          << "concat dim " << param.axis << " out of range of input shape " << tmp;
-      has_zero = tmp[param.axis] == 0 || has_zero;
-      size += tmp[param.axis];
-      tmp[param.axis] = 0;
+      CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+          << "concat dim " << axis << " out of range of input shape " << tmp;
+      has_zero = tmp[axis] == 0 || has_zero;
+      size += tmp[axis];
+      tmp[axis] = 0;
       shape_assign(&dshape, tmp);
     }
   }
 
   TShape tmp = (*out_shape)[0];
   if (tmp.ndim()) {
-    CHECK_LT(static_cast<dim_t>(param.axis), tmp.ndim())
-        << "concat dim " << param.axis << " out of range of input shape " << tmp;
-    tmp[param.axis] = 0;
+    CHECK_LT(static_cast<dim_t>(axis), tmp.ndim())
+        << "concat dim " << axis << " out of range of input shape " << tmp;
+    tmp[axis] = 0;
     shape_assign(&dshape, tmp);
   }
 
@@ -119,7 +120,7 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs,
     NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, i, dshape);
   }
 
-  if (!has_zero) dshape[param.axis] = size;
+  if (!has_zero) dshape[axis] = size;
   NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, dshape);
   return dshape.Size() != 0;
 }
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 64c57c126f8d..6fa020a03444 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -342,7 +342,7 @@ def _test_argx(func, data, **kwargs):
 
         compare_tf_with_tvm(data, 'c0:0', 'argx0:0')
 
-def test_argmin_argmax():
+def test_forward_argminmax():
     for axis in [None,0,1,2]:
         data = np.random.uniform(size=(8,4,9)).astype('float32')
         _test_argx(tf.argmax, data=data, axis=axis)
@@ -555,6 +555,31 @@ def test_forward_lstm():
 
     _test_lstm_cell(1, 2, 1, 0.0, 'float32')
 
+
+
+#######################################################################
+# Pack
+# ---
+def _test_pack(axis, shape, **kwargs):
+
+    a = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+    b = np.arange(np.prod(shape), dtype=np.float32).reshape(shape)
+
+    with tf.Graph().as_default():
+        tf_a = array_ops.placeholder(shape=shape, dtype='float32', name='pl_a')
+        tf_b = array_ops.placeholder(shape=shape, dtype='float32', name='pl_b')
+        tf_c = tf.stack([tf_a,tf_b], axis=axis, **kwargs)
+        assert tf_c.op.op_def.name == 'Pack', "tf.stack() is expected to produce 'Pack' operation"
+
+        compare_tf_with_tvm([a,b], ['pl_a:0','pl_b:0'], 'stack:0')
+
+def test_forward_pack():
+    for axis in range(-3,3):
+        _test_pack(axis, [3,2,1])
+    for axis in range(-1,1):
+        _test_pack(axis, [3])
+    _test_pack(0, [])
+
 #######################################################################
 # Pad
 # ---
@@ -818,9 +843,11 @@ def test_forward_l2_normalize():
     test_forward_reshape()
     test_forward_squeeze()
     test_forward_sigmoid()
+    test_forward_argminmax()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
     test_forward_multi_input()
+    test_forward_pack()
     test_forward_inception_v3()
     test_forward_inception_v1()
     test_forward_mobilenet()

From 38d0835728f8b11c4467ec6a55a44ad9de24de7b Mon Sep 17 00:00:00 2001
From: xqdan <danxiaoqiang@126.com>
Date: Sun, 19 Aug 2018 02:18:29 +0800
Subject: [PATCH 34/77] #1592 [PASS] Fix missing mem CHECK in storage_rewrite
 (#1616)

---
 src/pass/storage_rewrite.cc                   |  6 ++
 .../unittest/test_pass_storage_rewrite.py     | 63 ++++++++++++-------
 2 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 0170499e1491..877216ed7656 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -584,6 +584,12 @@ class StoragePlanRewriter : public IRMutator {
           e->new_alloc = Allocate::make(
               e->alloc_var, alloc_type, {combo_size}, const_true(),
               Evaluate::make(0));
+          if (e->scope.tag.length() != 0) {
+            MemoryInfo info = GetMemoryInfo(e->scope.to_string());
+            uint64_t total_elem = e->const_nbits / e->elem_type.bits();
+            CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits)
+                << "Allocation exceed bound of memory tag " << e->scope.to_string();
+          }
         }
       }
     }
diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py
index 2bb02998982f..3c07a1f26aff 100644
--- a/tests/python/unittest/test_pass_storage_rewrite.py
+++ b/tests/python/unittest/test_pass_storage_rewrite.py
@@ -28,15 +28,30 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 1
 
+def register_mem(scope_tb, max_bits):
+    #Register mem
+    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
+    def mem_info_inp_buffer():
+        return tvm.make.node("MemoryInfo",
+                        unit_bits= 16,
+                        max_simd_bits=32,
+                        max_num_bits=max_bits,
+                        head_address=None)
+
 def test_alloc_seq():
+    scope_tb = "local.L0A"
+    max_bits = 1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="B", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="B", scope=scope_tb)
             A[j] = 1.3
 
     body = ib.get()
@@ -233,16 +248,9 @@ def test_parallel_alloc():
 
     assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate))
 
-def test_inplace_rule2():
+def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024):
     #Test Buffer
-    scope_tb = "local_TB2"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    register_mem(scope_tb, max_bits)
     m = 10
     A = tvm.placeholder((m,), name='A')
     C = tvm.placeholder((m,), name='C')
@@ -275,16 +283,23 @@ def verify(n):
     tvm.ir_pass.PostOrderVisit(stmt, verify)
     assert num_alloc[0] == 2
 
+def test_exceed_mem():
+    # The critical max_num_bits is between 639 and 640, so 639 is expected to
+    # trip the memory-bound CHECK; the test must fail if no error is raised.
+    max_bits = 639
+    loc = -1
+    try:
+        test_inplace_rule2("local_TEM", max_bits)
+    except Exception as e:
+        loc = str(e).find('Allocation exceed bound of memory')
+    assert loc != -1, "expected 'Allocation exceed bound of memory' error"
+
 def test_inplace_rule3():
     #Test Buffer
     scope_tb = "local_TB3"
-    @tvm.register_func("tvm.info.mem.%s" % scope_tb)
-    def mem_info_inp_buffer():
-        return tvm.make.node("MemoryInfo",
-                        unit_bits= 16,
-                        max_simd_bits=32,
-                        max_num_bits=1024*1024*1024,
-                        head_address=None)
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
     m = 10
     B0 = tvm.placeholder((m,), name='B0')
     B1 = tvm.placeholder((m,), name='B1')
@@ -388,17 +403,22 @@ def verify(n):
     assert num_alloc[0] == 1
 
 def test_alloc_seq_type2():
+    scope_tb = "local.L0A2"
+    max_bits=1024 * 1024 * 1024
+
+    register_mem(scope_tb, max_bits)
+
     ib = tvm.ir_builder.create()
     n = tvm.var("n")
     with ib.for_range(0, n, name="i") as i:
         with ib.for_range(0, 10, name="j") as j:
-            A = ib.allocate("float32", 200, name="A", scope="local.L0A")
+            A = ib.allocate("float32", 200, name="A", scope=scope_tb)
             A[j] = 1.2
         with ib.for_range(0, 20, name="j") as j:
-            B = ib.allocate("int16", 400, name="B", scope="local.L0A")
+            B = ib.allocate("int16", 400, name="B", scope=scope_tb)
             B[j] = tvm.const(1, "int16")
         with ib.for_range(0, 10, name="j") as j:
-            C = ib.allocate("float32", 200, name="C", scope="local.L0A")
+            C = ib.allocate("float32", 200, name="C", scope=scope_tb)
             C[j] = 1.2
 
     body = ib.get()
@@ -465,6 +485,7 @@ def test_replace_dataflow():
     test_storage_combine()
     test_storage_share_gpu()
     test_inplace_rule2()
+    test_exceed_mem()
     test_inplace_rule3()
     test_alloc_seq_type()
     test_alloc_seq_type2()

From a5f2759ec5ac79592f860919349a85bf63ed548a Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sat, 18 Aug 2018 23:50:58 +0530
Subject: [PATCH 35/77] =?UTF-8?q?[FRONTEND][COREML]MultiplyLayerParams=20L?=
 =?UTF-8?q?2NormalizeLayerParams=20and=20UpsampleLayerParams=20support=20?=
 =?UTF-8?q?=E2=80=A6=20(#1511)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 nnvm/python/nnvm/frontend/coreml.py           |  22 ++
 .../python/frontend/coreml/test_forward.py    | 190 +++++++++++++++++-
 2 files changed, 211 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index e80cfe23f220..3ca76bb0b20e 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -217,6 +217,16 @@ def AddLayerParams(op, insyms, symtab):
         ret = _sym.__add_scalar__(ret, scalar=op.alpha)
     return ret
 
+def MultiplyLayerParams(op, insyms, symtab):
+    if not isinstance(insyms, list):
+        insyms = [insyms]
+    ret = insyms[0]
+    for i in range(1, len(insyms)):
+        ret = _sym.elemwise_mul(ret, insyms[i])
+    if op.alpha != 1:
+        ret = _sym.__mul_scalar__(ret, scalar=op.alpha)
+    return ret
+
 def ConcatLayerParams(op, insyms, symtab):
     if not isinstance(insyms, list):
         insyms = [insyms]
@@ -249,6 +259,15 @@ def PermuteLayerParams(op, insym, symtab):
     axes = tuple(op.axis)
     return _sym.transpose(insym, axes=axes)
 
+def UpsampleLayerParams(op, insym, symtab):
+    if op.scalingFactor[0] != op.scalingFactor[1]:
+        raise NotImplementedError("Upsampling only supported with same \
+            height and width scaling factor.")
+    interpolationMode = 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR'
+    return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode)
+
+def L2NormalizeLayerParams(op, insym, symtab):
+    return _sym.l2_normalize(insym, eps=op.epsilon, axis=1)
 
 _convert_map = {
     'NeuralNetworkMeanImage': NeuralNetworkMeanImage,
@@ -261,10 +280,13 @@ def PermuteLayerParams(op, insym, symtab):
     'SoftmaxLayerParams':SoftmaxLayerParams,
     'InnerProductLayerParams':InnerProductLayerParams,
     'AddLayerParams':AddLayerParams,
+    'MultiplyLayerParams':MultiplyLayerParams,
     'FlattenLayerParams':FlattenLayerParams,
     'ConcatLayerParams':ConcatLayerParams,
     'PaddingLayerParams':PaddingLayerParams,
     'PermuteLayerParams':PermuteLayerParams,
+    'UpsampleLayerParams':UpsampleLayerParams,
+    'L2NormalizeLayerParams':L2NormalizeLayerParams
 }
 
 def coreml_op_to_nnvm(op, inname, outname, symtab):
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index d5c460e56987..27ae28c20ab9 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -1,8 +1,12 @@
 import numpy as np
 
-import topi
+from coremltools.models.neural_network import NeuralNetworkBuilder
+from coremltools.models import datatypes
+
 import tvm
 from tvm.contrib import graph_runtime
+import topi
+import topi.testing
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
@@ -40,6 +44,190 @@ def test_resnet50_checkonly():
     model_file = model_zoo.get_resnet50()
     test_model_checkonly(model_file, 'resnet50')
 
+def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'):
+    """ Generic function to compile on nnvm and execute on tvm """
+
+    sym, params = nnvm.frontend.from_coreml(graph_def)
+    target = 'llvm'
+    if isinstance(input_data, list):
+        shape_dict = {}
+        dtype_dict = {}
+        for i, e in enumerate(input_name):
+            shape_dict[e] = input_data[i].shape
+            dtype_dict[e] = input_data[i].dtype
+    else:
+        shape_dict = {input_name: input_data.shape}
+        dtype_dict = {input_name: input_data.dtype}
+
+    graph, lib, params = nnvm.compiler.build(sym, target, shape_dict,
+                                             dtype=dtype_dict, params=params)
+
+    ctx = tvm.cpu(0)
+    from tvm.contrib import graph_runtime
+    m = graph_runtime.create(graph, lib, ctx)
+    # set inputs
+    if isinstance(input_data, list):
+        for i, e in enumerate(input_name):
+            m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype)))
+    else:
+        m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype)))
+
+    m.set_input(**params)
+    # execute
+    m.run()
+    # get outputs
+    if isinstance(output_shape, list) and isinstance(output_dtype, list):
+        tvm_output_list = []
+        for i, s in enumerate(output_shape):
+            tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i]))
+            tvm_output_list.append(tvm_output.asnumpy())
+        return tvm_output_list
+    else:
+        tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype))
+        return tvm_output.asnumpy()
+
+def verify_AddLayerParams(input_dim, alpha=2):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.add(a_np1, a_np2) + alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Add',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='ADD')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_AddLayerParams():
+    verify_AddLayerParams((1, 2, 2), 0)
+    verify_AddLayerParams((1, 2, 2), 1)
+    verify_AddLayerParams((1, 3, 3), 2)
+
+def verify_MultiplyLayerParams(input_dim, alpha):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.multiply(a_np1, a_np2) * alpha
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Mul',
+                            alpha=alpha,
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='MULTIPLY')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_MultiplyLayerParams():
+    verify_MultiplyLayerParams((1, 2, 2), 0)
+    verify_MultiplyLayerParams((1, 2, 2), 1)
+    verify_MultiplyLayerParams((1, 3, 3), 2)
+
+def verify_ConcatLayerParams(input1_dim, input2_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input1_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input2_dim).astype(dtype)
+
+    b_np = np.concatenate((a_np1, a_np2), axis=1)
+    inputs = [('input1', datatypes.Array(*input1_dim)),
+              ('input2', datatypes.Array(*input2_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Concate',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='CONCAT')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_ConcatLayerParams():
+    verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2))
+    verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4))
+
+def verify_UpsampleLayerParams(input_dim, scale, mode):
+    dtype = "float32"
+
+    a_np = np.full(input_dim, 1, dtype=dtype)
+    if mode == 'NN':
+        b_np = topi.testing.upsampling_python(a_np, scale)
+    else:
+        new_h = input_dim[2] * scale
+        new_w = input_dim[3] * scale
+        b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW')
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_upsample(name='Upsample',
+                         scaling_factor_h=scale,
+                         scaling_factor_w=scale,
+                         mode=mode,
+                         input_name='input',
+                         output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_UpsampleLayerParams():
+    verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN')
+    verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR')
+
+def verify_l2_normalize(input_dim, eps):
+    dtype = "float32"
+
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.l2_normalize_python(a_np, eps, 1)
+
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output')
+
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_l2_normalize():
+    verify_l2_normalize((1, 3, 20, 20), 0.001)
+
 if __name__ == '__main__':
     test_mobilenet_checkonly()
     test_resnet50_checkonly()
+    test_forward_AddLayerParams()
+    test_forward_ConcatLayerParams()
+    test_forward_MultiplyLayerParams()
+    test_forward_UpsampleLayerParams()
+    test_forward_l2_normalize()

From 1c66012ac9ccca6c9e8a3dcf24f737c1a0f734c7 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Sun, 19 Aug 2018 19:29:35 -0700
Subject: [PATCH 36/77] fix import (#1621)

---
 nnvm/python/nnvm/testing/yolo2_detection.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo2_detection.py
index b7744c45cff4..0b229149b8ea 100644
--- a/nnvm/python/nnvm/testing/yolo2_detection.py
+++ b/nnvm/python/nnvm/testing/yolo2_detection.py
@@ -10,9 +10,6 @@
 import math
 from collections import namedtuple
 import numpy as np
-from PIL import Image
-from PIL import ImageDraw
-from PIL import ImageFont
 
 def _entry_index(batch, w, h, outputs, classes, coords, location, entry):
     n = int(location/(w*h))
@@ -186,6 +183,10 @@ def _draw_label(im, r, c, label, rgb):
                         _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val)
 
 def _get_label(labelstr, rgb):
+    from PIL import Image
+    from PIL import ImageDraw
+    from PIL import ImageFont
+
     text = labelstr
     colorText = "black"
     testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1)))

From b00aabc5ca9108a3dcb68e19af5ae19d723ed746 Mon Sep 17 00:00:00 2001
From: masahi <masahi129@gmail.com>
Date: Mon, 20 Aug 2018 11:30:01 +0900
Subject: [PATCH 37/77] Add missing check when deciding conv op and injective
 op are in the same group (#1622)

---
 nnvm/src/compiler/graph_fuse.cc              |  1 +
 nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index f65312be1a29..4999d93d1861 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -146,6 +146,7 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) {
     bool parent_out_ewise = false;
     bool parent_injective = false;
     for (const auto& e : inode.inputs) {
+      if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue;
       TOpPattern pt = pattern_vec[e.node_id];
       if (pt == kOutEWiseFusable) {
         parent_out_ewise = true;
diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py
index 5f4da3865a45..0c81ac890d55 100644
--- a/nnvm/tests/python/compiler/test_op_fusion.py
+++ b/nnvm/tests/python/compiler/test_op_fusion.py
@@ -110,6 +110,39 @@ def test_injective_conv2d():
         np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5)
 
 
+def test_concatenate_conv2d():
+    ch = 3
+    size = 8
+    data = sym.Variable(name="data")
+    concat = sym.concatenate(data, data, axis=1)
+    conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv")
+    net = sym.elemwise_add(concat, conv)
+
+    dtype="float32"
+    dshape = (1, ch, size, size)
+    kshape = (ch*2, ch*2, 1, 1)
+    oshape = (1, ch*2, size, size)
+    shape_dict = {"data": dshape}
+
+    for target, ctx in ctx_list():
+        graph, lib, _ = nnvm.compiler.build(net, target, shape_dict)
+        # data, conv weight, conv op, concat
+        assert graph.index.num_nodes == 4
+
+        data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype))
+        kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype))
+        m = graph_runtime.create(graph, lib, ctx)
+        m.run(data=data, conv_weight=kernel)
+        # get output
+        out = m.get_output(0, tvm.nd.empty(oshape, dtype))
+
+        concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1)
+        conv = topi.testing.conv2d_nchw_python(
+            concat, kernel.asnumpy(), (1,1), 'SAME')
+        ref = concat + conv
+        np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5)
+
+
 def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2):
     with nnvm.compiler.build_config(opt_level=opt_level):
         graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params)
@@ -157,3 +190,4 @@ def get_sym(out_channel):
     test_conv_ewise_injective()
     test_fuse_conv2d_elu()
     test_injective_conv2d()
+    test_concatenate_conv2d()

From 20c495e95bb3f0cd3542e029e8c54701d827cc7d Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 20 Aug 2018 09:20:19 -0700
Subject: [PATCH 38/77] [NODEREF] Introduce named attribute system. (#1618)

---
 include/tvm/attrs.h                           | 593 ++++++++++++++++++
 include/tvm/runtime/packed_func.h             |   8 +
 python/tvm/make.py                            |  11 +
 src/api/api_base.cc                           |  12 -
 src/api/api_test.cc                           |  46 ++
 src/api/dsl_api.cc                            |  37 +-
 src/lang/attrs.cc                             |  45 ++
 src/lang/reflection.cc                        |  42 +-
 tests/cpp/attrs_test.cc                       |  76 +++
 tests/python/unittest/test_lang_reflection.py |  26 +
 10 files changed, 861 insertions(+), 35 deletions(-)
 create mode 100644 include/tvm/attrs.h
 create mode 100644 src/api/api_test.cc
 create mode 100644 src/lang/attrs.cc
 create mode 100644 tests/cpp/attrs_test.cc

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
new file mode 100644
index 000000000000..aed6b1ff722f
--- /dev/null
+++ b/include/tvm/attrs.h
@@ -0,0 +1,593 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file tvm/attrs.h
+ * \brief TVM attribute module
+ *
+ *  This module enables declaration of named attributes
+ *  which support default value setup and bound checking.
+ *
+ * \code
+ *   struct MyAttrs : public tvm::AttrsNode<MyAttrs> {
+ *     float learning_rate;
+ *     int num_hidden;
+ *     std::string name;
+ *     // declare attribute fields in header file
+ *     TVM_DECLARE_ATTRS(MyAttrs, "attrs.MyAttrs") {
+ *       TVM_ATTR_FIELD(num_hidden).set_lower_bound(1);
+ *       TVM_ATTR_FIELD(learning_rate).set_default(0.01f);
+ *       TVM_ATTR_FIELD(name).set_default("hello");
+ *     }
+ *   };
+ *   // register it in cc file
+ *   TVM_REGISTER_NODE_TYPE(MyAttrs);
+ * \endcode
+ *
+ * \sa AttrsNode, TVM_DECLARE_ATTRS, TVM_ATTR_FIELD
+ */
+#ifndef TVM_ATTRS_H_
+#define TVM_ATTRS_H_
+
+#include <unordered_map>
+#include <vector>
+#include <type_traits>
+#include <string>
+#include "./ir.h"
+#include "./base.h"
+#include "./packed_func_ext.h"
+
+namespace tvm {
+/*!
+ * \brief Declare an attribute function.
+ * \param ClassName The name of the class.
+ * \param TypeKey The type key to be used by the TVM node system.
+ */
+#define TVM_DECLARE_ATTRS(ClassName, TypeKey)                   \
+  static constexpr const char* _type_key = TypeKey;             \
+  TVM_DECLARE_NODE_TYPE_INFO(ClassName, ::tvm::BaseAttrsNode);  \
+  template<typename FVisit>                                     \
+  void __VisitAttrs__(FVisit& __fvisit__)  // NOLINT(*)
+
+
+/*!
+ * \brief Declare an attribute field.
+ * \param FieldName The field name.
+ */
+#define TVM_ATTR_FIELD(FieldName) \
+  __fvisit__(#FieldName, &FieldName)
+
+
+/*! \brief Error thrown during attribute checking. */
+struct AttrError : public dmlc::Error {
+  /*!
+   * \brief constructor
+   * \param msg error message
+   */
+  explicit AttrError(const std::string &msg)
+      : dmlc::Error(msg) {}
+};
+
+/*!
+ * \brief Information about attribute fields in string representations.
+ */
+struct AttrFieldInfo {
+  /*! \brief name of the field */
+  std::string name;
+  /*! \brief type docstring information in str. */
+  std::string type_info;
+  /*! \brief detailed description of the type */
+  std::string description;
+};
+
+/*!
+ * \brief Base class of all attribute class
+ * \note Do not subclass BaseAttrsNode directly,
+ *       subclass AttrsNode instead.
+ * \sa AttrsNode
+ */
+class BaseAttrsNode : public Node {
+ public:
+  using TVMArgs = runtime::TVMArgs;
+  using TVMRetValue = runtime::TVMRetValue;
+  /*!
+   * \brief Initialize the attributes by sequence of arguments
+   * \param args The positional arguments in the form
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   */
+  template<typename... Args>
+  inline void InitBySeq(Args&& ...args);
+  /*!
+   * \brief Print readable docstring to ostream, add newline.
+   * \param os the stream to print the docstring to.
+   */
+  inline void PrintDocString(std::ostream &os) const;  // NOLINT(*)
+  /*!
+   * \brief Get the information about the attribute fields.
+   * \note This function throws when a required field is not present.
+   */
+  TVM_DLL virtual std::vector<AttrFieldInfo> ListFieldInfo() const = 0;
+  /*!
+   * \brief Initialize the attributes by arguments.
+   * \param kwargs The key value pairs for initialization.
+   *        [key0, value0, key1, value1, ..., key_n, value_n]
+   * \param allow_unknown Whether allow additional unknown fields.
+   * \note This function throws when a required field is not present.
+   */
+  TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0;
+
+  static constexpr const char* _type_key = "Attrs";
+  TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node);
+};
+
+/*! \brief Base attribute container for all attributes */
+class Attrs : public NodeRef {
+ public:
+  // normal constructor
+  Attrs() {}
+  // construct from shared ptr.
+  explicit Attrs(std::shared_ptr<Node> n) : NodeRef(n) {}
+
+  /*! \return The attribute node */
+  const BaseAttrsNode* operator->() const {
+    return ptr();
+  }
+  /*! \brief specify container node */
+  using ContainerType = BaseAttrsNode;
+
+ private:
+  /*! \return the internal attribute node */
+  const BaseAttrsNode* ptr() const {
+    return static_cast<const BaseAttrsNode*>(node_.get());
+  }
+};
+
+/*!
+ * \brief Specialized attribute type that is backed by a map.
+ *  The DictAttrsNode implements the Attrs behavior,
+ *  its fields are directly accessible via object.field_name
+ *  like other normal nodes.
+ */
+class DictAttrsNode : public BaseAttrsNode {
+ public:
+  /*! \brief internal attrs map */
+  Map<std::string, NodeRef> dict;
+  /*!
+   * \brief Construct an Attrs backed by DictAttrsNode.
+   * \param dict The attributes.
+   * \return The dict attributes.
+   */
+  TVM_DLL static Attrs make(Map<std::string, NodeRef> dict);
+  // implementations
+  void VisitAttrs(AttrVisitor* v) final;
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
+  std::vector<AttrFieldInfo> ListFieldInfo() const final;
+  // type info
+  static constexpr const char* _type_key = "DictAttrs";
+  TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
+};
+
+// Namespace containing detail implementations
+namespace detail {
+using runtime::TVMArgValue;
+
+// helper entry that does nothing in set_default/bound/describe calls.
+struct AttrNopEntry {
+  using TSelf = AttrNopEntry;
+
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+    return *this;
+  }
+};
+
+// Wrapper for normal visitor.
+class AttrNormalVisitor {
+ public:
+  explicit AttrNormalVisitor(AttrVisitor* visitor)
+      : visitor_(visitor) {
+  }
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* value) {
+    visitor_->Visit(key, value);
+    return AttrNopEntry();
+  }
+
+ private:
+  AttrVisitor* visitor_;
+};
+
+// Helper entry used during initialization: checks bounds and fills in defaults.
+template<typename T>
+struct AttrInitEntry {
+  // Own type, returned by reference from the chainable setters below.
+  using TSelf = AttrInitEntry<T>;
+  // Type key of the attrs node being initialized (used in error messages).
+  const char* type_key_;
+  // Name of the field this entry refers to.
+  const char* key_;
+  // Pointer to the field storage being initialized.
+  T* value_;
+  // Whether no value has been provided for the field so far.
+  bool value_missing_{true};
+  // Throws AttrError if the value is still missing at destruction time.
+  ~AttrInitEntry() DMLC_THROW_EXCEPTION {
+    if (value_missing_) {
+      std::ostringstream os;
+      os << type_key_ << ": Cannot find required field \'" << key_
+         << "\' during initialization";
+      throw AttrError(os.str());
+    }
+  }
+  // Chainable setters.
+  // Throws AttrError if the value is present and smaller than begin.
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (begin > val) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is smaller than the lower bound " << begin;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // Throws AttrError if the value is present and bigger than end.
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) {
+    if (this->value_missing_)  return *this;
+    const T& val = *value_;
+    if (val > end) {
+      std::ostringstream os;
+      os << type_key_ << "." << key_ << ": "
+         << "value " << val
+         << " is bigger than the upper bound " << end;
+      throw AttrError(os.str());
+    }
+    return *this;
+  }
+  // Stores value as the default when no value has been provided.
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) {
+    if (!value_missing_) return *this;
+    *value_ = value;
+    value_missing_ = false;
+    return *this;
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    return *this;
+  }
+};
+
+// Template function to allow smart conversion
+// from Expr types into the constants.
+template<typename T>
+inline void SetValue(T* ptr, const TVMArgValue& val) {
+  *ptr = val.operator T();
+}
+template<typename T>
+inline void SetIntValue(T* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLInt) {
+    *ptr = static_cast<T>(val.value().v_int64);
+  } else {
+    Expr expr = val;
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<T>(op->value);
+    } else {
+      LOG(FATAL) << "Expect int value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<std::string>(std::string* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kStr) {
+    *ptr = val.operator std::string();
+  } else {
+    Expr expr = val;
+    const ir::StringImm* op = expr.as<ir::StringImm>();
+    CHECK(op != nullptr);
+    *ptr = op->value;
+  }
+}
+template<>
+inline void SetValue<double>(double* ptr, const TVMArgValue& val) {
+  if (val.type_code() == kDLFloat || val.type_code() == kDLInt) {  // plain POD number
+    *ptr = val.operator double();
+  } else {
+    Expr expr = val;  // otherwise the argument must be a constant expression
+    CHECK(expr.defined());
+    if (const ir::IntImm* op = expr.as<ir::IntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::FloatImm* op = expr.as<ir::FloatImm>()) {  // float constant
+      *ptr = static_cast<double>(op->value);
+    } else if (const ir::UIntImm* op = expr.as<ir::UIntImm>()) {
+      *ptr = static_cast<double>(op->value);
+    } else {
+      LOG(FATAL) << "Expect float value, but get " << expr->type_key();
+    }
+  }
+}
+template<>
+inline void SetValue<int>(int* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<int64_t>(int64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<uint64_t>(uint64_t* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+template<>
+inline void SetValue<bool>(bool* ptr, const TVMArgValue& val) {
+  SetIntValue(ptr, val);
+}
+
+// Visitor for value initialization
+template<typename FFind>
+class AttrInitVisitor {
+ public:
+  // Counter of number of matched attributes during visit.
+  // This is used to decide if there is additional unmatched attributes.
+  size_t hit_count_{0};
+  // constructor
+  AttrInitVisitor(const char* type_key, FFind ffind)
+      : type_key_(type_key), ffind_(ffind) {
+  }
+
+  template<typename T>
+  AttrInitEntry<T> operator()(const char* key, T* value) {
+    TVMArgValue val;
+    AttrInitEntry<T> opt;
+    opt.type_key_ = type_key_;
+    opt.key_ = key;
+    opt.value_ = value;
+    if (ffind_(key, &val)) {
+      SetValue(value, val);
+      opt.value_missing_ = false;
+      ++hit_count_;
+    } else {
+      opt.value_missing_ = true;
+    }
+    return opt;
+  }
+
+ private:
+  // the type key
+  const char* type_key_;
+  FFind ffind_;
+};
+
+template<typename FFind>
+inline AttrInitVisitor<FFind> CreateInitVisitor(
+    const char* type_key,
+    FFind ffind) {
+  return AttrInitVisitor<FFind>(type_key, ffind);
+}
+
+/*!
+ * \brief Helper struct to get the type name known to tvm.
+ * \tparam T the type we are interested in.
+ */
+template<typename T>
+struct TypeName {
+  static constexpr const char* value = T::ContainerType::_type_key;
+};
+
+template<>
+struct TypeName<int> {
+  static constexpr const char* value = "int";
+};
+
+template<>
+struct TypeName<int64_t> {
+  static constexpr const char* value = "int64";
+};
+
+template<>
+struct TypeName<uint64_t> {
+  static constexpr const char* value = "uint64_t";
+};
+
+template<>
+struct TypeName<Type> {
+  static constexpr const char* value = "Type";
+};
+
+template<>
+struct TypeName<std::string> {
+  static constexpr const char* value = "str";
+};
+
+template<>
+struct TypeName<bool> {
+  static constexpr const char* value = "bool";
+};
+
+template<>
+struct TypeName<void*> {
+  static constexpr const char* value = "handle";
+};
+
+template<>
+struct TypeName<double> {
+  static constexpr const char* value = "double";
+};
+
+class AttrDocEntry {
+ public:
+  using TSelf = AttrDocEntry;
+
+  explicit AttrDocEntry(AttrFieldInfo* info)
+      : info_(info) {
+  }
+  TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
+    info_->description = str;
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) {
+    std::ostringstream os;
+    os << info_->type_info << ", default=" << value;
+    info_->type_info = os.str();
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) {
+    return *this;
+  }
+  template<typename T>
+  TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) {
+    return *this;
+  }
+
+ private:
+  AttrFieldInfo* info_;
+};
+
+class AttrDocVisitor {
+ public:
+  template<typename T>
+  AttrDocEntry operator()(const char* key, T* v) {
+    AttrFieldInfo info;
+    info.name = key;
+    info.type_info = TypeName<T>::value;
+    fields_.emplace_back(std::move(info));
+    return AttrDocEntry(&(fields_.back()));
+  }
+
+  std::vector<AttrFieldInfo> fields_;
+};
+
+class AttrExistVisitor {
+ public:
+  std::string key_;
+  bool exist_{false};
+
+  template<typename T>
+  AttrNopEntry operator()(const char* key, T* v) {
+    if (exist_) return AttrNopEntry();
+    if (key == key_) exist_ = true;
+    return AttrNopEntry();
+  }
+};
+}  // namespace detail
+
+/*!
+ * \brief Base class of attribute nodes whose fields are declared in DerivedType.
+ *  Uses the "curiously recurring template pattern".
+ *
+ * \tparam DerivedType The final attribute type.
+ */
+template<typename DerivedType>
+class AttrsNode : public BaseAttrsNode {
+ public:
+  void VisitAttrs(AttrVisitor* v) final {
+    detail::AttrNormalVisitor vis(v);
+    self()->__VisitAttrs__(vis);
+  }
+
+  void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final {
+    CHECK_EQ(args.size() % 2, 0);
+    const int kLinearSearchBound = 16;
+    int hit_count = 0;
+    // Two lookup strategies: linear scan for few args, hash map otherwise.
+    if (args.size() < kLinearSearchBound) {
+      // linear search over the (key, value) pairs.
+      auto ffind = [&args](const char* key, runtime::TVMArgValue* val) {
+        for (int i = 0; i < args.size(); i += 2) {
+          CHECK_EQ(args.type_codes[i], kStr);
+          if (!std::strcmp(key, args.values[i].v_str)) {
+            *val = args[i + 1];
+            return true;
+          }
+        }
+        return false;
+      };
+      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    } else {
+      // construct a map then do lookup.
+      std::unordered_map<std::string, runtime::TVMArgValue> kwargs;
+      for (int i = 0; i < args.size(); i += 2) {
+        CHECK_EQ(args.type_codes[i], kStr);
+        kwargs[args[i].operator std::string()] = args[i + 1];
+      }
+      auto ffind = [&kwargs](const char *key, runtime::TVMArgValue* val) {
+        auto it = kwargs.find(key);
+        if (it != kwargs.end()) {
+          *val = it->second;
+          return true;
+        }
+        return false;
+      };
+      auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind);
+      self()->__VisitAttrs__(vis);
+      hit_count = vis.hit_count_;
+    }
+    // error handling, slow path: report the first key that matches no field.
+    if (hit_count * 2 != args.size() && !allow_unknown) {
+      for (int i = 0; i < args.size(); i += 2) {
+        detail::AttrExistVisitor visitor;
+        visitor.key_ = args[i].operator std::string();
+        self()->__VisitAttrs__(visitor);
+        if (!visitor.exist_) {
+          std::ostringstream os;
+          os << DerivedType::_type_key
+             << ": does not have field \'" << visitor.key_
+             << "\', Possible fields:\n";
+          os << "----------------\n";
+          this->PrintDocString(os);
+          throw AttrError(os.str());
+        }
+      }
+    }
+  }
+
+  std::vector<AttrFieldInfo> ListFieldInfo() const final {
+    detail::AttrDocVisitor visitor;
+    self()->__VisitAttrs__(visitor);
+    return visitor.fields_;
+  }
+
+ private:
+  DerivedType* self() const {
+    return const_cast<DerivedType*>(
+        static_cast<const DerivedType*>(this));
+  }
+};
+
+
+template<typename... Args>
+inline void BaseAttrsNode::InitBySeq(Args&& ...args) {
+  runtime::PackedFunc pf([this](const TVMArgs& args, TVMRetValue *rv) {
+      this->InitByPackedArgs(args);
+    });
+  pf(std::forward<Args>(args)...);
+}
+
+inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*)
+  const std::vector<AttrFieldInfo> entry = this->ListFieldInfo();
+  for (const AttrFieldInfo& info : entry) {  // bind by reference: no per-field copy
+    os << info.name << " : " << info.type_info << '\n';  // "name : type" line
+    if (info.description.length() != 0) {
+      os << "    " << info.description << '\n';  // indented optional description
+    }
+  }
+}
+
+}  // namespace tvm
+#endif  // TVM_ATTRS_H_
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 6d8df4a5e3d6..63e8ca7cd16b 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -223,6 +223,12 @@ class ExtTypeVTable {
 class TVMPODValue_ {
  public:
   operator double() const {
+    // Allow automatic conversion from int to float
+    // This avoids errors when user pass in int from
+    // the frontend while the API expects a float.
+    if (type_code_ == kDLInt) {
+      return static_cast<double>(value_.v_int64);
+    }
     TVM_CHECK_TYPE_CODE(type_code_, kDLFloat);
     return value_.v_float64;
   }
@@ -310,6 +316,8 @@ class TVMPODValue_ {
  */
 class TVMArgValue : public TVMPODValue_ {
  public:
+  /*! \brief default constructor */
+  TVMArgValue() {}
   /*!
    * \brief constructor
    * \param value of the function
diff --git a/python/tvm/make.py b/python/tvm/make.py
index 49f698f4f663..19949509778b 100644
--- a/python/tvm/make.py
+++ b/python/tvm/make.py
@@ -71,6 +71,17 @@ def node(type_key, **kwargs):
     **kwargs : dict
         The fields of the node.
 
+    Returns
+    -------
+    node : Node
+        The corresponding DSL Node
+
+    Note
+    ----
+    If the created node is instance of AttrsNode, then
+    the creator function will also run bound checks and
+    default value setup as supported by Attrs.
+
     Example
     -------
     The following code constructs a IntImm object
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 70301993ad3a..3583f42a00c9 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -33,18 +33,6 @@ TVM_REGISTER_API("_load_json")
     *ret = LoadJSON<NodeRef>(args[0]);
   });
 
-TVM_REGISTER_API("_nop")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-  });
-
-// internal fucntion used for debug and testing purposes
-TVM_REGISTER_API("_ndarray_use_count")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    runtime::NDArray nd = args[0];
-    // substract the current one
-    *ret = (nd.use_count() - 1);
-  });
-
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
diff --git a/src/api/api_test.cc b/src/api/api_test.cc
new file mode 100644
index 000000000000..1744267fdcd7
--- /dev/null
+++ b/src/api/api_test.cc
@@ -0,0 +1,46 @@
+ /*!
+ *  Copyright (c) 2018 by Contributors
+ *  Code mainly used for test purposes.
+ * \file api_test.cc
+ */
+#include <tvm/expr.h>
+#include <tvm/tensor.h>
+#include <tvm/attrs.h>
+#include <tvm/api_registry.h>
+
+namespace tvm {
+// Attrs node used to test attrs construction from the python API.
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Array<Expr> padding;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name");
+    TVM_ATTR_FIELD(padding)
+        .describe("padding of input")
+        .set_default(Array<Expr>({0, 0}));
+  }
+};
+
+TVM_REGISTER_NODE_TYPE(TestAttrs);
+
+TVM_REGISTER_API("_nop")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+  });
+
+// internal function used for debug and testing purposes
+TVM_REGISTER_API("_ndarray_use_count")
+.set_body([](TVMArgs args,  TVMRetValue *ret) {
+    runtime::NDArray nd = args[0];
+    // subtract the reference held by the local nd
+    *ret = (nd.use_count() - 1);
+  });
+
+}  // namespace tvm
diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc
index 80d7c3163e10..9157e62fda8a 100644
--- a/src/api/dsl_api.cc
+++ b/src/api/dsl_api.cc
@@ -7,6 +7,7 @@
 #include <dmlc/logging.h>
 #include <dmlc/thread_local.h>
 #include <tvm/api_registry.h>
+#include <tvm/attrs.h>
 #include <vector>
 #include <string>
 #include <exception>
@@ -124,22 +125,35 @@ class DSLAPIImpl : public DSLAPI {
         (*static_cast<TVMAPINode*>(handle))->type_index());
   }
   void NodeGetAttr(NodeHandle handle,
-                  const char* key,
-                  TVMValue* ret_val,
-                  int* ret_type_code,
-                  int* ret_success) const final {
+                   const char* key,
+                   TVMValue* ret_val,
+                   int* ret_type_code,
+                   int* ret_success) const final {
     TVMRetValue rv;
     APIAttrGetter getter;
+    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     getter.skey = key;
     getter.ret = &rv;
-    TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     if (getter.skey == "type_key") {
       ret_val->v_str = (*tnode)->type_key();
       *ret_type_code = kStr;
       *ret_success = 1;
-    } else {
+      return;
+    } else if (!(*tnode)->is_type<DictAttrsNode>()) {
       (*tnode)->VisitAttrs(&getter);
       *ret_success = getter.found_ref_object || rv.type_code() != kNull;
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      auto it = dnode->dict.find(key);
+      if (it != dnode->dict.end()) {
+        *ret_success = 1;
+        rv = (*it).second;
+      } else {
+        *ret_success = 0;
+      }
+    }
+    if (*ret_success) {
       if (rv.type_code() == kStr ||
           rv.type_code() == kTVMType) {
         TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get();
@@ -159,7 +173,16 @@ class DSLAPIImpl : public DSLAPI {
     TVMAPINode* tnode = static_cast<TVMAPINode*>(handle);
     APIAttrDir dir;
     dir.names = &(ret->ret_vec_str);
-    (*tnode)->VisitAttrs(&dir);
+
+    if (!(*tnode)->is_type<DictAttrsNode>()) {
+      (*tnode)->VisitAttrs(&dir);
+    } else {
+      // specially handle dict attr
+      DictAttrsNode* dnode = static_cast<DictAttrsNode*>(tnode->get());
+      for (const auto& kv : dnode->dict) {
+        ret->ret_vec_str.push_back(kv.first);
+      }
+    }
     ret->ret_vec_charp.clear();
     for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) {
       ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str());
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
new file mode 100644
index 000000000000..49a91983e79d
--- /dev/null
+++ b/src/lang/attrs.cc
@@ -0,0 +1,45 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file attrs.cc
+ */
+#include <tvm/attrs.h>
+
+namespace tvm {
+
+void DictAttrsNode::VisitAttrs(AttrVisitor* v)  {
+  v->Visit("__dict__", &dict);
+}
+
+void DictAttrsNode::InitByPackedArgs(
+    const runtime::TVMArgs& args, bool allow_unknown) {
+  for (int i = 0; i < args.size(); i += 2) {
+    std::string key = args[i];
+    runtime::TVMArgValue val = args[i + 1];
+    if (val.type_code() == kNodeHandle) {
+      dict.Set(key, val.operator NodeRef());
+    } else if (val.type_code() == kStr) {
+      dict.Set(key, Expr(val.operator std::string()));
+    } else {
+      dict.Set(key, val.operator Expr());
+    }
+  }
+}
+
+std::vector<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
+  return {};
+}
+
+Attrs DictAttrsNode::make(Map<std::string, NodeRef> dict) {
+  std::shared_ptr<DictAttrsNode> n = std::make_shared<DictAttrsNode>();
+  n->dict = std::move(dict);
+  return Attrs(n);
+}
+
+TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
+.set_dispatch<DictAttrsNode>([](const DictAttrsNode *op, IRPrinter *p) {
+    p->stream << op->dict;
+});
+
+TVM_REGISTER_NODE_TYPE(DictAttrsNode);
+
+}  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 7c4e862f0abb..9fb9143aa7f4 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -5,6 +5,7 @@
  */
 #include <tvm/base.h>
 #include <tvm/expr.h>
+#include <tvm/attrs.h>
 #include <tvm/container.h>
 #include <tvm/packed_func_ext.h>
 #include <tvm/runtime/ndarray.h>
@@ -467,22 +468,15 @@ class NodeAttrSetter : public AttrVisitor {
   }
 };
 
-// API function to make node.
-// args format:
-//    type_key, key1, value1, ..., key_n, value_n
-void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
+
+void InitNodeByPackedArgs(Node* n, const TVMArgs& args) {
   NodeAttrSetter setter;
-  setter.type_key = args[0].operator std::string();
-  CHECK_EQ(args.size() % 2, 1);
-  for (int i = 1; i < args.size(); i += 2) {
-    setter.attrs.emplace(
-        args[i].operator std::string(),
-        runtime::TVMArgValue(args.values[i + 1], args.type_codes[i + 1]));
-  }
-  auto* f = dmlc::Registry<NodeFactoryReg>::Find(setter.type_key);
-  CHECK(f != nullptr)
-      << "Node type \'" << setter.type_key << "\' is not registered in TVM";
-  std::shared_ptr<Node> n = f->body();
+  setter.type_key = n->type_key();
+  CHECK_EQ(args.size() % 2, 0);
+  for (int i = 0; i < args.size(); i += 2) {
+    setter.attrs.emplace(args[i].operator std::string(),
+                         args[i + 1]);
+  }
   n->VisitAttrs(&setter);
   if (setter.attrs.size() != 0) {
     std::ostringstream os;
@@ -492,10 +486,26 @@ void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) {
     }
     LOG(FATAL) << os.str();
   }
+}
+
+// API function to make node.
+// args format:
+//   type_key, key1, value1, ..., key_n, value_n
+void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
+  std::string type_key = args[0];
+  auto* f = dmlc::Registry<NodeFactoryReg>::Find(type_key);
+  CHECK(f != nullptr)
+      << "Node type \'" << type_key << "\' is not registered in TVM";
+  TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);  // drop type_key
+  std::shared_ptr<Node> n = f->body();
+  if (n->derived_from<BaseAttrsNode>()) {
+    static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
+  } else {
+    InitNodeByPackedArgs(n.get(), kwargs);
+  }
   *rv = NodeRef(n);
 }
 
 TVM_REGISTER_GLOBAL("make._Node")
 .set_body(MakeNode);
-
 }  // namespace tvm
diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc
new file mode 100644
index 000000000000..138e0b242e02
--- /dev/null
+++ b/tests/cpp/attrs_test.cc
@@ -0,0 +1,76 @@
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/attrs.h>
+#include <tvm/ir.h>
+
+namespace tvm {
+namespace test {
+// test example usage docs
+struct TestAttrs : public AttrsNode<TestAttrs> {
+  int axis;
+  std::string name;
+  Expr expr;
+  double learning_rate;
+
+  TVM_DECLARE_ATTRS(TestAttrs, "attrs.cpptest.TestAttrs") {
+    TVM_ATTR_FIELD(axis)
+        .set_default(10)
+        .set_lower_bound(1)
+        .set_upper_bound(10)
+        .describe("axis field");
+    TVM_ATTR_FIELD(name)
+        .describe("name of the field");
+    TVM_ATTR_FIELD(expr)
+        .describe("expression field")
+        .set_default(make_const(Int(32), 1));
+    TVM_ATTR_FIELD(learning_rate)
+        .describe("learning_rate")
+        .set_default(0.1);
+  }
+};
+}
+}
+
+TEST(Attrs, Basic) {
+  using namespace tvm;
+  using namespace tvm::test;
+  std::shared_ptr<TestAttrs> n = std::make_shared<TestAttrs>();
+  try {
+    n->InitBySeq("axis", 10);
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+  try {
+    n->InitBySeq("axis", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+  }
+
+  try {
+    n->InitBySeq("axisx", 12, "name", "111");
+    LOG(FATAL) << "bad";
+  } catch (const tvm::AttrError& e) {
+    std::string what = e.what();
+    CHECK(what.find("expr : Expr, default=1") != std::string::npos);
+    CHECK(what.find("axisx") != std::string::npos);
+  }
+  n->InitBySeq("learning_rate", Expr(1), "expr", 128, "name", "xx");
+  CHECK_EQ(n->learning_rate, 1.0);
+
+  n->InitBySeq("name", "xxx", "expr", 128);
+  CHECK_EQ(n->name, "xxx");
+  CHECK_EQ(n->axis, 10);
+  CHECK_EQ(n->expr.as<tvm::ir::IntImm>()->value, 128);
+  // Check docstring
+  std::ostringstream os;
+  n->PrintDocString(os);
+  LOG(INFO) << "docstring\n"<< os.str();
+  CHECK(os.str().find("expr : Expr, default=1") != std::string::npos);
+}
+
+
+int main(int argc, char ** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index fefb8771a812..2ba67b8d9c86 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -36,6 +36,31 @@ def test_make_node():
     assert AA.op == A.op
     assert AA.value_index == A.value_index
 
+
+def test_make_attrs():
+    try:
+        x = tvm.make.node("attrs.TestAttrs", unknown_key=1, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("unknown_key") != -1
+
+    try:
+        x = tvm.make.node("attrs.TestAttrs", axis=100, name="xx")
+        assert False
+    except tvm.TVMError as e:
+        assert str(e).find("upper bound") != -1
+
+    x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4))
+    assert x.name == "xx"
+    assert x.padding[0].value == 3
+    assert x.padding[1].value == 4
+    assert x.axis == 10
+
+    dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
+    assert dattr.x.value == 1
+
+
+
 def test_make_sum():
     A = tvm.placeholder((2, 10), name='A')
     k = tvm.reduce_axis((0,10), "k")
@@ -46,6 +71,7 @@ def test_make_sum():
     assert BB.op.body[0].combiner is not None
 
 if __name__ == "__main__":
+    test_make_attrs()
     test_make_node()
     test_make_smap()
     test_const_saveload_json()

From 4dc21bdb29fdaecd0a45982c31f4dc4cae99238a Mon Sep 17 00:00:00 2001
From: Pariksheet Pinjari <pariksheet.pinjari@huawei.com>
Date: Mon, 20 Aug 2018 22:43:35 +0530
Subject: [PATCH 39/77] [NNVM][DARKNET]Yolo and Upsample frontend support
 (#1501)

* Yolo and Upsample frontend support

* Lint fix

* Mac support added

* Code clean and trigger CI
---
 nnvm/python/nnvm/frontend/darknet.py          | 32 +++++++-
 nnvm/python/nnvm/testing/darknet.py           | 25 +++++-
 nnvm/python/nnvm/top/vision.py                | 15 ++++
 nnvm/src/top/vision/yolo/yolo.cc              | 33 ++++++++
 .../python/frontend/darknet/test_forward.py   | 30 +++++++-
 tutorials/nnvm/from_darknet.py                | 77 +++++++++----------
 6 files changed, 165 insertions(+), 47 deletions(-)
 create mode 100644 nnvm/src/top/vision/yolo/yolo.cc

diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py
index 3aa36b7e7ef9..7fb3e34750c8 100644
--- a/nnvm/python/nnvm/frontend/darknet.py
+++ b/nnvm/python/nnvm/frontend/darknet.py
@@ -32,8 +32,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -257,6 +261,12 @@ def _darknet_reshape(inputs, attrs):
     new_attrs['shape'] = _darknet_required_attr(attrs, 'shape')
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_upsampling(inputs, attrs):
+    """Process the upsampling operation."""
+    op_name, new_attrs = 'upsampling', {}
+    new_attrs['scale'] = attrs.get('scale', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_softmax_output(inputs, attrs):
     """Process the softmax operation."""
     temperature = attrs.get('temperature', 1)
@@ -298,6 +308,15 @@ def _darknet_region(inputs, attrs):
         new_attrs['softmax'] = attrs.get('softmax', 0)
     return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
 
+def _darknet_yolo(inputs, attrs):
+    """Process the yolo operation."""
+    op_name, new_attrs = 'yolov3_yolo', {}
+    if 'n' in attrs:
+        new_attrs['n'] = attrs.get('n', 1)
+    if 'classes' in attrs:
+        new_attrs['classes'] = attrs.get('classes', 1)
+    return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None
+
 def _darknet_activations(inputs, attrs):
     """Process the activation function."""
     act = _darknet_required_attr(attrs, 'activation')
@@ -350,6 +369,8 @@ def _darknet_op_not_support(inputs, attrs):
     LAYERTYPE.REORG           : _darknet_reorg,
     LAYERTYPE.REGION          : _darknet_region,
     LAYERTYPE.SHORTCUT        : _darknet_shortcut,
+    LAYERTYPE.UPSAMPLE        : _darknet_upsampling,
+    LAYERTYPE.YOLO            : _darknet_yolo,
     LAYERTYPE.DETECTION       : _darknet_op_not_support,
     LAYERTYPE.CROP            : _darknet_op_not_support,
     LAYERTYPE.COST            : _darknet_op_not_support,
@@ -575,6 +596,13 @@ def _get_darknet_attrs(self, layer, layer_num):
             attr.update({'coords' : layer.coords})
             attr.update({'background' : layer.background})
             attr.update({'softmax' : layer.softmax})
+
+        elif LAYERTYPE.YOLO == layer.type:
+            attr.update({'n' : layer.n})
+            attr.update({'classes' : layer.classes})
+
+        elif LAYERTYPE.UPSAMPLE == layer.type:
+            attr.update({'scale' : layer.stride})
         else:
             err = "Darknet layer type {} is not supported in nnvm.".format(layer.type)
             raise NotImplementedError(err)
diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py
index e3d110e9605e..9a346e01b50b 100644
--- a/nnvm/python/nnvm/testing/darknet.py
+++ b/nnvm/python/nnvm/testing/darknet.py
@@ -115,8 +115,12 @@ class LAYERTYPE(object):
     NETWORK = 20
     XNOR = 21
     REGION = 22
-    REORG = 23
-    BLANK = 24
+    YOLO = 23
+    REORG = 24
+    UPSAMPLE = 25
+    LOGXENT = 26
+    L2NORM = 27
+    BLANK = 28
 
 class ACTIVATION(object):
     """Darknet ACTIVATION Class constant."""
@@ -182,12 +186,16 @@ class ACTIVATION(object):
     NETWORK,
     XNOR,
     REGION,
+    YOLO,
     REORG,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
     BLANK
 } LAYERTYPE;
 
 typedef enum{
-    SSE, MASKED, LONE, SEG, SMOOTH
+    SSE, MASKED, L1, SEG, SMOOTH, WGAN
 } COSTTYPE;
 
 
@@ -241,18 +249,20 @@ class ACTIVATION(object):
     float shift;
     float ratio;
     float learning_rate_scale;
+    float clip;
     int softmax;
     int classes;
     int coords;
     int background;
     int rescore;
     int objectness;
-    int does_cost;
     int joint;
     int noadjust;
     int reorg;
     int log;
     int tanh;
+    int *mask;
+    int total;
 
     float alpha;
     float beta;
@@ -265,13 +275,17 @@ class ACTIVATION(object):
     float class_scale;
     int bias_match;
     int random;
+    float ignore_thresh;
+    float truth_thresh;
     float thresh;
+    float focus;
     int classfix;
     int absolute;
 
     int onlyforward;
     int stopbackward;
     int dontload;
+    int dontsave;
     int dontloadscales;
 
     float temperature;
@@ -309,6 +323,7 @@ class ACTIVATION(object):
 
     float * delta;
     float * output;
+    float * loss;
     float * squared;
     float * norms;
 
@@ -462,6 +477,7 @@ class ACTIVATION(object):
     int train;
     int index;
     float *cost;
+    float clip;
 } network;
 
 
@@ -491,6 +507,7 @@ class ACTIVATION(object):
 layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
 layer make_softmax_layer(int batch, int inputs, int groups);
 layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
 layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py
index f2e12c0f367a..e59b2bdfe6d9 100644
--- a/nnvm/python/nnvm/top/vision.py
+++ b/nnvm/python/nnvm/top/vision.py
@@ -38,6 +38,21 @@ def schedule_region(attrs, outs, target):
 
 reg.register_pattern("yolo_region", OpPattern.OPAQUE)
 
+@reg.register_compute("yolov3_yolo")
+def compute_yolo(attrs, inputs, _):
+    """Compute definition of yolo"""
+    n = attrs.get_int("n")
+    classes = attrs.get_int("classes")
+    return topi.vision.yolo.yolo(inputs[0], n, classes)
+
+@reg.register_schedule("yolov3_yolo")
+def schedule_yolo(attrs, outs, target):
+    """Schedule definition of yolo"""
+    with tvm.target.create(target):
+        return topi.generic.schedule_injective(outs)
+
+reg.register_pattern("yolov3_yolo", OpPattern.OPAQUE)
+
 # multibox_prior
 @reg.register_schedule("multibox_prior")
 def schedule_multibox_prior(_, outs, target):
diff --git a/nnvm/src/top/vision/yolo/yolo.cc b/nnvm/src/top/vision/yolo/yolo.cc
new file mode 100644
index 000000000000..4800f4371f9d
--- /dev/null
+++ b/nnvm/src/top/vision/yolo/yolo.cc
@@ -0,0 +1,33 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file yolo.cc
+ * \brief Property def of yolo operators.
+ */
+#include <nnvm/op.h>
+#include <nnvm/node.h>
+#include <nnvm/op_attr_types.h>
+#include <nnvm/top/nn.h>
+#include "../../elemwise_op_common.h"
+
+namespace nnvm {
+namespace top {
+
+NNVM_REGISTER_OP(yolov3_yolo)
+.describe(R"code(Yolo layer
+)code" NNVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_support_level(5)
+.add_argument("data", "Tensor", "Input data")
+.set_attr<FInferShape>("FInferShape", ElemwiseShape<1, 1>)
+.set_attr<FInplaceOption>(
+    "FInplaceOption",
+    [](const NodeAttrs &attrs) {
+      return std::vector<std::pair<int, int>>{{0, 0}};  // the op has only input 0
+    })
+.set_attr<FGradient>("FGradient", [](const NodePtr &n,
+                                     const std::vector<NodeEntry> &ograds) {
+  return std::vector<NodeEntry>{ograds[0]};  // one gradient entry per declared input
+});
+}  // namespace top
+}  // namespace nnvm
diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py
index 5fc71a86211e..3d7d06b48483 100644
--- a/nnvm/tests/python/frontend/darknet/test_forward.py
+++ b/nnvm/tests/python/frontend/darknet/test_forward.py
@@ -44,7 +44,7 @@ def _download(url, path, overwrite=False, sizecompare=False):
     except:
         urllib.urlretrieve(url, path)
 
-DARKNET_LIB = 'libdarknet.so'
+DARKNET_LIB = 'libdarknet2.0.so'
 DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \
                                     + DARKNET_LIB + '?raw=true'
 _download(DARKNETLIB_URL, DARKNET_LIB)
@@ -239,6 +239,8 @@ def test_forward_shortcut():
     layer_2 = LIB.make_convolutional_layer(1, 111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0)
     layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32)
     layer_3.activation = 1
+    layer_3.alpha = 1
+    layer_3.beta = 1
     net.layers[0] = layer_1
     net.layers[1] = layer_2
     net.layers[2] = layer_3
@@ -272,6 +274,30 @@ def test_forward_region():
     test_forward(net)
     LIB.free_network(net)
 
+def test_forward_yolo_op():
+    '''Test forward of a two-layer network: convolutional layer, then yolo layer.'''
+    net = LIB.make_network(2)
+    layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0)
+    a = []
+    layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 0, a, 2)
+    net.layers[0] = layer_1
+    net.layers[1] = layer_2
+    net.w = net.h = 224
+    LIB.resize_network(net, 224, 224)
+    test_forward(net)
+    LIB.free_network(net)
+
+def test_forward_upsample():
+    '''Test forward of a single-layer network holding one upsample layer.'''
+    net = LIB.make_network(1)
+    layer = LIB.make_upsample_layer(1, 19, 19, 3, 3)
+    layer.scale = 1
+    net.layers[0] = layer
+    net.w = net.h = 19
+    LIB.resize_network(net, 19, 19)
+    test_forward(net)
+    LIB.free_network(net)
+
 def test_forward_elu():
     '''test elu activation layer'''
     net = LIB.make_network(1)
@@ -428,6 +454,8 @@ def test_forward_activation_logistic():
     test_forward_rnn()
     test_forward_reorg()
     test_forward_region()
+    test_forward_yolo_op()
+    test_forward_upsample()
     test_forward_elu()
     test_forward_rnn()
     test_forward_crnn()
diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py
index 883026f2af98..c6b70cf59413 100644
--- a/tutorials/nnvm/from_darknet.py
+++ b/tutorials/nnvm/from_darknet.py
@@ -22,54 +22,48 @@
 import numpy as np
 import tvm
 import os
+import sys
 
 from ctypes import *
 from tvm.contrib.download import download
 from nnvm.testing.darknet import __darknetffi__
 
-######################################################################
-# Set the parameters here.
-# Supported models alexnet, resnet50, resnet152, extraction, yolo
-#
-model_name = 'yolo'
-test_image = 'dog.jpg'
-target = 'llvm'
-ctx = tvm.cpu(0)
+#Model name
+MODEL_NAME = 'yolo'
 
 ######################################################################
-# Prepare cfg and weights file
-# ----------------------------
-# Pretrained model available https://pjreddie.com/darknet/imagenet/
-# Download cfg and weights file first time.
+# Download required files
+# -----------------------
+# Download cfg and weights file if first time.
+CFG_NAME = MODEL_NAME + '.cfg'
+WEIGHTS_NAME = MODEL_NAME + '.weights'
+REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/'
+CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true'
+WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true'
+
+download(CFG_URL, CFG_NAME)
+download(WEIGHTS_URL, WEIGHTS_NAME)
 
-cfg_name = model_name + '.cfg'
-weights_name = model_name + '.weights'
-cfg_url = 'https://github.com/siju-samuel/darknet/blob/master/cfg/' + \
-            cfg_name + '?raw=true'
-weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true'
-
-download(cfg_url, cfg_name)
-download(weights_url, weights_name)
-
-######################################################################
 # Download and Load darknet library
-# ---------------------------------
-
-darknet_lib = 'libdarknet.so'
-darknetlib_url = 'https://github.com/siju-samuel/darknet/blob/master/lib/' + \
-                        darknet_lib + '?raw=true'
-download(darknetlib_url, darknet_lib)
-
-#if the file doesnt exist, then exit normally.
-if os.path.isfile('./' + darknet_lib) is False:
-    exit(0)
-
-darknet_lib = __darknetffi__.dlopen('./' + darknet_lib)
-cfg = "./" + str(cfg_name)
-weights = "./" + str(weights_name)
-net = darknet_lib.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
+if sys.platform in ['linux', 'linux2']:
+    DARKNET_LIB = 'libdarknet2.0.so'
+    DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true'
+elif sys.platform == 'darwin':
+    DARKNET_LIB = 'libdarknet_mac2.0.so'
+    DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true'
+else:
+    err = "Darknet lib is not supported on {} platform".format(sys.platform)
+    raise NotImplementedError(err)
+
+download(DARKNET_URL, DARKNET_LIB)
+
+DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB)
+cfg = "./" + str(CFG_NAME)
+weights = "./" + str(WEIGHTS_NAME)
+net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0)
 dtype = 'float32'
 batch_size = 1
+
 print("Converting darknet to nnvm symbols...")
 sym, params = nnvm.frontend.darknet.from_darknet(net, dtype)
 
@@ -77,7 +71,9 @@
 # Compile the model on NNVM
 # -------------------------
 # compile the model
-data = np.empty([batch_size, net.c ,net.h, net.w], dtype);
+target = 'llvm'
+ctx = tvm.cpu(0)
+data = np.empty([batch_size, net.c, net.h, net.w], dtype)
 shape = {'data': data.shape}
 print("Compiling the model...")
 with nnvm.compiler.build_config(opt_level=2):
@@ -103,6 +99,7 @@ def save_lib():
 ######################################################################
 # Load a test image
 # --------------------------------------------------------------------
+test_image = 'dog.jpg'
 print("Loading the test image...")
 img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \
             test_image   +'?raw=true'
@@ -134,7 +131,7 @@ def save_lib():
 hier_thresh = 0.5
 img = nnvm.testing.darknet.load_image_color(test_image)
 _, im_h, im_w = img.shape
-probs= []
+probs = []
 boxes = []
 region_layer = net.layers[net.n - 1]
 boxes, probs = nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h,
@@ -157,5 +154,5 @@ def save_lib():
 
 nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n,
                  thresh, boxes, probs, names, region_layer.classes)
-plt.imshow(img.transpose(1,2,0))
+plt.imshow(img.transpose(1, 2, 0))
 plt.show()

From 5d533ec99b2b6b53f9abfa2f8844a82f180043de Mon Sep 17 00:00:00 2001
From: Yao Wang <kevinthesunwy@gmail.com>
Date: Mon, 20 Aug 2018 13:27:31 -0700
Subject: [PATCH 40/77] Improve x86 Inception (#1506)

* Improve x86 pooling and concat

* Fix

* Fix test concatenate correct layout

* Add conditional vectorize

* Fix lint

* Modify schedule for global pooling

* Fix

* Fix warning

* Fix alter layout test

* Remove vectorization for pooling when using 4D layout

* Remove vectorization for 4D concat

* Fix concatenate layout

* Fix concatenate schedule

* Fix concat

* Fix lint

* Fix concat

* Simplify pooling logic

* Update docstring

* Fix test topi pooling

* Small changes
---
 nnvm/python/nnvm/top/nn.py                    | 10 ++--
 nnvm/python/nnvm/top/transform.py             |  8 +++-
 nnvm/src/top/tensor/transform.cc              | 24 ++++++++--
 .../python/unittest/test_correct_layout.py    | 21 +++++++--
 topi/include/topi/nn/pooling.h                | 12 ++---
 topi/python/topi/cuda/pooling.py              |  7 ++-
 topi/python/topi/generic/injective.py         | 17 +++++++
 topi/python/topi/generic/nn.py                |  5 +-
 topi/python/topi/opengl/pooling.py            |  7 ++-
 topi/python/topi/x86/injective.py             | 46 +++++++++++++++++++
 topi/python/topi/x86/pooling.py               | 44 +++++++++++++++---
 topi/tests/python/test_topi_pooling.py        |  6 ++-
 12 files changed, 174 insertions(+), 33 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index f9a2c2813a04..b452738123c3 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -280,20 +280,22 @@ def schedule_conv2d_transpose(attrs, outs, target):
 
 # max_pool2d
 @reg.register_schedule("max_pool2d")
-def schedule_max_pool2d(_, outs, target):
+def schedule_max_pool2d(attrs, outs, target):
     """Schedule definition of max_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
 # avg_pool2d
 @reg.register_schedule("avg_pool2d")
-def schedule_avg_pool2d(_, outs, target):
+def schedule_avg_pool2d(attrs, outs, target):
     """Schedule definition of avg_pool2d"""
+    layout = attrs["layout"]
     with tvm.target.create(target):
-        return topi.generic.schedule_pool(outs)
+        return topi.generic.schedule_pool(outs, layout)
 
 reg.register_pattern("avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py
index facb345c1abe..594007239d4a 100644
--- a/nnvm/python/nnvm/top/transform.py
+++ b/nnvm/python/nnvm/top/transform.py
@@ -2,6 +2,7 @@
 """Tensor transformation ops"""
 from __future__ import absolute_import
 
+import tvm
 import topi
 from .tensor import _fschedule_broadcast, _fschedule_injective
 from . import registry as reg
@@ -58,8 +59,13 @@ def compute_reshape_like(attrs, inputs, out_info):
 reg.register_schedule("squeeze", _fschedule_injective)
 
 # concatenate
+@reg.register_schedule("concatenate")
+def schedule_concatenate(_, outs, target):
+    """Schedule concatenate with topi.generic.schedule_concatenate under target."""
+    with tvm.target.create(target):
+        return topi.generic.schedule_concatenate(outs)
+
 reg.register_pattern("concatenate", OpPattern.INJECTIVE)
-reg.register_schedule("concatenate", _fschedule_injective)
 
 # split
 reg.register_pattern("split", OpPattern.INJECTIVE)
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 52dca5654838..b1485438ca50 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -129,15 +129,31 @@ inline bool ConcatenateCorrectLayout(const NodeAttrs& attrs,
                                      std::vector<Layout> *ilayouts,
                                      const std::vector<Layout> *last_ilayouts,
                                      std::vector<Layout> *olayouts) {
+  const ConcatenateParam& param = nnvm::get<ConcatenateParam>(attrs.parsed);
   CHECK_EQ(ilayouts->size(), last_ilayouts->size());
   CHECK_EQ(olayouts->size(), 1U);
 
-  for (size_t i = 0; i < ilayouts->size(); ++i) {
-    const Layout& input = last_ilayouts->at(i).defined() ?
-                          last_ilayouts->at(i) : ilayouts->at(i);
-    NNVM_ASSIGN_LAYOUT(*ilayouts, i, input);
+  Layout layout;
+  if (!ilayouts->at(0).defined()) {
+    layout = last_ilayouts->at(0);
+  } else if (param.axis >= static_cast<int>(ilayouts->at(0).ndim())) {
+    CHECK(last_ilayouts->at(0).defined())
+      << "Current input layout " << ilayouts->at(0)
+      << " is invalid but last input layout is not "
+         "defined for the first input.";
+    layout = last_ilayouts->at(0);
+  } else if (last_ilayouts->at(0).defined()
+             && ilayouts->at(0)[param.axis]
+                != last_ilayouts->at(0)[param.axis]) {
+    layout = last_ilayouts->at(0);
+  } else {
+    layout = ilayouts->at(0);
   }
 
+  for (size_t i = 0; i < ilayouts->size(); ++i) {
+    NNVM_ASSIGN_LAYOUT(*ilayouts, i, layout);
+  }
+  NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout);
   return true;
 }
 
diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py
index 6176586284a7..8961498a579e 100644
--- a/nnvm/tests/python/unittest/test_correct_layout.py
+++ b/nnvm/tests/python/unittest/test_correct_layout.py
@@ -77,14 +77,25 @@ def test_concatenate():
     g, ldict = correct_layout(z, {"x": "HW", "y": "HW"})
     assert(ldict["x"][0] == "HW")
     assert(ldict["y"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW")
     # second pass will insert layout transform
     _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"})
     assert(ldict["x"][0] == "HW16w")
     assert(ldict["y"][0] == "HW16w")
-    assert(ldict["x_HW"][0] == "HW")
-    assert(ldict["y_HW"][0] == "HW")
-    assert(ldict["concat"][0] == "__undef__")
+    assert(ldict["concat"][0] == "HW16w")
+
+    x1 = sym.Variable("x", shape=(10, 20, 60))
+    x2 = sym.Variable("y", shape=(10, 20, 40))
+    z = sym.concatenate(x1, x2, axis=2, name="concat")
+    g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"})
+    assert(ldict["x"][0] == "H20wW")
+    assert(ldict["y"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
+    # second pass will insert a layout transform for each input
+    _, ldict = correct_layout(g, {"x": "HW", "y": "HW"})
+    assert(ldict["x_H20wW"][0] == "H20wW")
+    assert(ldict["y_H20wW"][0] == "H20wW")
+    assert(ldict["concat"][0] == "H20wW")
 
 
 def test_expand_dims():
@@ -349,4 +360,4 @@ def test_reduce():
     test_transpose()
     test_broadcast_to()
     test_broadcast_binary()
-    test_reduce()
\ No newline at end of file
+    test_reduce()
diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h
index 26d61d42991d..ca318adfe6cb 100644
--- a/topi/include/topi/nn/pooling.h
+++ b/topi/include/topi/nn/pooling.h
@@ -112,18 +112,18 @@ inline Tensor pool_impl(const Tensor& x,
     }, "tensor", "pool_max");
   } else if (pool_type == kAvgPool) {
     auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x;
-    auto tsum = tvm::compute(out_shape, [&](const Array<Var>& output) {
+    auto tavg = [&](const Array<Var>& output, Expr divide_factor) {
       Array<Expr> indices;
       for (const Var& var : output) indices.push_back(var);
       indices.Set(height_axis, output[height_axis] * stride_height + dheight);
       indices.Set(width_axis, output[width_axis] * stride_width + dwidth);
-      return tvm::sum(temp(indices), { dheight, dwidth });
-    }, "tensor", "pool_avg");
+      return tvm::sum(temp(indices) / divide_factor, { dheight, dwidth });
+    };
 
     return tvm::compute(out_shape,
     [&](const Array<Var>& output) {
       if (count_include_pad) {
-        return tsum(output) / (kernel_height * kernel_width);
+        return tavg(output, kernel_height * kernel_width);
       } else {
         Expr h_start = output[height_axis] * stride_height - pad_top;
         Expr w_start = output[width_axis] * stride_width - pad_left;
@@ -133,9 +133,9 @@ inline Tensor pool_impl(const Tensor& x,
         w_start = ir::Max::make(w_start, make_const(Int(32), 0));
         Expr divide_factor = ir::Max::make((h_end - h_start) * (w_end - w_start),
                                            make_const(Int(32), 1));
-        return tsum(output) / divide_factor;
+        return tavg(output, divide_factor);
       }
-    }, "tensor", kElementWise);
+    }, "tensor", "pool_avg");
   } else {
     LOG(ERROR) << "Unrecognized pool_type: " << pool_type;
     return x;
diff --git a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py
index 637f664fbd36..6b36e9a8743f 100644
--- a/topi/python/topi/cuda/pooling.py
+++ b/topi/python/topi/cuda/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -70,7 +70,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["cuda", "gpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -79,6 +79,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py
index 0a9e394661af..975e4c11ea41 100644
--- a/topi/python/topi/generic/injective.py
+++ b/topi/python/topi/generic/injective.py
@@ -29,5 +29,22 @@ def schedule_injective(outs):
     s[x].fuse(s[x].op.axis)
     return s
 
+@tvm.target.generic_func
+def schedule_concatenate(outs):
+    """Schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of concatenate in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return schedule_injective(outs)
+
 schedule_elemwise = schedule_injective
 schedule_broadcast = schedule_injective
diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py
index 1e01adb899b7..874decc792ec 100644
--- a/topi/python/topi/generic/nn.py
+++ b/topi/python/topi/generic/nn.py
@@ -282,7 +282,7 @@ def schedule_dense(outs):
 
 
 @tvm.target.override_native_generic_func("schedule_pool")
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -291,6 +291,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py
index 8195ea91d8a6..d6dbf0eac5c2 100644
--- a/topi/python/topi/opengl/pooling.py
+++ b/topi/python/topi/opengl/pooling.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-variable
+# pylint: disable=invalid-name, unused-variable, unused-argument
 """Schedule for pooling operators"""
 import tvm
 from .. import tag
@@ -54,7 +54,7 @@ def traverse(OP):
 
 
 @generic.schedule_pool.register(["opengl"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool.
 
     Parameters
@@ -63,6 +63,9 @@ def schedule_pool(outs):
         The computation graph description of pool
         in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     s: Schedule
diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py
index b43ebb98b82f..ac552903ad7f 100644
--- a/topi/python/topi/x86/injective.py
+++ b/topi/python/topi/x86/injective.py
@@ -33,5 +33,51 @@ def schedule_injective(outs):
         s[x].parallel(s[x].op.axis[0])
     return s
 
+@generic.schedule_concatenate.register(["cpu"])
+def schedule_concatenate(outs):
+    """X86 schedule for concatenate op.
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of concatenate in the format
+          of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    def vectorize(sch, tensor, vectorize_limit):
+        """Vectorize the innermost axis of tensor, splitting it if it exceeds the limit."""
+        inner_axis = sch[tensor].op.axis[len(sch[tensor].op.axis) - 1]
+        inner_length = tensor.shape[len(tensor.shape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch[tensor].vectorize(inner_axis)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):  # largest divisor within the limit
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, inner_i = sch[tensor].split(inner_axis, split_factor)
+                sch[tensor].vectorize(inner_i)
+
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    x = outs[0]
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+    if len(s[x].op.axis) >= 5:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2])
+        vectorize(s, x, 64)
+        s[x].parallel(fused)
+    elif len(s[x].op.axis) >= 3:
+        fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
+        s[x].parallel(fused)
+    else:
+        s[x].parallel(s[x].op.axis[0])
+    return s
+
 schedule_elemwise = schedule_injective
 schedule_broadcast = schedule_injective
diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py
index 998edf7a0e16..5fce5f32afb6 100644
--- a/topi/python/topi/x86/pooling.py
+++ b/topi/python/topi/x86/pooling.py
@@ -4,19 +4,47 @@
 from .. import generic
 from .. import tag
 
-def _parallel_sch(sch):
+def _parallel_sch(sch, oshape, do_vectorize=False):
+    def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64):
+        """Internal vectorization utility function."""
+        reorder_axis = [fused_axis]
+        for i in range(num_parallel_axis, len(sch.op.axis) - 1):
+            reorder_axis.append(sch.op.axis[i])
+        kw, kh = sch.op.reduce_axis
+        fuse_k = sch.fuse(kw, kh)
+        c = sch.op.axis[len(sch.op.axis) - 1]
+        reorder_axis += [fuse_k, c]
+        sch.reorder(*reorder_axis)
+        inner_length = oshape[len(oshape) - 1].value
+        if inner_length <= vectorize_limit:
+            sch.vectorize(c)
+        else:
+            split_factor = 1
+            for i in range(vectorize_limit, 1, -1):
+                if inner_length % i == 0:
+                    split_factor = i
+                    break
+            if split_factor > 1:
+                _, c_i = sch.split(c, split_factor)
+                sch.vectorize(c_i)
+
     if len(sch.op.axis) >= 5:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 3)
+
     elif len(sch.op.axis) >= 3:
         fused = sch.fuse(sch.op.axis[0], sch.op.axis[1])
-        sch.parallel(fused)
+        if do_vectorize:
+            vectorize(fused, 2)
     else:
         sch.parallel(sch.op.axis[0])
+        return
+    sch.parallel(fused)
 
 
 @generic.schedule_pool.register(["cpu"])
-def schedule_pool(outs):
+def schedule_pool(outs, layout):
     """Schedule for pool
 
     Parameters
@@ -25,6 +53,9 @@ def schedule_pool(outs):
           The computation graph description of pool
           in the format of an array of tensors.
 
+    layout: str
+        Data layout.
+
     Returns
     -------
     sch: Schedule
@@ -37,7 +68,8 @@ def schedule_pool(outs):
     def _schedule(PaddedInput, Pool):
         if isinstance(PaddedInput.op, tvm.tensor.ComputeOp):
             s[PaddedInput].compute_inline()
-        _parallel_sch(s[Pool])
+        do_vectorize = layout[-1] not in "HWhw"
+        _parallel_sch(s[Pool], outs[0].shape, do_vectorize)
 
     def traverse(OP):
         """Internal travserse function"""
@@ -93,7 +125,7 @@ def traverse(OP):
         # schedule pool
         elif OP.tag.startswith('global_pool'):
             Pool = OP.output(0)
-            _parallel_sch(s[Pool])
+            _parallel_sch(s[Pool], outs[0].shape)
         else:
             raise RuntimeError("Unsupported operator: %s" % OP.tag)
 
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index c9f790146b4a..b87795743c4c 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -10,9 +10,11 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_
     kw = kh
     sw = sh
     pt, pl, pb, pr = padding
+    layout = "NCHW"
     A = tvm.placeholder((n, ic, ih, iw), name='A')
     B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding,
-                     pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad)
+                     pool_type=pool_type, ceil_mode=ceil_mode,
+                     layout="NCHW", count_include_pad=count_include_pad)
     B = topi.nn.relu(B)
     dtype = A.dtype
 
@@ -54,7 +56,7 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
-            s = topi.generic.schedule_pool(B)
+            s = topi.generic.schedule_pool(B, layout)
 
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx)

From 7cb85d81968cd69576d923852d812590b93cc26d Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Mon, 20 Aug 2018 16:28:28 -0700
Subject: [PATCH 41/77] [VERSION] Update to 0.5.dev (#1623)

* [VERSION] Update to 0.5.dev

* Update the docs to include all intrins
---
 NEWS.md                             | 63 +++++++++++++++++++++++++++
 conda/nnvm/meta.yaml                |  2 +-
 conda/topi/meta.yaml                |  2 +-
 conda/tvm-libs/meta.yaml            |  2 +-
 conda/tvm/meta.yaml                 |  2 +-
 docs/api/python/intrin.rst          |  6 +++
 include/tvm/runtime/c_runtime_api.h |  2 +-
 python/tvm/_ffi/libinfo.py          |  6 ++-
 python/update_version.py            | 66 +++++++++++++++++++++++++++++
 web/tvm_runtime.js                  |  2 +-
 10 files changed, 145 insertions(+), 8 deletions(-)
 create mode 100644 python/update_version.py

diff --git a/NEWS.md b/NEWS.md
index 567aabf3fcbd..2c2f616cb2f0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,69 @@ Refer to the Roadmap issue for complete list on on-going version features.
 If you check in something that is not reflected in Roadmap issue, please reply
 to that issue so it can get added.
 
+## 0.4
+
+This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA.
+
+- Tensor operator primitives
+  - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling
+- Enable embedding of asm micro-kernels
+- Hybrid python programming model
+   - python AST based IR builder interface
+   - support GPU programs
+- AutoTVM, Automated tuning, and scheduling
+   - basic autotvm infra
+    - GPU IR verifier
+   - basic autotuning tutorial
+   - topi integration
+- ARM support
+    - winograd support
+   - initial support of ARM autotuning records
+- TOPI Vision
+   - Generic GPU sort support(useful for vision)
+   - SSD operator support
+- TOPI numpy consistency
+   - Rename all binary operators for numpy consistency: broadcast_add -> add, broadcast_sub -> subtract, broadcast_mul -> multiply, broadcast_div -> divide
+   - New operators: slice, LRN, equal, not_equal, less, greater
+   - tutorials on topi
+- Initial low-bit operator support
+    - Optimized popcount generation on ARM
+    - general bit-serial convolution and GEMM
+    - optimized low bit kernels
+    - parallel optimization
+- New topi backend optimization for intel graphics
+- Adapt AVX schedules for SSE target
+- VTA: customized accelerator backend
+  - custom hardware backend example
+  - tutorials on how to use customized accelerator
+- Initial experimental support for HLS backend
+- Bugfix in SPIRV code generator for vulkan
+- libdevice support, enable NVPTX backend
+- Introduce NDArrayContainer for managed NDarray
+- RPC and Device API
+   - Support communication between big/small endian machines.
+   - RPC and device API protocol upgrade to support big-small endian communication. This is a non-backward compatible change: the latest version of the TVM runtime must be used with the RPC
+   - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc
+   - Support tracker in Android RPC, add fault tolerance for AutoTVM
+- BIG.LITTLE aware threadpool
+- tvm4j graph runtime that runs end to end workload in java
+- DLPack support
+   - Support from_dlpack and to_dlpack
+   - Enables bridges to pytorch
+- Enable link of stackvm in runtime
+- Tensorflow graphdef frontend
+- Keras frontend
+   - improved to support reuse layers, add activations
+- ONNX
+   - gather, LRN
+- CoreML frontend
+   - Support C-RNN and activation functions
+- Fix grads for sum and expand_like
+- Enhanced operator fusion for multiple elemwise branches
+- Separate nnvm fusion and compilation pass
+- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda
+
+
 ## 0.3
 
 This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API.
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml
index a8b47d0de118..9c045c177ff6 100644
--- a/conda/nnvm/meta.yaml
+++ b/conda/nnvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: nnvm
diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml
index af2fb4fd4228..4002f577863b 100644
--- a/conda/topi/meta.yaml
+++ b/conda/topi/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: topi
diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml
index dbdfd4a7701f..d6902c45a693 100644
--- a/conda/tvm-libs/meta.yaml
+++ b/conda/tvm-libs/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm-libs
diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml
index 478e095322eb..fe53b7dd49d9 100644
--- a/conda/tvm/meta.yaml
+++ b/conda/tvm/meta.yaml
@@ -1,4 +1,4 @@
-{% set version = "0.4.dev" %}
+{% set version = "0.5.dev" %}
 
 package:
   name: tvm
diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst
index 3942c57f1a04..59f695196ce8 100644
--- a/docs/api/python/intrin.rst
+++ b/docs/api/python/intrin.rst
@@ -6,7 +6,10 @@ tvm.intrin
 
    tvm.call_packed
    tvm.call_pure_intrin
+   tvm.call_intrin
    tvm.call_pure_extern
+   tvm.call_extern
+   tvm.call_llvm_intrin
    tvm.register_intrin_rule
    tvm.exp
    tvm.log
@@ -18,7 +21,10 @@ tvm.intrin
 
 .. autofunction:: tvm.call_packed
 .. autofunction:: tvm.call_pure_intrin
+.. autofunction:: tvm.call_intrin
 .. autofunction:: tvm.call_pure_extern
+.. autofunction:: tvm.call_extern
+.. autofunction:: tvm.call_llvm_intrin
 .. autofunction:: tvm.register_intrin_rule
 .. autofunction:: tvm.exp
 .. autofunction:: tvm.log
diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 32d574340052..52499fb9186f 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -43,7 +43,7 @@
 #endif
 
 // TVM version
-#define TVM_VERSION "0.4.0"
+#define TVM_VERSION "0.5.dev"
 
 
 // TVM Runtime is DLPack compatible.
diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py
index 390849f8536d..f911829d38b1 100644
--- a/python/tvm/_ffi/libinfo.py
+++ b/python/tvm/_ffi/libinfo.py
@@ -100,5 +100,7 @@ def find_lib_path(name=None, search_path=None, optional=False):
 
 
 # current version
-# We use the version of the incoming release for code that is under development
-__version__ = "0.4.0"
+# We use the version of the incoming release for code
+# that is under development.
+# The following line is set by tvm/python/update_version.py
+__version__ = "0.5.dev"
diff --git a/python/update_version.py b/python/update_version.py
new file mode 100644
index 000000000000..9e958f109479
--- /dev/null
+++ b/python/update_version.py
@@ -0,0 +1,66 @@
+"""
+This is the global script that sets the version information of TVM.
+This script runs and updates all the locations that are related to versions.
+
+List of affected files:
+- tvm-root/python/tvm/_ffi/libinfo.py
+- tvm-root/include/tvm/runtime/c_runtime_api.h
+- tvm-root/web/tvm_runtime.js
+- tvm-root/conda/tvm/meta.yaml
+- tvm-root/conda/topi/meta.yaml
+- tvm-root/conda/nnvm/meta.yaml
+- tvm-root/conda/tvm-libs/meta.yaml
+"""
+import os
+import re
+# current version
+# We use the version of the incoming release for code
+# that is under development
+__version__ = "0.5.dev"
+
+# Implementations
+def update(file_name, pattern, repl):
+    """Rewrite the single version string matched by `pattern` in `file_name` to `repl`."""
+    hit_counter = 0
+    need_update = False
+    with open(file_name) as input_file:  # close the handle before rewriting
+        lines = input_file.readlines()
+    for i, line in enumerate(lines):
+        result = re.findall(pattern, line)
+        if result:
+            assert len(result) == 1
+            hit_counter += 1
+            if result[0] != repl:
+                lines[i] = re.sub(pattern, repl, line)
+                need_update = True
+                print("%s: %s->%s" % (file_name, result[0], repl))
+            else:
+                print("%s: version is already %s" % (file_name, repl))
+    # Exactly one line must carry the version; anything else is ambiguous.
+    if hit_counter != 1:
+        raise RuntimeError("Cannot find version in %s" % file_name)
+
+    if need_update:
+        with open(file_name, "w") as output_file:
+            output_file.writelines(lines)
+
+
+def main():
+    curr_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    proj_root = os.path.abspath(os.path.join(curr_dir, ".."))
+    # python path
+    update(os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"),
+           r"(?<=__version__ = \")[.0-9a-z]+", __version__)
+    # C++ header
+    update(os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"),
+           "(?<=TVM_VERSION \")[.0-9a-z]+", __version__)
+    # conda
+    for path in ["tvm", "topi", "nnvm", "tvm-libs"]:
+        update(os.path.join(proj_root, "conda", path, "meta.yaml"),
+               "(?<=version = \")[.0-9a-z]+", __version__)
+    # web
+    update(os.path.join(proj_root, "web", "tvm_runtime.js"),
+           "(?<=@version )[.0-9a-z]+", __version__)
+
+if __name__ == "__main__":
+    main()
diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js
index 786745d3ce88..2eab15093b72 100644
--- a/web/tvm_runtime.js
+++ b/web/tvm_runtime.js
@@ -2,7 +2,7 @@
  * TVM Javascript web runtime library.
  *
  * @projectname tvm
- * @version 0.1
+ * @version 0.5.dev
  */
 /* eslint no-unused-vars: "off" */
 /* eslint no-unexpected-multiline: "off" */

From 21e1301086f74996af2d54e81ff342aa65b6cd35 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 21 Aug 2018 12:40:23 -0500
Subject: [PATCH 42/77] Add int8 gemm recipe (#1614)

---
 topi/recipe/gemm/gemm_int8.py | 185 ++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 topi/recipe/gemm/gemm_int8.py

diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
new file mode 100644
index 000000000000..61ef97d0a2bf
--- /dev/null
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -0,0 +1,185 @@
+"Example code to perform int8 GEMM"
+import logging
+import sys
+import numpy as np
+import tvm
+from tvm import autotvm
+
+DO_TUNING = True
+PRETUNED_INDEX = 75333
+
+def intrin_dot():
+    n = 4  # dp4a requires operands packed by 4
+    x = tvm.placeholder((n,), name='x', dtype='int8')
+    y = tvm.placeholder((n,), name='y', dtype='int8')
+    k = tvm.reduce_axis((0, n), name='k')
+
+    z = tvm.compute(
+        (1,), lambda _: tvm.sum(
+            x[k].astype('int32') * y[k].astype('int32'), axis=k))
+
+    def intrin_func(ins, outs):
+        xx, yy = ins
+        zz = outs[0]
+        ib = tvm.ir_builder.create()
+
+        dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
+                                                 xx.vload(0, dtype='int8x4'),
+                                                 yy.vload(0, dtype='int8x4'),
+                                                 zz.vload(0)))
+        ib.emit(dp4a)
+
+        body = ib.get()
+        return body, zz.vstore(0, 0), body
+
+    with tvm.build_config(data_alignment=4, offset_factor=1) as cfg:
+        binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name,
+                                    data_alignment=cfg.data_alignment,
+                                    offset_factor=cfg.offset_factor,
+                                    scope='local') for t in [x, y, z]}
+        return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds)
+
+
+dot = intrin_dot()
+
+
+@autotvm.template
+def gemm_int8(n, m, l):
+    A = tvm.placeholder((n, l), name='A', dtype='int8')
+    B = tvm.placeholder((m, l), name='B', dtype='int8')
+
+    k = tvm.reduce_axis((0, l), name='k')
+    C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype(
+        'int32'), axis=k), name='C')
+
+    cfg = autotvm.get_config()
+    s = tvm.create_schedule(C.op)
+    y, x = C.op.axis
+
+    AA = s.cache_read(A, 'shared', [C])
+    BB = s.cache_read(B, 'shared', [C])
+    AL = s.cache_read(AA, 'local', [C])
+    BL = s.cache_read(BB, 'local', [C])
+    CC = s.cache_write(C, 'local')
+
+    k = CC.op.reduce_axis[0]
+
+    cfg.define_split('tile_k', cfg.axis(k), num_outputs=3,
+                     filter=lambda entity: entity.size[2] == 4 and \
+                     entity.size[0] * 2 >= entity.size[1])
+
+    ko, kt, ki = cfg['tile_k'].apply(s, CC, k)
+
+    s[CC].tensorize(ki, dot)
+
+    block_x = tvm.thread_axis('blockIdx.x')
+    block_y = tvm.thread_axis('blockIdx.y')
+    thread_x = tvm.thread_axis('threadIdx.x')
+    thread_y = tvm.thread_axis('threadIdx.y')
+
+    def block_size_filter(entity):
+        return entity.size[0] * 2 >= entity.size[1] * 2 and \
+                entity.size[1] <= 16 and entity.size[3] <= 4
+    cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter)
+    cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter)
+    by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y)
+    bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x)
+
+    s[C].bind(by, block_y)
+    s[C].bind(bx, block_x)
+    s[C].bind(tyz, tvm.thread_axis('vthread'))
+    s[C].bind(txz, tvm.thread_axis('vthread'))
+    s[C].bind(ty, thread_y)
+    s[C].bind(tx, thread_x)
+    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)
+
+    s[CC].compute_at(s[C], tx)
+
+    yo, xo = CC.op.axis
+    s[CC].reorder(ko, kt, yo, xo, ki)
+    s[CC].unroll(kt)
+
+    for stage in [AL, BL]:
+        s[stage].compute_at(s[CC], kt)
+        _, xi = s[stage].split(stage.op.axis[1], factor=4)
+        s[stage].vectorize(xi)
+        s[stage].double_buffer()
+
+    cfg.define_knob('storage_align', [16, 48])
+    for stage in [AA, BB]:
+        s[stage].storage_align(s[stage].op.axis[0],
+                               cfg['storage_align'].val, 0)
+        s[stage].compute_at(s[CC], ko)
+
+        fused = s[stage].fuse(*s[stage].op.axis)
+        ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2])
+        tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2])
+        _, xi = s[stage].split(xi, factor=16)
+
+        s[stage].bind(ty, thread_y)
+        s[stage].bind(tx, thread_x)
+        s[stage].vectorize(xi)
+
+    cfg.define_knob('auto_unroll_max_step', [512, 1500])
+    s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val)
+    s[C].pragma(by, 'unroll_explicit', False)
+
+    cfg.add_flop(n*m*l*2)
+    return s, [A, B, C]
+
+
+if __name__ == '__main__':
+    N = 2048
+    n = m = l = N
+
+    logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+    task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda')
+    print(task.config_space)
+
+    measure_option = autotvm.measure_option(
+        measure_func='local', number=10, n_parallel=8, timeout=20)
+    log_name = 'gemm_int8.log'
+    if DO_TUNING:
+        tuner = autotvm.tuner.XGBTuner(task)
+        tuner.tune(n_trial=1000, measure_option=measure_option,
+               callbacks=[autotvm.callback.log_to_file(log_name)])
+
+        dispatch_context = autotvm.apply_history_best(log_name)
+        best_config = dispatch_context.query(task.target, task.workload)
+        print('\nBest config:')
+        print(best_config)
+    else:
+        config = task.config_space.get(PRETUNED_INDEX)
+        dispatch_context = autotvm.task.ApplyConfig(config)
+        print("Using pretuned config:")
+        print(config)
+
+    with dispatch_context:
+        with tvm.target.create('cuda'):
+            s, arg_bufs = gemm_int8(n, m, l)
+            f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8')
+
+    ctx = tvm.context('cuda', 0)
+
+    a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype='int8')
+    b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype='int8')
+
+    a = tvm.nd.array(a_np, ctx)
+    b = tvm.nd.array(b_np, ctx)
+    c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx)
+    f(a, b, c)
+
+    np.testing.assert_allclose(
+        c.asnumpy(),
+        np.dot(
+            a_np.astype('int32'),
+            b_np.T.astype('int32')),
+        rtol=1e-5)
+
+    num_ops = 2 * l * m * n
+    num_runs = 1000
+    timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
+    t = timer_f(a, b, c).mean
+    GOPS = num_ops / (t * 1e3) / 1e6
+    print("average time cost of %d runs = %g ms, %g GOPS." %
+          (num_runs, t * 1e3, GOPS))

From 16d3c1f6af2b1a564c89b37c6c057d007f78bf41 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Tue, 21 Aug 2018 16:35:59 -0700
Subject: [PATCH 43/77] [RUNTIME] Add TypedPackedFunc (#1626)

---
 include/tvm/runtime/packed_func.h | 251 ++++++++++++++++++++++++++++++
 tests/cpp/packed_func_test.cc     |  23 +++
 2 files changed, 274 insertions(+)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 63e8ca7cd16b..758d03b5b18b 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -118,6 +118,163 @@ class PackedFunc {
   FType body_;
 };
 
+/*!
+ * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc<R(Args..)>"
+ */
+template<typename FType>
+class TypedPackedFunc;
+
+/*!
+ * \anchor TypedPackedFuncAnchor
+ * \brief A PackedFunc wrapper to provide typed function signature.
+ * It is backed by a PackedFunc internally.
+ *
+ * TypedPackedFunc enables compile time type checking.
+ * TypedPackedFunc works with the runtime system:
+ * - It can be passed as an argument of PackedFunc.
+ * - It can be assigned to TVMRetValue.
+ * - It can be directly converted to a type-erased PackedFunc.
+ *
+ * Developers should prefer TypedPackedFunc over PackedFunc in C++ code
+ * as it enables compile time checking.
+ * We can construct a TypedPackedFunc from a lambda function
+ * with the same signature.
+ *
+ * \code
+ *  // user defined lambda function.
+ *  auto addone = [](int x)->int {
+ *    return x + 1;
+ *  };
+ *  // We can directly convert
+ *  // lambda function to TypedPackedFunc
+ *  TypedPackedFunc<int(int)> ftyped(addone);
+ *  // invoke the function.
+ *  int y = ftyped(1);
+ *  // Can be directly converted to PackedFunc
+ *  PackedFunc packed = ftyped;
+ * \endcode
+ * \tparam R The return value of the function.
+ * \tparam Args The argument signature of the function.
+ */
+template<typename R, typename ...Args>
+class TypedPackedFunc<R(Args...)> {
+ public:
+  /*! \brief short hand for this function type */
+  using TSelf = TypedPackedFunc<R(Args...)>;
+  /*! \brief default constructor */
+  TypedPackedFunc() {}
+  /*!
+   * \brief construct by wrapping a PackedFunc
+   *
+   * Example usage:
+   * \code
+   * PackedFunc packed([](TVMArgs args, TVMRetValue *rv) {
+   *   int x = args[0];
+   *   *rv = x + 1;
+   *  });
+   * // construct from packed function
+   * TypedPackedFunc<int(int)> ftyped(packed);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param packed The packed function
+   */
+  explicit TypedPackedFunc(PackedFunc packed)
+      : packed_(packed) {
+  }
+  /*!
+   * \brief construct from a lambda function with the same signature.
+   *
+   * Example usage:
+   * \code
+   * auto typed_lambda = [](int x)->int { return x + 1; };
+   * // construct from the typed lambda
+   * TypedPackedFunc<int(int)> ftyped(typed_lambda);
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  explicit TypedPackedFunc(const FLambda& typed_lambda) {
+    this->AssignTypedLambda(typed_lambda);
+  }
+  /*!
+   * \brief copy assignment operator from typed lambda
+   *
+   * Example usage:
+   * \code
+   * // default construct, then assign a typed lambda
+   * TypedPackedFunc<int(int)> ftyped;
+   * ftyped = [](int x) { return x + 1; };
+   * // call the typed version.
+   * CHECK_EQ(ftyped(1), 2);
+   * \endcode
+   *
+   * \param typed_lambda typed lambda function.
+   * \tparam FLambda the type of the lambda function.
+   * \returns reference to self.
+   */
+  template<typename FLambda,
+           typename = typename std::enable_if<
+             std::is_convertible<FLambda,
+                                 std::function<R(Args...)>
+                                 >::value>::type>
+  TSelf& operator=(FLambda typed_lambda) {  // NOLINT(*)
+    this->AssignTypedLambda(typed_lambda);
+    return *this;
+  }
+  /*!
+   * \brief copy assignment operator from PackedFunc.
+   * \param packed The packed function.
+   * \returns reference to self.
+   */
+  TSelf& operator=(PackedFunc packed) {
+    packed_ = packed;
+    return *this;
+  }
+  /*!
+   * \brief Invoke the operator.
+   * \param args The arguments
+   * \returns The return value.
+   */
+  inline R operator()(Args ...args) const;
+  /*!
+   * \brief convert to PackedFunc
+   * \return the internal PackedFunc
+   */
+  operator PackedFunc() const {
+    return packed();
+  }
+  /*!
+   * \return reference to the internal PackedFunc
+   */
+  const PackedFunc& packed() const {
+    return packed_;
+  }
+
+ private:
+  friend class TVMRetValue;
+  /*! \brief The internal packed function */
+  PackedFunc packed_;
+  /*!
+   * \brief Assign the packed field using a typed lambda function.
+   *
+   * \param flambda The lambda function.
+   * \tparam FLambda The lambda function type.
+   * \note We capture the lambda when possible for maximum efficiency.
+   */
+  template<typename FLambda>
+  inline void AssignTypedLambda(FLambda flambda);
+};
+
 /*! \brief Arguments into TVM functions. */
 class TVMArgs {
  public:
@@ -361,6 +518,10 @@ class TVMArgValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -446,6 +607,10 @@ class TVMRetValue : public TVMPODValue_ {
     TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle);
     return *ptr<PackedFunc>();
   }
+  template<typename FType>
+  operator TypedPackedFunc<FType>() const {
+    return TypedPackedFunc<FType>(operator PackedFunc());
+  }
   operator Module() const {
     TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle);
     return *ptr<Module>();
@@ -512,6 +677,10 @@ class TVMRetValue : public TVMPODValue_ {
     this->SwitchToClass(kFuncHandle, f);
     return *this;
   }
+  template<typename FType>
+  TVMRetValue& operator=(const TypedPackedFunc<FType>& f) {
+    return operator=(f.packed());
+  }
   TVMRetValue& operator=(Module m) {
     this->SwitchToClass(kModuleHandle, m);
     return *this;
@@ -847,6 +1016,10 @@ class TVMArgsSetter {
     values_[i].v_handle = const_cast<PackedFunc*>(&value);
     type_codes_[i] = kFuncHandle;
   }
+  template<typename FType>
+  void operator()(size_t i, const TypedPackedFunc<FType>& value) const {  // NOLINT(*)
+    operator()(i, value.packed());
+  }
   void operator()(size_t i, const Module& value) const {  // NOLINT(*)
     values_[i].v_handle = const_cast<Module*>(&value);
     type_codes_[i] = kModuleHandle;
@@ -894,6 +1067,84 @@ inline TVMRetValue PackedFunc::operator()(Args&& ...args) const {
   return rv;
 }
 
+namespace detail {
+template<typename R, int nleft, int index, typename F>
+struct unpack_call_dispatcher {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    unpack_call_dispatcher<R, nleft - 1, index + 1, F>
+        ::run(f, args_pack, rv,
+              std::forward<Args>(unpacked_args)...,
+              args_pack[index]);
+  }
+};
+
+template<typename R, int index, typename F>
+struct unpack_call_dispatcher<R, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    *rv = R(f(std::forward<Args>(unpacked_args)...));
+  }
+};
+
+template<int index, typename F>
+struct unpack_call_dispatcher<void, 0, index, F> {
+  template<typename ...Args>
+  static void run(const F& f,
+                  const TVMArgs& args_pack,
+                  TVMRetValue* rv,
+                  Args&&... unpacked_args) {
+    f(std::forward<Args>(unpacked_args)...);
+  }
+};
+
+template<typename R, int nargs, typename F>
+inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) {
+  unpack_call_dispatcher<R, nargs, 0, F>::run(f, args, rv);
+}
+
+template<typename R, typename ...Args>
+inline R call_packed(const PackedFunc& pf, Args&& ...args) {
+  return R(pf(std::forward<Args>(args)...));
+}
+
+template<typename R>
+struct typed_packed_call_dispatcher {
+  template<typename ...Args>
+  static inline R run(const PackedFunc& pf, Args&& ...args) {
+    return pf(std::forward<Args>(args)...);
+  }
+};
+
+template<>
+struct typed_packed_call_dispatcher<void> {
+  template<typename ...Args>
+  static inline void run(const PackedFunc& pf, Args&& ...args) {
+    pf(std::forward<Args>(args)...);
+  }
+};
+}  // namespace detail
+
+template<typename R, typename ...Args>
+template<typename FType>
+inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {
+  packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) {
+      detail::unpack_call<R, sizeof...(Args)>(flambda, args, rv);
+    });
+}
+
+template<typename R, typename ...Args>
+inline R TypedPackedFunc<R(Args...)>::operator()(Args... args) const {
+  return detail::typed_packed_call_dispatcher<R>
+      ::run(packed_, std::forward<Args>(args)...);
+}
+
 // extension and node type handling
 namespace detail {
 template<typename T, typename TSrc, bool is_ext>
diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc
index 9b2f1df73731..abe26fabe9ea 100644
--- a/tests/cpp/packed_func_test.cc
+++ b/tests/cpp/packed_func_test.cc
@@ -135,6 +135,29 @@ TEST(PackedFunc, Type) {
   CHECK(get_type2("float32x2").operator Type() == Float(32, 2));
 }
 
+TEST(TypedPackedFunc, HighOrder) {
+  using namespace tvm;
+  using namespace tvm::runtime;
+  using Int1Func = TypedPackedFunc<int(int)>;
+  using Int2Func = TypedPackedFunc<int(int, int)>;
+  using BindFunc = TypedPackedFunc<Int1Func(Int2Func, int value)>;
+  BindFunc ftyped;
+  ftyped = [](Int2Func f1, int value) -> Int1Func {
+    auto binded = [f1, value](int x) {
+      return f1(value, x);
+    };
+    Int1Func x(binded);
+    return x;
+  };
+  auto add = [](int x, int y) { return x + y; };
+  CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3);
+  PackedFunc f = ftyped(Int2Func(add), 1);
+  CHECK_EQ(f(3).operator int(), 4);
+  // call the type erased version.
+  Int1Func f1 = ftyped.packed()(Int2Func(add), 1);
+  CHECK_EQ(f1(3), 4);
+}
+
 // new namespoace
 namespace test {
 // register int vector as extension type

From 729224b17f5ff1b64c5ee086fabc3dac61742725 Mon Sep 17 00:00:00 2001
From: eqy <eqy@cs.washington.edu>
Date: Tue, 21 Aug 2018 18:35:03 -0700
Subject: [PATCH 44/77] check in (#1629)

---
 .../java/ml/dmlc/tvm/tvmrpc/MainActivity.java | 24 ++++---------------
 .../app/src/main/res/layout/content_main.xml  | 11 ---------
 .../app/src/main/res/values/strings.xml       |  3 +--
 3 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
index d80008bbe258..2ea4e4cb7528 100644
--- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
+++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java
@@ -39,11 +39,9 @@
 
 
 public class MainActivity extends AppCompatActivity {
-  private boolean skipRelaunch = true;
   // wait time before automatic restart of RPC Activity
   public static final int HANDLER_RESTART_DELAY = 5000;
 
-
   private void showDialog(String title, String msg) {
     AlertDialog.Builder builder = new AlertDialog.Builder(this);
     builder.setTitle(title);
@@ -91,7 +89,7 @@ private void setupRelaunch() {
     final Runnable rPCStarter = new Runnable() {
         public void run() {
             if (switchPersistent.isChecked()) {
-              System.err.println("relaunching RPC activity in 5s...");
+              System.err.println("relaunching RPC activity...");
               Intent intent = ((MainActivity) context).updateRPCPrefs();
               startActivity(intent);
             }
@@ -116,6 +114,7 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
         if (isChecked) {
           System.err.println("automatic RPC restart enabled...");
           updateRPCPrefs();
+          setupRelaunch();
         } else {
           System.err.println("automatic RPC restart disabled...");
           updateRPCPrefs();
@@ -123,29 +122,14 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
       }
     });
 
-    Button startRPC = findViewById(R.id.button_start_rpc);
-    startRPC.setOnClickListener(new View.OnClickListener() {
-        public void onClick(View v) {
-            Intent intent = ((MainActivity) context).updateRPCPrefs();
-            startActivity(intent);
-        }
-    });
-
     enableInputView(true);
   }
 
   @Override
   protected void onResume() {
     System.err.println("MainActivity onResume...");
-    System.err.println("skipRelaunch: " + skipRelaunch);
-    // if this is the first time onResume is called, do nothing, otherwise we
-    // may double launch
-    if (!skipRelaunch) {
-        enableInputView(true);
-        setupRelaunch();
-    } else {
-        skipRelaunch = false;
-    }
+    enableInputView(true);
+    setupRelaunch();
     super.onResume();
   }
 
diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml
index 82be44d98451..69c1f76030df 100644
--- a/apps/android_rpc/app/src/main/res/layout/content_main.xml
+++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml
@@ -78,15 +78,4 @@
             android:textOn="@string/switch_on" />
     </LinearLayout>
 
-    <LinearLayout
-        android:orientation="horizontal"
-        android:layout_width="fill_parent"
-        android:layout_height="wrap_content">
-        <Button
-            android:id="@+id/button_start_rpc"
-            android:layout_height="wrap_content"
-            android:layout_width="wrap_content"
-            android:text="@string/start_rpc" />
-    </LinearLayout>
-
 </LinearLayout>
diff --git a/apps/android_rpc/app/src/main/res/values/strings.xml b/apps/android_rpc/app/src/main/res/values/strings.xml
index 33caa374b496..f1ca2b90a001 100644
--- a/apps/android_rpc/app/src/main/res/values/strings.xml
+++ b/apps/android_rpc/app/src/main/res/values/strings.xml
@@ -9,11 +9,10 @@
     <string name="label_address">Address</string>
     <string name="label_port">Port</string>
     <string name="label_key">Key</string>
-    <string name="label_persistent">Keep RPC Alive</string>
+    <string name="label_persistent">Enable RPC</string>
 
     <string name="switch_on">Enabled</string>
     <string name="switch_off">Disabled</string>
 
-    <string name="start_rpc">Start RPC</string>
     <string name="stop_rpc">Stop RPC</string>
 </resources>

From b7beb1ebefa18e29bfbf8ff1c4f8f0c8892d93bc Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 21 Aug 2018 18:35:32 -0700
Subject: [PATCH 45/77] [AUTOTVM] Allow fallback for template & Fix bugs in
 tuners (#1615)

* support fallback & fix bugs in tuners & clean topi test

* update task extraction

* update task extraction

* fix arm tutorial

* Update tune_nnvm_arm.py
---
 nnvm/python/nnvm/compiler/build_module.py     |   5 +-
 .../compiler/test_autotvm_task_extraction.py  |  63 +++++++
 python/tvm/autotvm/__init__.py                |   3 +-
 python/tvm/autotvm/measure/measure.py         |   5 +-
 python/tvm/autotvm/measure/measure_methods.py |  44 ++---
 python/tvm/autotvm/task/__init__.py           |   2 +-
 python/tvm/autotvm/task/dispatcher.py         | 117 +++++++++----
 python/tvm/autotvm/task/nnvm_integration.py   | 117 +++++++++----
 python/tvm/autotvm/task/space.py              |  56 ++++++-
 python/tvm/autotvm/task/task.py               |   2 +-
 python/tvm/autotvm/tophub.py                  |   7 +-
 python/tvm/autotvm/tuner/ga_tuner.py          |  10 +-
 python/tvm/autotvm/tuner/model_based_tuner.py |  33 ++--
 .../tvm/autotvm/tuner/sa_model_optimizer.py   |   2 +-
 python/tvm/autotvm/tuner/tuner.py             |  13 +-
 .../tvm/autotvm/tuner/xgboost_cost_model.py   | 119 ++++++++-----
 python/tvm/autotvm/tuner/xgboost_tuner.py     |  17 +-
 python/tvm/exec/tophub.py                     |   9 +-
 python/tvm/target.py                          |   1 +
 .../unittest/test_autotvm_dispatch_context.py |  44 +++--
 tests/python/unittest/test_autotvm_space.py   |  15 +-
 .../unittest/test_autotvm_xgboost_model.py    |   6 +-
 topi/python/topi/arm_cpu/conv2d.py            |  86 +++++++---
 topi/python/topi/arm_cpu/depthwise_conv2d.py  |  14 +-
 topi/python/topi/x86/injective.py             |   2 +-
 topi/tests/python/common.py                   |  12 ++
 .../python/test_topi_bitserial_conv2d.py      |  25 ++-
 .../python/test_topi_bitserial_conv2d_rasp.py |  16 +-
 topi/tests/python/test_topi_bnn.py            |   2 +-
 topi/tests/python/test_topi_broadcast.py      |  25 +--
 topi/tests/python/test_topi_clip.py           |   3 +-
 topi/tests/python/test_topi_conv2d.py         |  47 ------
 topi/tests/python/test_topi_conv2d_hwcn.py    |  14 +-
 topi/tests/python/test_topi_conv2d_nchw.py    | 157 ++++++++++++------
 .../python/test_topi_conv2d_transpose_nchw.py |  22 +--
 topi/tests/python/test_topi_dense.py          |   9 +-
 .../python/test_topi_depthwise_conv2d.py      |  38 ++---
 tutorials/autotvm/tune_nnvm_arm.py            |   5 +-
 38 files changed, 756 insertions(+), 411 deletions(-)
 create mode 100644 nnvm/tests/python/compiler/test_autotvm_task_extraction.py
 create mode 100644 topi/tests/python/common.py
 delete mode 100644 topi/tests/python/test_topi_conv2d.py

diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index 217598c9d79a..6fab4460b427 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -239,8 +239,9 @@ def build(graph, target=None, shape=None, dtype="float32",
         raise ValueError("Target is not set in env or passed as argument.")
     target = tvm.target.create(target)
 
-    # if not inside an autotvm config dispatch context, load pre-tuned parameters from TopHub
-    if autotvm.task.DispatchContext.current is None:
+    # If current dispatch context is fallback context (the default root context),
+    # then load pre-tuned parameters from TopHub
+    if isinstance(autotvm.DispatchContext.current, autotvm.FallbackContext):
         tophub_context = autotvm.tophub.context(target)
     else:
         tophub_context = autotvm.util.EmptyContext()
diff --git a/nnvm/tests/python/compiler/test_autotvm_task_extraction.py b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
new file mode 100644
index 000000000000..fd14934f8ade
--- /dev/null
+++ b/nnvm/tests/python/compiler/test_autotvm_task_extraction.py
@@ -0,0 +1,63 @@
+"""Test task extraction for autotvm"""
+
+import nnvm.testing
+import nnvm.compiler
+from tvm import autotvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network"""
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if name == 'resnet-18':
+        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif name == 'squeezenet v1.1':
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
+    elif name == 'vgg-16':
+        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
+    elif name == 'dcgan':
+        net, params = nnvm.testing.dcgan.get_workload(batch_size=batch_size)
+        input_shape = (batch_size, 100)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def test_task_extraction():
+    target = 'llvm'
+    dtype = 'float32'
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d,))
+    assert len(tasks) == 12
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.dense,))
+    assert len(tasks) == 1
+
+    net, params, input_shape, out_shape = get_network('resnet-18', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 13
+
+    net, params, input_shape, out_shape = get_network('mobilenet', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d, nnvm.sym.dense))
+    assert len(tasks) == 20
+
+    net, params, input_shape, out_shape = get_network('dcgan', batch_size=1)
+    tasks = autotvm.task.extract_from_graph(net, target=target,
+                                            shape={'data': input_shape}, dtype=dtype,
+                                            symbols=(nnvm.sym.conv2d_transpose,))
+    assert len(tasks) == 4
+
+if __name__ == '__main__':
+    test_task_extraction()
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 5b312d93d288..625b50c10853 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -25,5 +25,6 @@
 from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
-    ApplyHistoryBest as apply_history_best
+    register_topi_compute, register_topi_schedule, \
+    DispatchContext, FallbackContext, ApplyHistoryBest as apply_history_best
 from .env import GLOBAL_SCOPE
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 2325a970bc45..2d780eeaf004 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -89,8 +89,9 @@ def measure_option(measure_func,
 
         callable: customized build function for other backends (e.g. VTA).
                   See measure/measure_methods.py::default_build_func for example.
-    check_correctness: bool
-        Whether check correctness after measurement. This will use llvm cpu as reference.
+    check_correctness: bool, optional
+        Whether to check correctness after measurement. This will use llvm cpu target to generate
+        reference output.
     replay_db : Database, optional
         The database that we retrieve saved MeasureResult from.
 
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index d845cc1f88fd..2d740b9493b2 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -83,7 +83,7 @@ def check_remote(target, device_key, tracker_addr=None, priority=2, timeout=10):
         The priority of this request, larger is more prior
     timeout: float, optional
         The timeout of this check (units: seconds).
-        If time is out, a RuntimerError will be raised.
+        If time is out, a RuntimeError will be raised.
     """
     def _check():
         remote = request_remote(device_key, tracker_addr, priority)
@@ -281,11 +281,11 @@ def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, re
         results: List of MeasureResult
             The results for input_pack
         """
-        remote = request_remote(key, (host, port), priority, session_timeout)
+        remote_args = (key, (host, port), priority, session_timeout)
 
         res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
                               ref_input, ref_output,
-                              remote)
+                              remote_args)
         return res
 
     fmeasure.pack_size = pack_size
@@ -294,7 +294,7 @@ def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, re
 
 
 def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                    ref_input=None, ref_output=None, remote=None):
+                    ref_input=None, ref_output=None, remote_args=None):
     """Measure the time cost for a pack of inputs.
 
     (Note: A pack is a list of inputs which will be measured inside a same RPC session)
@@ -318,8 +318,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
         Reference input for checking correctness
     ref_output: Array of np.ndarray, optional
         Reference output for checking correctness
-    remote: RPCSession, optional
-        The remote RPC session
+    remote_args: Tuple, optional
+        The arguments to request_remote. If it is not None, remote rpc devices will be used.
 
     Returns
     -------
@@ -327,7 +327,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
         The list of results of measurement.
     """
     res_pack = []
-    tmp_dir = util.tempdir() if remote else None
+    tmp_dir = util.tempdir() if remote_args else None
+    assert len(input_pack) == 1, "Only supports input_pack == 1 for now"
 
     for inp in input_pack:
         tic = time.time()
@@ -360,31 +361,36 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
                                           tstamp - tic, tstamp))
             continue
 
-        # upload built module
-        if remote:
-            remote.upload(tmp_dir.relpath(filename))
-            func = remote.load_module(filename)
-            ctx = remote.context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
-        else:
-            ctx = context(str(inp.target), 0)
-            time_f = func.time_evaluator(
-                func.entry_name, ctx, number=number, repeat=repeat)
-
         # measure time
         errno = MeasureErrorNo.NO_ERROR
         try:
+            # upload built module
+            if remote_args:
+                remote = request_remote(*remote_args)
+                remote.upload(tmp_dir.relpath(filename))
+                func = remote.load_module(filename)
+                ctx = remote.context(str(inp.target), 0)
+                time_f = func.time_evaluator(
+                    func.entry_name, ctx, number=number, repeat=repeat)
+            else:
+                ctx = context(str(inp.target), 0)
+                time_f = func.time_evaluator(
+                    func.entry_name, ctx, number=number, repeat=repeat)
+
+            # set input
             if ref_input:
                 args = [nd.array(x, ctx=ctx) for x in ref_input]
             else:
                 args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
                         for x in arg_bufs]
+
             costs = time_f(*args).results
             if len(costs) > 2:  # remove largest and smallest value to reduce variance
                 costs = list(costs)
                 costs.sort()
                 costs = tuple(costs[1:-1])
+
+            # check correctness of output
             if ref_output:
                 for expected, real in zip(ref_output, args):
                     if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
diff --git a/python/tvm/autotvm/task/__init__.py b/python/tvm/autotvm/task/__init__.py
index 0d43f92656cd..7592fc5af7df 100644
--- a/python/tvm/autotvm/task/__init__.py
+++ b/python/tvm/autotvm/task/__init__.py
@@ -9,7 +9,7 @@
 from .task import Task, create, register, template, get_config, args_to_workload
 from .space import ConfigSpace, ConfigEntity
 from .code_hash import attach_code_hash, attach_code_hash_to_arg
-from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, dispatcher
+from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest, FallbackContext, dispatcher
 
 from .topi_integration import register_topi_compute, register_topi_schedule
 from .nnvm_integration import extract_from_graph
diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py
index 93f6d584abfa..ec1dcc44f141 100644
--- a/python/tvm/autotvm/task/dispatcher.py
+++ b/python/tvm/autotvm/task/dispatcher.py
@@ -21,7 +21,7 @@
 
 from tvm import target as _target
 
-from .space import ConfigSpace
+from .space import FallbackConfigEntity
 
 logger = logging.getLogger('autotvm')
 
@@ -34,9 +34,36 @@ class DispatchContext(object):
     """
     current = None
 
+    def __init__(self):
+        self._old_ctx = DispatchContext.current
+
     def query(self, target, workload):
         """
-        Query the context to get the specific implementation.
+        Query the context to get the specific config for a template.
+        If the result cannot be found inside this context, this function will query it
+        from the upper contexts.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+
+        Returns
+        -------
+        cfg : ConfigSpace
+            The specific configuration.
+        """
+        ret = self._query_inside(target, workload)
+        if ret is None:
+            ret = self._old_ctx.query(target, workload)
+        return ret
+
+    def _query_inside(self, target, workload):
+        """
+        Query the context to get the specific config for a template.
+        This function only queries the config inside this context.
 
         Parameters
         ----------
@@ -117,17 +144,17 @@ def _do_reg(myf):
     def dispatch_func(func, *args, **kwargs):
         """The wrapped dispatch function"""
         tgt = _target.current_target()
-        context = DispatchContext.current
-        if context is None:
-            raise RuntimeError("DispatchContext is not initialized")
         workload = func(*args, **kwargs)
-        cfg = context.query(tgt, workload)
-        if cfg.template_key:
-            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
-        else:
-            assert dispatch_dict, "No func registered for this dispatcher"
+        cfg = DispatchContext.current.query(tgt, workload)
+        if cfg.is_fallback and not cfg.template_key:
+            # first try 'direct' template
+            if 'direct' in dispatch_dict:
+                return dispatch_dict['direct'](cfg, *args, **kwargs)
+            # otherwise pick an arbitrary template (the first one iterated)
             for v in dispatch_dict.values():
                 return v(cfg, *args, **kwargs)
+        else:
+            return dispatch_dict[cfg.template_key](cfg, *args, **kwargs)
 
     fdecorate = decorate(fworkload, dispatch_func)
     fdecorate.register = register
@@ -135,7 +162,7 @@ def dispatch_func(func, *args, **kwargs):
 
 
 class ApplyConfig(DispatchContext):
-    """Apply a specific config entity during query.
+    """Apply a deterministic config entity for all queries.
 
     Parameters
     ----------
@@ -147,7 +174,7 @@ def __init__(self, config):
         self._config = config
         self.workload = None
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         """Override query"""
         self.workload = workload
         return self._config
@@ -164,20 +191,12 @@ class ApplyHistoryBest(DispatchContext):
         If is str, then it should be the filename of a records log file.
                    Each row of this file is an encoded record pair.
         Otherwise, it is an iterator.
-    default: ConfigEntity, optional
-        The default config to return when no history records
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
-    def __init__(self, records, default=None, allow_fallback=False):
+    def __init__(self, records):
         super(ApplyHistoryBest, self).__init__()
 
         self.best_by_targetkey = {}
         self.best_by_model = {}
-        self._default = default
-        self._allow_fallback = allow_fallback
-        self.fallback = {}
 
         if records:
             self.load(records)
@@ -234,7 +253,7 @@ def load(self, records):
 
         logger.debug("Finish loading %d records", counter)
 
-    def query(self, target, workload):
+    def _query_inside(self, target, workload):
         if target is None:
             raise RuntimeError("Need a target context to find the history best. "
                                "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
@@ -254,20 +273,50 @@ def query(self, target, workload):
             if key in self.best_by_targetkey:
                 return self.best_by_targetkey[key][0].config
 
-        if self._default:
-            return self._default
+        return None
+
+
+class FallbackContext(DispatchContext):
+    """
+    A fallback dispatch context.
+
+    Any tunable template can be called under this context.
+    This is the root context.
+    """
+
+    def __init__(self):
+        super(FallbackContext, self).__init__()
+        self.memory = {}
+        self.silent = False
+
+    def _query_inside(self, target, workload):
+        key = (str(target), workload)
+        if key in self.memory:
+            return self.memory[key]
 
-        if self._allow_fallback:
-            key = (target, workload)
-            if key in self.fallback:
-                return self.fallback[key]
+        if not self.silent:
             logger.warning(
                 "Cannot find config for target=%s, workload=%s. A fallback configuration "
                 "is used, which may bring great performance regression.", target, workload)
-            cfg = ConfigSpace()
-            self.fallback[key] = cfg
-            return cfg
+        cfg = FallbackConfigEntity()
+
+        # cache this config
+        self.memory[key] = cfg
+        return cfg
+
+    def clear_cache(self, target, workload):
+        """Clear fallback cache. Pass the same arguments as _query_inside to this function
+        to clean the cache.
+
+        Parameters
+        ----------
+        target: Target
+            The current target
+        workload : Workload
+            The current workload.
+        """
+        key = (str(target), workload)
+        if key in self.memory:
+            del self.memory[key]
 
-        raise RuntimeError(
-            "Cannot find config for target=%s, workload=%s. You need to do tuning "
-            "for this workload to get the config." % (target, workload))
+DispatchContext.current = FallbackContext()
diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py
index 1b50869fc378..9138cc288372 100644
--- a/python/tvm/autotvm/task/nnvm_integration.py
+++ b/python/tvm/autotvm/task/nnvm_integration.py
@@ -7,11 +7,10 @@
 import logging
 
 
-from ... import tensor, placeholder, target as _target
+from ... import tensor, placeholder, create_schedule, target as _target
 
 from ..util import get_const_tuple
 from .task import create, register
-from .dispatcher import ApplyHistoryBest
 
 logger = logging.getLogger('autotvm')
 
@@ -56,40 +55,68 @@ def __init__(self):
         import topi
         import nnvm
 
+        # NOTE: To add more symbols, you only need to change the following lists
+        # nnvm symbol -> topi compute
         self.symbol2topi = {
             nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw],
-            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose],
+            nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose_nchw],
+            nnvm.sym.dense: [topi.nn.dense],
         }
 
+        # topi compute -> autotvm task name
         self.topi_to_task = {
             topi.nn.conv2d: "topi_nn_conv2d",
             topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw",
             topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw",
+            topi.nn.dense: "topi_nn_dense",
         }
 
-        self._register_dummy()
+        self.topi_to_schedule = {
+            topi.nn.conv2d: [topi.generic.schedule_conv2d_nchw,
+                             topi.generic.schedule_conv2d_nhwc],
+            topi.nn.depthwise_conv2d_nchw: [topi.generic.schedule_depthwise_conv2d_nchw,
+                                            topi.generic.schedule_depthwise_conv2d_nhwc],
+            topi.nn.conv2d_transpose_nchw: [topi.generic.schedule_conv2d_transpose_nchw],
+            topi.nn.dense: [topi.generic.schedule_dense],
+        }
+
+        self._register_tracing()
         self._register_topi_task()
         self.task_collection = []
+        self.wanted_topi_funcs = list(self.topi_to_task.keys())
+
+    def _register_tracing(self):
+        """Register tracing function to track the topi function call"""
+        # register topi compute for "tracing" target
+        for topi_compute in self.topi_to_task:
+            def _local_scope(compute_func):
+                """start a scope to hold the local function in for loop"""
 
-    def _register_dummy(self):
-        """Register dummy function to track the topi function call"""
-        for func in self.topi_to_task:
-            def _local_scope(local_func):
-                """build a scope to holds the function"""
-                @local_func.register("dummy", )
-                def _dummy_func(*args, **kwargs):
+                @compute_func.register("tracing", )
+                def _tracing_topi_compute(*args, **kwargs):
                     assert not kwargs, "Do not support extracting tuning tasks when" \
                                        "kwargs is used in TOPI function call." \
                                        "Please modify it to use only positional args."
 
-                    if (self.topi_to_task[local_func], serialize_args(args)) \
-                            not in self.task_collection:
-                        self.task_collection.append((self.topi_to_task[local_func],
-                                                     serialize_args(args)))
-                    with _target.create("opencl"):
-                        return local_func(*args)
+                    if compute_func in self.wanted_topi_funcs:  # record this call
+                        key = (self.topi_to_task[compute_func], serialize_args(args))
+                        if key not in self.task_collection:
+                            self.task_collection.append(key)
+
+                    return compute_func.fdefault(*args)
+            _local_scope(topi_compute)
+
+        # register topi schedule for "tracing" target
+        for topi_compute in self.topi_to_task:
+            for topi_schedule in self.topi_to_schedule[topi_compute]:
+                def _local_scope_(schedule_func):
+                    """start a scope to hold the local function in for loop"""
 
-            _local_scope(func)
+                    @schedule_func.register("tracing", )
+                    def _tracing_topi_compute(outs):
+                        outs = [outs] if isinstance(outs, tensor.Tensor) else outs
+                        return create_schedule([x.op for x in outs])
+                _local_scope_(topi_schedule)
 
     def _register_topi_task(self):
         """register tuning wrapper for topi function"""
@@ -125,17 +152,47 @@ def _topi_nn_conv2d_transpose_nchw(*args, **kwargs):
             s = topi.generic.schedule_conv2d_transpose_nchw([C])
             return s, [A, W, C]
 
-    def reset(self):
-        """Reset task collections"""
+        @register("topi_nn_dense")
+        def _topi_nn_dense(*args, **kwargs):
+            assert not kwargs, "Do not support kwargs in template function call"
+            args = deserialize_args(args)
+            data, weight, bias = args
+            C = topi.nn.dense(*args, **kwargs)
+            s = topi.generic.schedule_dense([C])
+            if bias is not None:
+                return s, [data, weight, bias, C]
+            return s, [data, weight, C]
+
+    def reset(self, wanted_topi_funcs):
+        """Reset task collections
+
+        Parameters
+        ----------
+        wanted_topi_funcs: List of function
+            The topi function to be extracted
+        """
         self.task_collection = []
+        self.wanted_topi_funcs = wanted_topi_funcs
 
     def get_tasks(self):
-        """Get collected tasks"""
+        """Get collected tasks
+
+        Returns
+        -------
+        tasks: List of tuple(name, args)
+            A list of tasks extracted from the nnvm graph
+        """
         return self.task_collection
 
     @staticmethod
     def get():
-        """Get the single instance of TaskExtractEnv"""
+        """Get the single instance of TaskExtractEnv
+
+        Returns
+        -------
+        env: TaskExtractEnv
+            The single instance of TaskExtractEnv
+        """
         if not TaskExtractEnv.current:
             TaskExtractEnv.current = TaskExtractEnv()
         return TaskExtractEnv.current
@@ -144,8 +201,8 @@ def get():
 def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     """ Extract tuning tasks from a nnvm graph.
 
-    This function collects tunning tasks by building the graph
-    with a "dummy" target and tracing all the calls to topi.
+    This function collects tuning tasks by building the graph
+    with a "tracing" target and tracing all the calls to topi.
 
     Parameters
     ----------
@@ -158,7 +215,7 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
     target: tvm.target.Target
         The compilation target
     symbols : Array of nnvm.symbol
-        Array of nnvm symbols
+        Array of nnvm symbols to be tuned
     target_host: tvm.target.Target
         The host compilation target
 
@@ -179,16 +236,16 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None):
             warnings.warn("Symbol %s is not tunable, ignored" % sym_name)
 
     # run compiler to collect all TOPI calls during compilation
-    env.reset()
+    env.reset(topi_funcs)
 
     # disable logger temporarily
     old_state = logger.disabled
     logger.disabled = True
 
-    # use a dummy target to do a fake compile for collecting topi calls
-    dummy_target = _target.create("opencl -device=dummy")
-    with ApplyHistoryBest([], allow_fallback=True):
-        nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype)
+    # use a "tracing" target to do a fake compile for collecting topi calls
+    tracing_target = _target.create("llvm -device=tracing")
+    nnvm.compiler.engine.clear_cache()
+    nnvm.compiler.build(graph, target=tracing_target, shape=shape, dtype=dtype)
 
     logger.disabled = old_state
 
diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py
index ea823c6f2760..5a34353acfe9 100644
--- a/python/tvm/autotvm/task/space.py
+++ b/python/tvm/autotvm/task/space.py
@@ -567,15 +567,16 @@ class ConfigSpace(object):
     """
     def __init__(self):
         # private dict to provide sugar
-        self.space_map = OrderedDict()  # name -> space
+        self.space_map = OrderedDict()    # name -> space
         self._collect = True
         self._length = None
-        self._entity_map = OrderedDict()
+        self._entity_map = OrderedDict()  # name -> entity
         self._constraints = []
         self.errors = []
         self.template_key = None
         self.code_hash = None
         self.flop = 0
+        self.is_fallback = False
 
     @staticmethod
     def axis(var):
@@ -607,6 +608,15 @@ def define_split(self, name, axis, policy='all', **kwargs):
             If is 'candidate', try listed candidate.
         kwargs: dict
             extra arguments for policy
+            see examples below for how to use filter
+
+        Examples
+        --------
+        >>> # use custom candidates
+        >>> cfg.define_split('tile_x', x, policy='candidate', candidate=[[1, 4, 4], [4, 1, 4]])
+
+        >>> # use a filter that only accepts the split scheme whose innermost tile is at most 4
+        >>> cfg.define_split('tile_y', y, policy='all', filter=lambda x: x.size[-1] <= 4)
         """
         axes = [axis]
         return self._add_new_transform(SplitSpace, name, axes, policy, **kwargs)
@@ -889,3 +899,45 @@ def from_json_dict(json_dict):
     def __repr__(self):
         return "%s,%s,%s,%d" % (str(self._entity_map)[12:-1], self.template_key,
                                 self.code_hash, self.index)
+
+class FallbackConfigEntity(ConfigSpace):
+    """The config entity created to support fallback"""
+
+    def __init__(self):
+        super(FallbackConfigEntity, self).__init__()
+        self.is_fallback = True
+
+    def fallback_split(self, name, constraints):
+        """Fallback a split knob
+
+        Parameters
+        ----------
+        name: str
+            name of the knob
+        constraints: List of int
+            The maximum tile size for every dimension. Value `-1` means no constraint.
+
+        Examples
+        --------
+        If you use cfg.define_split('tile_0', 128, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [4, 8, 4]
+
+        If you use cfg.define_split('tile_0', 49, num_outputs=3),
+        Then cfg.fallback_split('tile_0', [-1, 8, 4]) will give you cfg['tile_0'].size = [7, 7, 1]
+        """
+        space = self.space_map[name]
+        assert len(constraints) == space.num_outputs
+        indices = np.arange(space.num_outputs)
+
+        # '-1' means no constraint
+        constraints = [x if x != -1 else 1e10 for x in constraints]
+
+        for entity in reversed(space.entities):
+            if all([entity.size[i] <= constraints[i] for i in indices]):
+                self._entity_map[name] = entity
+                return
+
+        raise RuntimeError("Cannot find feasible fallback split entity for node: " + name)
+
+    def __repr__(self):
+        return "%s,%s,%s" % (str(self._entity_map)[12:-1], self.template_key, self.code_hash)
diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py
index f8923fca56e3..ab52788c8d91 100644
--- a/python/tvm/autotvm/task/task.py
+++ b/python/tvm/autotvm/task/task.py
@@ -206,7 +206,7 @@ def args_to_workload(x):
     elif isinstance(x, (expr.StringImm, expr.IntImm, expr.FloatImm)):
         return x.value
     elif x is None:
-        return None
+        return 0
     else:
         raise RuntimeError('Do not support type "%s" in argument. Consider to use'
                            'primitive types only' % type(x))
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index e11bb7a4fc92..3d7b249df905 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -28,7 +28,7 @@ def _alias(name):
     return table.get(name, name)
 
 
-def context(target, extra_files=None, allow_fallback=False):
+def context(target, extra_files=None):
     """Return the dispatch context with pre-tuned parameters.
     The corresponding downloaded *.log files under tophub root path will be loaded.
     Users can also add their own files in argument `extra_files`.
@@ -39,12 +39,9 @@ def context(target, extra_files=None, allow_fallback=False):
         The compilation target
     extra_files: list of str, optional
         Extra log files to load
-    allow_fallback: bool
-        Whether allow to use a fallback configuration if cannot find
-        tuned result.
     """
     rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-    best_context = ApplyHistoryBest([], allow_fallback=allow_fallback)
+    best_context = ApplyHistoryBest([])
 
     if isinstance(target, str):
         target = _target.create(target)
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index b92737ed5317..b9d900e49577 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -86,13 +86,9 @@ def update(self, inputs, results):
 
             # cross over
             indices = np.arange(len(genes))
-            max_score = np.max(scores)
-            if max_score < 1e-8:
-                probs = np.empty_like(scores)
-                probs[:] = 1.0 / len(scores)
-            else:
-                scores /= max_score
-                probs = scores / np.sum(scores)
+            scores += 1e-8
+            scores /= np.max(scores)
+            probs = scores / np.sum(scores)
             tmp_genes = []
             for _ in range(self.pop_size):
                 p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs)
diff --git a/python/tvm/autotvm/tuner/model_based_tuner.py b/python/tvm/autotvm/tuner/model_based_tuner.py
index d1c1b16d3181..62fc57f2e869 100644
--- a/python/tvm/autotvm/tuner/model_based_tuner.py
+++ b/python/tvm/autotvm/tuner/model_based_tuner.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from .tuner import Tuner
-
+from ..env import GLOBAL_SCOPE
 
 class FeatureCache(object):
     """Feature cache manager for cache sharing between different cost models"""
@@ -119,11 +119,9 @@ def load_basemodel(self, base_model):
         """
         raise NotImplementedError()
 
-    def clone_new(self):
-        """Clone a new model with the same parameters.
-        This function will only copy hyperparameters of the tuner, not all the trained model
-
-        This is used for deriving a base model conveniently
+    def spawn_base_model(self):
+        """Clone a base model with the same parameters.
+        The base model is used to fit history data in transfer learning.
 
         Returns
         -------
@@ -221,7 +219,9 @@ def next_batch(self, batch_size):
                     break
                 self.trial_pt += 1
 
-            if self.trial_pt >= len(self.trials):  # trial list is empty, choose randomly
+            if self.trial_pt >= len(self.trials) - int(0.05 * self.plan_size):
+                # if the trial list is empty or
+                # the tuner is doing the last 5% trials (e-greedy), choose randomly
                 index = np.random.randint(len(self.space))
                 while index in self.visited:
                     index = np.random.randint(len(self.space))
@@ -264,18 +264,16 @@ def update(self, inputs, results):
             self.train_ct += 1
 
     def load_history(self, data_set):
-        # filter data, only pick the data with a same task
-        data = []
-        for inp, res in data_set:
-            if inp.task.name == self.task.name and \
-                            inp.config.template_key == self.task.config_space.template_key:
-                data.append((inp, res))
-        if not data:
-            return
+        # set in_tuning as True to make the feature extraction consistent
+        GLOBAL_SCOPE.in_tuning = True
 
         # fit base model
-        base_model = self.cost_model.clone_new()
-        base_model.fit_log(data, self.plan_size)
+        base_model = self.cost_model.spawn_base_model()
+        success = base_model.fit_log(data_set, self.plan_size)
+
+        if not success:
+            GLOBAL_SCOPE.in_tuning = False
+            return
 
         # use base model to select initial points
         if not self.trials:
@@ -285,6 +283,7 @@ def load_history(self, data_set):
             self.trial_pt = 0
 
         self.cost_model.load_basemodel(base_model)
+        GLOBAL_SCOPE.in_tuning = False
 
     def has_next(self):
         return len(self.visited) < len(self.space)
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 6e1c373c113f..1947c6dde4e0 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -87,7 +87,7 @@ def find_maximums(self, model, num, exclusive):
 
             new_scores = model.predict(new_points)
 
-            ac_prob = np.exp((new_scores - scores) / t)
+            ac_prob = np.exp((new_scores - scores) / (t + 1e-2))
             ac_index = np.random.random(len(ac_prob)) < ac_prob
 
             points[ac_index] = new_points[ac_index]
diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py
index 91004cba4603..cffbb9798392 100644
--- a/python/tvm/autotvm/tuner/tuner.py
+++ b/python/tvm/autotvm/tuner/tuner.py
@@ -31,6 +31,10 @@ def __init__(self, task, **kwargs):
         self.best_measure_pair = None
         self.best_iter = 0
 
+        # time to leave: number of trials left before tuning stops (updated in tune)
+        self.ttl = None
+        self.n_trial = None
+
     def has_next(self):
         """Whether has next untried config in the space
 
@@ -76,7 +80,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         measure_option: dict
             The options for how to measure generated code.
             You should use the return value ot autotvm.measure_option for this argument.
-        early_stopping: int
+        early_stopping: int, optional
             Early stop the tuning when not finding better configs in this number of trials
         callbacks: List of callable
             A list of callback functions. The signature of callback function is
@@ -87,6 +91,8 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
         measure_batch = create_measure_batch(self.task, measure_option)
         n_parallel = getattr(measure_batch, 'n_parallel', 1)
         early_stopping = early_stopping or 1e9
+        self.n_trial = n_trial
+
         old_level = logger.level
 
         GLOBAL_SCOPE.in_tuning = True
@@ -127,11 +133,12 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()):
             for callback in callbacks:
                 callback(self, inputs, results)
 
-            if i > self.best_iter + early_stopping:
+            self.ttl = min(early_stopping + self.best_iter, n_trial) - i
+            if i >= self.best_iter + early_stopping:
                 logger.debug("Early stopped. Best iter: %d.", self.best_iter)
                 break
 
-            if error_ct > 50:
+            if error_ct > 150:
                 logger.warning("Too many errors happen in the tuning. Now is in debug mode")
                 logger.setLevel(logging.DEBUG)
             else:
diff --git a/python/tvm/autotvm/tuner/xgboost_cost_model.py b/python/tvm/autotvm/tuner/xgboost_cost_model.py
index 178e92476752..bda3ee26e062 100644
--- a/python/tvm/autotvm/tuner/xgboost_cost_model.py
+++ b/python/tvm/autotvm/tuner/xgboost_cost_model.py
@@ -31,8 +31,12 @@ class XGBoostCostModel(CostModel):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can switch to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -46,8 +50,11 @@ class XGBoostCostModel(CostModel):
         The number of threads.
     log_interval: int, optional
         If is not none, the cost model will print training log every `log_interval` iterations.
+    upper_model: XGBoostCostModel, optional
+        The upper model used in transfer learning
     """
-    def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval=25):
+    def __init__(self, task, feature_type, loss_type, num_threads=4, log_interval=25,
+                 upper_model=None):
         super(XGBoostCostModel, self).__init__()
 
         if xgb is None:
@@ -109,35 +116,51 @@ def __init__(self, task, feature_type, loss_type, num_threads=None, log_interval
         else:
             raise RuntimeError("Invalid feature type " + feature_type)
 
-        self.feature_cache = FeatureCache()
+        if upper_model:  # share the same feature cache with the upper model
+            self.feature_cache = upper_model.feature_cache
+        else:
+            self.feature_cache = FeatureCache()
+        self.upper_model = upper_model
         self.feature_extra_ct = 0
         self.pool = None
         self.base_model = None
-        self.upper_model = None
 
         self._sample_size = 0
+        self._reset_pool(self.space, self.target, self.task)
 
-        self._reset_pool()
+    def _reset_pool(self, space, target, task):
+        """reset processing pool for feature extraction"""
+
+        if self.upper_model:  # base model will reuse upper model's pool
+            self.upper_model._reset_pool(space, target, task)
+            return
+
+        self._close_pool()
 
-    def _reset_pool(self):
-        # reset processing pool for feature extraction
-        if self.pool:
-            self.pool.terminate()
-            self.pool.join()
-            del self.pool
         # use global variable to pass common arguments
         global _extract_space, _extract_target, _extract_task
-        _extract_space = self.space
-        _extract_target = self.target
-        _extract_task = self.task
+        _extract_space = space
+        _extract_target = target
+        _extract_task = task
         self.pool = multiprocessing.Pool(self.num_threads)
 
+    def _close_pool(self):
+        if self.pool:
+            self.pool.terminate()
+            self.pool.join()
+            self.pool = None
+
+    def _get_pool(self):
+        if self.upper_model:
+            return self.upper_model._get_pool()
+        return self.pool
+
     def _base_model_discount(self):
-        return 1.0 / (2 ** (self._sample_size / 50.0))
+        return 1.0 / (2 ** (self._sample_size / 64.0))
 
     def fit(self, xs, ys, plan_size):
         tic = time.time()
-        self._reset_pool()
+        self._reset_pool(self.space, self.target, self.task)
 
         x_train = self._get_feature(xs)
         y_train = np.array(ys)
@@ -150,8 +173,12 @@ def fit(self, xs, ys, plan_size):
         self._sample_size = len(x_train)
 
         if self.base_model:
-            dtrain.set_base_margin(self._base_model_discount() *
-                                   self.base_model.predict(xs, output_margin=True))
+            discount = self._base_model_discount()
+            if discount < 0.05:  # discard base model
+                self.base_model.upper_model = None
+                self.base_model = None
+            else:
+                dtrain.set_base_margin(discount * self.base_model.predict(xs, output_margin=True))
 
         self.bst = xgb.train(self.xgb_params, dtrain,
                              num_boost_round=8000,
@@ -172,11 +199,19 @@ def fit(self, xs, ys, plan_size):
 
     def fit_log(self, records, plan_size):
         tic = time.time()
-        self._reset_pool()
 
-        args = list(records)
-        logger.debug("XGB load %d entries from history log file", len(args))
+        # filter data, only pick the data with the same task
+        data = []
+        for inp, res in records:
+            if inp.task.name == self.task.name and \
+                            inp.config.template_key == self.task.config_space.template_key:
+                data.append((inp, res))
+
+        logger.debug("XGB load %d entries from history log file", len(data))
 
+        # extract feature
+        self._reset_pool(self.space, self.target, self.task)
+        pool = self._get_pool()
         if self.fea_type == 'itervar':
             feature_extract_func = _extract_itervar_feature_log
         elif self.fea_type == 'knob':
@@ -185,10 +220,21 @@ def fit_log(self, records, plan_size):
             feature_extract_func = _extract_curve_feature_log
         else:
             raise RuntimeError("Invalid feature type: " + self.fea_type)
-        res = self.pool.map(feature_extract_func, args)
-        xs, ys = zip(*res)
-        xs, ys = np.array(xs), np.array(ys)
+        res = pool.map(feature_extract_func, data)
+
+        # filter out feature with different shapes
+        fea_len = len(self._get_feature([0])[0])
+
+        xs, ys = [], []
+        for x, y in res:
+            if len(x) == fea_len:
+                xs.append(x)
+                ys.append(y)
 
+        if len(xs) < 500:  # not enough samples
+            return False
+
+        xs, ys = np.array(xs), np.array(ys)
         x_train = xs
         y_train = ys
         y_max = np.max(y_train)
@@ -212,6 +258,8 @@ def fit_log(self, records, plan_size):
 
         logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
 
+        return True
+
     def predict(self, xs, output_margin=False):
         feas = self._get_feature(xs)
         dtest = xgb.DMatrix(feas)
@@ -224,20 +272,12 @@ def predict(self, xs, output_margin=False):
 
     def load_basemodel(self, base_model):
         self.base_model = base_model
-        if isinstance(base_model, XGBoostCostModel):
-            # share feature cache
-            base_model.feature_cache = self.feature_cache
-
-            # close thread pool
-            if base_model.pool:
-                base_model.pool.terminate()
-                base_model.pool.join()
-                del base_model.pool
-            self.base_model.upper_model = self
-
-    def clone_new(self):
+        self.base_model._close_pool()
+        self.base_model.upper_model = self
+
+    def spawn_base_model(self):
         return XGBoostCostModel(self.task, self.fea_type, self.loss_type,
-                                self.num_threads, self.log_interval)
+                                self.num_threads, self.log_interval, self)
 
     def _get_feature(self, indexes):
         """get features for indexes, run extraction if we do not have cache for them"""
@@ -251,7 +291,7 @@ def _get_feature(self, indexes):
         need_extract = [x for x in indexes if x not in fea_cache]
 
         if need_extract:
-            pool = self.pool if self.upper_model is None else self.upper_model.pool
+            pool = self._get_pool()
             feas = pool.map(self.feature_extract_func, need_extract)
             for i, fea in zip(need_extract, feas):
                 fea_cache[i] = fea
@@ -261,6 +301,9 @@ def _get_feature(self, indexes):
             ret[i, :] = fea_cache[ii]
         return ret
 
+    def __del__(self):
+        self._close_pool()
+
 
 _extract_space = None
 _extract_target = None
diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py
index 237ac4e19ab1..886c82a4d749 100644
--- a/python/tvm/autotvm/tuner/xgboost_tuner.py
+++ b/python/tvm/autotvm/tuner/xgboost_tuner.py
@@ -20,8 +20,12 @@ class XGBTuner(ModelBasedTuner):
         If is 'curve', use sampled curve feature (relation feature).
 
         Note on choosing feature type:
-        For single task tuning, 'itervar' and 'knob' is good.
+        For single task tuning, 'itervar' and 'knob' are good.
                                 'itervar' is more accurate but 'knob' is much faster.
+                                There are some constraints on 'itervar', if you meet
+                                problems with feature extraction when using 'itervar',
+                                you can switch to 'knob'.
+
         For cross-shape tuning (e.g. many convolutions with different shapes),
                                'itervar' and 'curve' has better transferability,
                                'knob' is faster.
@@ -32,8 +36,7 @@ class XGBTuner(ModelBasedTuner):
         If is 'rank', use pairwise rank loss to train cost model.
                      The cost model predicts relative rank score.
     num_threads: int, optional
-        The number of threads.
-    optimizer: str or ModelOptimizer, optional
+        The number of threads.  optimizer: str or ModelOptimizer, optional
         If is 'sa', use a default simulated annealing optimizer.
         Otherwise it should be a ModelOptimizer object.
     diversity_filter_ratio: int or float, optional
@@ -45,7 +48,7 @@ class XGBTuner(ModelBasedTuner):
         If is 0, output nothing.
         Otherwise, output debug information every `verbose` iterations.
     """
-    def __init__(self, task, plan_size=32,
+    def __init__(self, task, plan_size=64,
                  feature_type='itervar', loss_type='rank', num_threads=None,
                  optimizer='sa', diversity_filter_ratio=None, log_interval=50):
         cost_model = XGBoostCostModel(task,
@@ -62,3 +65,9 @@ def __init__(self, task, plan_size=32,
 
         super(XGBTuner, self).__init__(task, cost_model, optimizer,
                                        plan_size, diversity_filter_ratio)
+
+    def tune(self, *args, **kwargs):  # pylint: disable=arguments-differ
+        super(XGBTuner, self).tune(*args, **kwargs)
+
+        # manually close pool to avoid multiprocessing issues
+        self.cost_model._close_pool()
diff --git a/python/tvm/exec/tophub.py b/python/tvm/exec/tophub.py
index 9dd951a52701..9bfd6866506d 100644
--- a/python/tvm/exec/tophub.py
+++ b/python/tvm/exec/tophub.py
@@ -8,8 +8,8 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument("--download", type=str, nargs='+',
-                        help="Target to download. Use 'all' to download for all targets")
+    parser.add_argument("-d", "--download", type=str, nargs='+',
+                        help="The targets to download. Use 'all' to download for all targets")
     parser.add_argument("-l", "--list", action='store_true', help="List available packages")
     args = parser.parse_args()
 
@@ -21,8 +21,7 @@
         print("-" * 41)
         for target, info in info:
             print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
-
-    if args.download:
+    elif args.download:
         info = list_packages()
         all_targets = [x[0] for x in info]
         if 'all' in args.download:
@@ -34,3 +33,5 @@
             if t not in all_targets:
                 print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
             download_package(t)
+    else:
+        parser.print_help()
diff --git a/python/tvm/target.py b/python/tvm/target.py
index e2d780f75264..9d5200661c6c 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -263,6 +263,7 @@ def dispatch_func(func, *args, **kwargs):
                     "Keyword arguments cannot be used when invoking generic_func %s" % func_name)
             return generic_func_node(*args)
         fresult = decorate(fdefault, dispatch_func)
+        fresult.fdefault = fdefault
         fresult.register = register
         return fresult
     return fdecorate
diff --git a/tests/python/unittest/test_autotvm_dispatch_context.py b/tests/python/unittest/test_autotvm_dispatch_context.py
index 6c718e5bd041..1f2a7e276a32 100644
--- a/tests/python/unittest/test_autotvm_dispatch_context.py
+++ b/tests/python/unittest/test_autotvm_dispatch_context.py
@@ -3,34 +3,48 @@
 to the parameters of workload"""
 
 from collections import namedtuple
+from tvm import autotvm
 from tvm.autotvm.task import dispatcher, DispatchContext
 
-SimpleWorkload = namedtuple("SimpleWorkload", ["key"])
-SimpleConfig = namedtuple("SimpleConfig", ["template_key"])
+SimpleConfig = namedtuple('SimpleConfig', ('template_key', 'is_fallback'))
 
 def test_dispatch():
     @dispatcher
     def my_dispatcher(a, b):
-        return SimpleWorkload(key=a + b)
-
-    @my_dispatcher.register("spatial_pack")
-    def _sp_pack_add(cfg, a, b):
-        return b + 100
+        return (a, b)
 
     @my_dispatcher.register("im2col")
-    def _im2col_add(cfg, a, b):
-        return a + 1
+    def _im2col(cfg, a, b):
+        return a
+
+    @my_dispatcher.register("spatial_pack")
+    def _spatial_pack(cfg, a, b):
+        return b
 
     class SimpleDispatcher(DispatchContext):
         def query(self, target, workload):
-            tkey = "spatial_pack" if workload.key > 2 else "im2col"
-            return SimpleConfig(tkey)
+            a, b = workload
+            tkey = "spatial_pack" if a + b > 2 else "im2col"
+            cfg = SimpleConfig(tkey, False)
+            return cfg
 
     with SimpleDispatcher():
-        # im2col
-        assert my_dispatcher(1, 0) == 2
-        # spack
-        assert my_dispatcher(1, 100) == 200
+        # this will call im2col
+        assert my_dispatcher(1, 0) == 1
+
+        # this will call spatial pack
+        assert my_dispatcher(1, 100) == 100
+
+def test_fallback():
+
+    @autotvm.template
+    def simple_template(a, b):
+        cfg = autotvm.get_config()
+        assert cfg.is_fallback
+
+    simple_template(2, 3)
+
 
 if __name__ == "__main__":
     test_dispatch()
+    test_fallback()
diff --git a/tests/python/unittest/test_autotvm_space.py b/tests/python/unittest/test_autotvm_space.py
index 0320ef1c6f3c..e51e34e95a3b 100644
--- a/tests/python/unittest/test_autotvm_space.py
+++ b/tests/python/unittest/test_autotvm_space.py
@@ -1,7 +1,7 @@
 """Test space definition primitives"""
 
 import tvm
-from tvm.autotvm.task.space import ConfigSpace
+from tvm.autotvm.task.space import ConfigSpace, FallbackConfigEntity
 
 def gemm_func(cfg, N):
     A = tvm.placeholder((N, N), name='A')
@@ -26,5 +26,18 @@ def test_split():
     assert len(cfg) == 64
     assert len(cfg.space_map['tile_y']) == 8
 
+    # test fallback
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(128), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+
+    assert cfg['tile_n'].size == [4, 8, 4]
+
+    cfg = FallbackConfigEntity()
+    cfg.define_split('tile_n', cfg.axis(49), num_outputs=3)
+    cfg.fallback_split('tile_n', [-1, 8, 4])
+
+    assert cfg['tile_n'].size == [7, 7, 1]
+
 if __name__ == '__main__':
     test_split()
diff --git a/tests/python/unittest/test_autotvm_xgboost_model.py b/tests/python/unittest/test_autotvm_xgboost_model.py
index 3488d0f599a5..58da219f2e48 100644
--- a/tests/python/unittest/test_autotvm_xgboost_model.py
+++ b/tests/python/unittest/test_autotvm_xgboost_model.py
@@ -12,7 +12,7 @@
 
 def test_fit():
     task, target = get_sample_task()
-    records = get_sample_records(n=100)
+    records = get_sample_records(n=500)
 
     base_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     base_model.fit_log(records, plan_size=32)
@@ -20,8 +20,8 @@ def test_fit():
     upper_model = XGBoostCostModel(task, feature_type='itervar', loss_type='rank')
     upper_model.load_basemodel(base_model)
 
-    xs = np.arange(100)
-    ys = np.arange(100)
+    xs = np.arange(10)
+    ys = np.arange(10)
 
     upper_model.fit(xs, ys, plan_size=32)
 
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 48bb4fb022c7..a3945a4c9d76 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -27,7 +27,14 @@ def _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype):
 @autotvm.task.dispatcher
 def conv2d_arm_cpu(data, kernel, strides, padding, layout, out_dtype):
     """TOPI compute callback. Mark this function as a dispatcher, so
-    this template can assign config according to workload"""
+    this template can assign config according to workload
+
+    Returns
+    -------
+    workload: Tuple
+        Dispatcher will use this workload to query corresponding config.
+        Then use cfg.template_key to call a registered template.
+    """
     return _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
 @conv2d_arm_cpu.register(['direct'])
@@ -70,8 +77,10 @@ def _callback(op):
 
 def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile):
     assert layout == "NCHW", "Only support NCHW"
-    out_dtype = out_dtype or data.dtype
+    # create workload according to raw arguments
+    wkl = _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)
 
+    out_dtype = out_dtype or data.dtype
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_packed = False
@@ -113,6 +122,18 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
     # ====================================================================
 
+    if cfg.is_fallback:
+        if num_tile == 2:
+            cfg.fallback_split('tile_co', [-1, 8])
+            cfg.fallback_split('tile_oh', [-1, 2])
+            cfg.fallback_split('tile_ow', [-1, 8])
+        else:
+            cfg.fallback_split('tile_co', [-1, 16, 4])
+            cfg.fallback_split('tile_oh', [-1, 1, 1])
+            cfg.fallback_split('tile_ow', [-1, 1, 4])
+        cfg['ann_reduce'].anns = ['unroll', 'unroll']
+        cfg['ann_spatial'].anns = ['none', 'unroll', 'vec']
+
     VC = cfg["tile_co"].size[-1]
     VH = cfg["tile_oh"].size[-1]
     VW = cfg["tile_ow"].size[-1]
@@ -145,8 +166,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n
     output = tvm.compute(oshape, lambda n, co, h, w:
                          conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC],
                          name='output_unpack', tag='spatial_conv2d_output',
-                         attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding,
-                                                                  layout, out_dtype)})
+                         attrs={'workload': wkl})
     return output
 
 def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec,
@@ -212,6 +232,10 @@ def decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype):
     return _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size)
 
 def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_size):
+    # create workload according to raw arguments
+    wkl = _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout,
+                                         out_dtype, tile_size)
+
     N, CI, IH, IW = get_const_tuple(data.shape)
     if len(kernel.shape) == 4:
         pre_computed = False
@@ -333,10 +357,9 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_
     output = tvm.compute((N, K, H, W), lambda n, k, h, w:
                          Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m],
                          name='output', tag='winograd_conv2d_output',
-                         attrs={'workload': _winograd_conv_arg_to_workload(
-                             data, kernel, strides, padding, layout, out_dtype, tile_size)})
+                         attrs={'workload': wkl})
 
-    # we have to manually assign effective GFLOP for winogard
+    # we have to manually assign effective GFLOP for winograd
     cfg.add_flop(2 * N * K * H * W * KH * KW * C)
     return output
 
@@ -358,30 +381,29 @@ def _schedule_winograd(cfg, s, output, last):
         kernel, G = U.op.input_tensors
         s[G].compute_inline()
         eps, nu, k, c, kk, = s[U].op.axis
-        r_kh, r_kw = s[U].op.reduce_axis
-        s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
-        s[U].unroll(eps)
-        s[U].unroll(nu)
-        s[U].unroll(r_kh)
-        s[U].unroll(r_kw)
-        s[U].vectorize(kk)
         if autotvm.GLOBAL_SCOPE.in_tuning:
             # kernel transformation will be pre-computed during compilation, so we skip
             # this part to make tuning records correct
-            s[U].pragma(k, 'debug_skip_region')
+            s[U].pragma(eps, 'debug_skip_region')
         else:
+            r_kh, r_kw = s[U].op.reduce_axis
+            s[U].reorder(k, c, eps, nu, r_kh, r_kw, kk)
+            for axis in [eps, nu, r_kh, r_kw]:
+                s[U].unroll(axis)
+            s[U].vectorize(kk)
             s[U].parallel(k)
 
+        if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag:
+            s[kernel].compute_inline()
+
     # transform image
     DD = s.cache_read(d, 'global', [V])
     s[B].compute_inline()
     eps, nu, b, c, bb = s[V].op.axis
     r_eps, r_nu = s[V].op.reduce_axis
     s[V].reorder(b, c, eps, nu, r_eps, r_nu, bb)
-    s[V].unroll(eps)
-    s[V].unroll(nu)
-    s[V].unroll(r_eps)
-    s[V].unroll(r_nu)
+    for axis in [eps, nu, r_eps, r_nu]:
+        s[V].unroll(axis)
     s[DD].compute_at(s[V], c)
     s[V].vectorize(bb)
     s[V].parallel(b)
@@ -405,10 +427,8 @@ def _schedule_winograd(cfg, s, output, last):
     s[A].compute_inline()
     k, b, vh, vw = s[Y].op.axis
     r_eps, r_nu = s[Y].op.reduce_axis
-    s[Y].unroll(vh)
-    s[Y].unroll(vw)
-    s[Y].unroll(r_eps)
-    s[Y].unroll(r_nu)
+    for axis in [vh, vw, r_eps, r_nu]:
+        s[Y].unroll(axis)
 
     # output
     n, co, h, w = s[last].op.axis
@@ -444,6 +464,7 @@ def _winograd_conv_arg_to_workload(data, kernel, strides, padding, layout, out_d
         [data, raw_kernel, strides, padding, layout, out_dtype])
 
 
+##### REGISTER TOPI COMPUTE / SCHEDULE FOR WINOGRAD WITH WEIGHT TRANSFORM #####
 @conv2d_winograd_without_weight_transform.register(['arm_cpu'])
 @autotvm.task.dispatcher
 def winograd_ww_config_dispatcher_(data, kernel, strides, padding, layout, out_dtype, tile_size):
@@ -472,6 +493,7 @@ def _callback(op):
     return s
 
 
+##### REGISTER ALTER OP LAYOUT #####
 @conv2d_alter_layout.register(["arm_cpu", "mali"])
 def _alter_conv2d_layout(attrs, inputs, tinfos):
     """Alter op layout for pre-computing kernel transformation"""
@@ -493,18 +515,30 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
         # query config of this workload
         workload = _conv_arg_to_workload(tinfos[0], tinfos[1], strides, padding,
                                          layout, out_dtype)
-        cfg = autotvm.task.DispatchContext.current.query(tvm.target.current_target(), workload)
+        cfg = autotvm.DispatchContext.current.query(tvm.target.current_target(), workload)
+
+        if cfg.is_fallback: # if is fallback, clear query cache and return None
+            context = autotvm.DispatchContext.current
+            while not isinstance(context, autotvm.FallbackContext):
+                context = context._old_ctx
+            context.clear_cache(tvm.target.current_target(), workload)
+            return None
 
         if cfg.template_key == 'direct':  # packing weight tensor
             new_attrs['kernel_layout'] = 'OIHW%do' % (cfg['tile_co'].size[-1])
             return sym.conv2d(*copy_inputs, **new_attrs)
         else:  # pre-compute weight transformation in winograd
-            tile_size = 4
+            if "-device=arm_cpu" in tvm.target.current_target().options:
+                tile_size = 4
+                VC = cfg['tile_k'].size[-1]
+            else:
+                from ..mali.conv2d import _pick_tile_size
+                tile_size = _pick_tile_size(tinfos[0], tinfos[1])
+                VC = cfg['tile_bna'].val
 
             weight = sym.contrib.conv2d_winograd_weight_transform(copy_inputs[1],
                                                                   tile_size=tile_size)
             CO, CI, KH, KW = get_const_tuple(tinfos[1].shape)
-            VC = cfg['tile_k'].size[-1]
             weight = sym.reshape(weight,
                                  shape=(KH + tile_size - 1, KW + tile_size - 1, CO // VC, VC, CI))
             weight = sym.transpose(weight, axes=[0, 1, 2, 4, 3])
diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index 8aafc436319f..e066a1e29435 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -14,16 +14,21 @@
 
 # register customized schedule for arm cpu.
 @autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
-def schedule_depthwise_conv2d_nchw_(cfg, outs):
+def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
     Parameters
     ----------
     cfg: ConfigEntity
-        The configuration of this tempalte
+        The configuration of this template
     outs: Array of Tensor
         The computation graph description of depthwise convolution2d
         in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for depthwise_conv2d nchw.
     """
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
@@ -38,6 +43,11 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         cfg.define_split('tile_h', h, num_outputs=2)
         cfg.define_split('tile_w', w, num_outputs=2)
 
+        if cfg.is_fallback:
+            cfg.fallback_split('tile_c', [-1, 8])
+            cfg.fallback_split('tile_h', [-1, 2])
+            cfg.fallback_split('tile_w', [-1, 8])
+
         # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
         A0 = s.cache_read(data_pad, "global", C)
         _, c, h, w = s[A0].op.axis
diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py
index ac552903ad7f..06847bf9f427 100644
--- a/topi/python/topi/x86/injective.py
+++ b/topi/python/topi/x86/injective.py
@@ -29,7 +29,7 @@ def schedule_injective(outs):
     elif len(s[x].op.axis) >= 3:
         fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1])
         s[x].parallel(fused)
-    else:
+    elif len(s[x].op.axis) >= 1:
         s[x].parallel(s[x].op.axis[0])
     return s
 
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
new file mode 100644
index 000000000000..d992be9292fc
--- /dev/null
+++ b/topi/tests/python/common.py
@@ -0,0 +1,12 @@
+"""Common utility for topi test"""
+
+def get_all_backend():
+    """return all supported target
+
+    Returns
+    -------
+    targets: list
+        A list of all supported targets
+    """
+    return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
+            'llvm -device=arm_cpu']
diff --git a/topi/tests/python/test_topi_bitserial_conv2d.py b/topi/tests/python/test_topi_bitserial_conv2d.py
index 6df18483a45f..82af0006c2ef 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d.py
@@ -1,11 +1,8 @@
-import os
 import numpy as np
 import tvm
 import topi
 import topi.testing
-from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
-from tvm.contrib import util
 from tvm.contrib.pickle_memoize import memoize
 
 def generate_quantized_np(shape, bits, out_dtype):
@@ -16,23 +13,23 @@ def generate_quantized_np(shape, bits, out_dtype):
 def verify_bitserial_conv2d_nchw(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
     activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.create('llvm'):
         A = tvm.placeholder((batch, in_channel, in_height, in_width), dtype=input_type, name='A')
         W = tvm.placeholder((num_filter, in_channel, kernel, kernel), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, 
-            out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
+                                     out_dtype=out_dtype, layout="NCHW", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nchw([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nchw")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -61,16 +58,16 @@ def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel,
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
 
+    @memoize("topi.tests.test_topi_bitseral_conv2d_nhwc")
     def get_ref_data():
-        a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
-        w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
+        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_type)
+        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_type)
         if dorefa:
             w_ = np.copy(w_np).astype(out_dtype)
             for x in np.nditer(w_, op_flags=['readwrite']):
@@ -109,4 +106,4 @@ def test_bitserial_conv2d():
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 2, False)
 
 if __name__ == "__main__":
-    test_bitserial_conv2d()
\ No newline at end of file
+    test_bitserial_conv2d()
diff --git a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
index 3de954abc291..de467818d37f 100644
--- a/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
+++ b/topi/tests/python/test_topi_bitserial_conv2d_rasp.py
@@ -4,10 +4,6 @@
 import tvm
 import topi
 import topi.testing
-from topi.util import get_const_tuple
-from tvm.contrib import util
-
-target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon'
 
 def generate_quantized_np(shape, bits, out_dtype):
     np.random.seed(0)
@@ -17,20 +13,19 @@ def generate_quantized_np(shape, bits, out_dtype):
 
 # Verify that certain special instructions from the tensorize pass exist
 def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride, padding, 
-                        activation_bits, weight_bits, dorefa):
+                                 activation_bits, weight_bits, dorefa):
     in_height = in_width = in_size
-    input_type='uint32'
-    out_dtype='int32'
+    input_type = 'uint32'
+    out_dtype = 'int32'
 
     with tvm.target.arm_cpu('rasp3b'):
         A = tvm.placeholder((batch, in_height, in_width, in_channel), dtype=input_type, name='A')
         W = tvm.placeholder((kernel, kernel, in_channel, num_filter), dtype=input_type, name='W')
         B = topi.nn.bitserial_conv2d(A, W, stride, padding, activation_bits, weight_bits, out_dtype=out_dtype, 
-                            layout="NHWC", dorefa=dorefa)
+                                     layout="NHWC", dorefa=dorefa)
         s = topi.generic.schedule_bitserial_conv2d_nhwc([B])
 
-    
-    func = tvm.build(s, [A, W, B], target)
+    func = tvm.build(s, [A, W, B], tvm.target.arm_cpu('rasp3b'))
    
     assembly = func.get_source('asm')
     matches = re.findall("vpadal", assembly)
@@ -47,7 +42,6 @@ def test_bitserial_conv2d():
     stride = 1
     pad = 1
 
-
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 1, 1, False)
     verify_bitserial_conv2d_nhwc(1, in_size, ic, oc, k, stride, pad, 2, 1, False)
 
diff --git a/topi/tests/python/test_topi_bnn.py b/topi/tests/python/test_topi_bnn.py
index 90abc68e6b68..cf9f377e9e1c 100644
--- a/topi/tests/python/test_topi_bnn.py
+++ b/topi/tests/python/test_topi_bnn.py
@@ -28,7 +28,7 @@ def get_ref_data():
         a_np = (np.random.randint(2, size=(batch, in_dim)) * 2 - 1).astype(dtype)
         b_np = (np.random.randint(2, size=(out_dim, in_dim)) * 2 - 1).astype(dtype)
         c_np = np.dot(a_np, b_np.T)
-        return (a_np, b_np, c_np)
+        return a_np, b_np, c_np
 
     a_np, b_np, c_np = get_ref_data()
 
diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py
index f888033b3914..4ed5b31708e4 100644
--- a/topi/tests/python/test_topi_broadcast.py
+++ b/topi/tests/python/test_topi_broadcast.py
@@ -1,5 +1,5 @@
 """Test code for broadcasting operators."""
-import os
+from common import get_all_backend
 import numpy as np
 import tvm
 import topi
@@ -8,6 +8,7 @@ def verify_broadcast_to_ele(in_shape, out_shape, fbcast):
     # Build the logic and compile the function
     A = tvm.placeholder(shape=in_shape, name="A")
     B = fbcast(A, out_shape)
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -21,16 +22,11 @@ def check_device(device):
         out_npy = np.broadcast_to(data_npy, out_shape)
         data_nd = tvm.nd.array(data_npy, ctx)
         out_nd = tvm.nd.array(np.empty(out_shape).astype(B.dtype), ctx)
-        for _ in range(1):
-            foo(data_nd, out_nd)
+        foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    check_device("vulkan")
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 
@@ -45,9 +41,10 @@ def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
     B = (tvm.var("B", dtype=dtype) if rhs_shape is None
          else tvm.placeholder(shape=rhs_shape, name="B", dtype=dtype))
     C = ftopi(A, B)
-    if (isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr)):
+    if isinstance(A, tvm.expr.Expr) and isinstance(B, tvm.expr.Expr):
         assert(isinstance(C, tvm.expr.Expr))
         return
+
     def check_device(device):
         ctx = tvm.context(device, 0)
         if not ctx.exist:
@@ -82,12 +79,8 @@ def check_device(device):
         foo(lhs_nd, rhs_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)
 
-    check_device("opencl")
-    check_device("vulkan")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("nvptx")
+    for target in get_all_backend():
+        check_device(target)
     check_device("sdaccel")
 
 def test_broadcast_to():
diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py
index ffc89aeb9bc3..f1367463eb4f 100644
--- a/topi/tests/python/test_topi_clip.py
+++ b/topi/tests/python/test_topi_clip.py
@@ -5,6 +5,7 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_clip(N, a_min, a_max, dtype):
     A = tvm.placeholder((N, N), dtype=dtype, name='A')
@@ -34,7 +35,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['llvm', 'opencl', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_clip():
diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py
deleted file mode 100644
index 365fdf551c4f..000000000000
--- a/topi/tests/python/test_topi_conv2d.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""Example code to do conv2d."""
-import os
-import numpy as np
-import tvm
-from tvm import autotvm
-import topi
-import topi.testing
-from tvm.contrib.pickle_memoize import memoize
-from topi.util import get_const_tuple
-
-
-def verify_conv2d(batch, in_size, in_channel, num_filter, kernel, stride, padding):
-    in_height = in_width = in_size
-
-    with tvm.target.arm_cpu():
-        A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
-        W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
-        B = topi.nn.conv2d(A, W, (stride, stride), (padding, padding), 'NCHW', 'float32')
-        s = topi.generic.schedule_conv2d_nchw([B])
-
-    a_shape = get_const_tuple(A.shape)
-    w_shape = get_const_tuple(W.shape)
-    dtype = A.dtype
-
-    @memoize("topi.tests.test_topi_conv2d.verify_conv2d")
-    def get_ref_data():
-        a_np = np.random.uniform(size=a_shape).astype(dtype)
-        w_np = np.random.uniform(size=w_shape).astype(dtype)
-        b_np = topi.testing.conv2d_nchw_python(a_np, w_np, stride, padding)
-        return a_np, w_np, b_np
-
-    a_np, w_np, b_np = get_ref_data()
-
-    ctx = tvm.cpu(0)
-    a = tvm.nd.array(a_np, ctx)
-    w = tvm.nd.array(w_np, ctx)
-    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
-    func = tvm.build(s, [A, W, B], "llvm")
-    func(a, w, b)
-    np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-
-def test_conv2d():
-    with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True):
-        verify_conv2d(1, 56, 64, 64, 3, 1, 1)
-
-if __name__ == "__main__":
-    test_conv2d()
diff --git a/topi/tests/python/test_topi_conv2d_hwcn.py b/topi/tests/python/test_topi_conv2d_hwcn.py
index 1ff4b02470c4..af1afcb9ea9a 100644
--- a/topi/tests/python/test_topi_conv2d_hwcn.py
+++ b/topi/tests/python/test_topi_conv2d_hwcn.py
@@ -43,14 +43,12 @@ def check_device(device):
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
     for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
         check_device(device)
diff --git a/topi/tests/python/test_topi_conv2d_nchw.py b/topi/tests/python/test_topi_conv2d_nchw.py
index c663384b8187..6f367d10c048 100644
--- a/topi/tests/python/test_topi_conv2d_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_nchw.py
@@ -1,31 +1,41 @@
 """Example code to do convolution."""
-import os
+
 import numpy as np
 import tvm
+from tvm import autotvm
 import topi
 import topi.testing
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
-def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1):
+from common import get_all_backend
+
+def verify_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False):
     print("Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding))
 
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((num_filter, in_channel, kernel, kernel), name='W')
+    bias = tvm.placeholder((num_filter, 1, 1), name='bias')
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
+    bias_shape = get_const_tuple(bias.shape)
     dtype = A.dtype
 
     @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
     def get_ref_data():
         a_np = np.random.uniform(size=a_shape).astype(dtype)
         w_np = np.random.uniform(size=w_shape).astype(dtype)
+        b_np = np.random.uniform(size=bias_shape).astype(dtype)
         dw_np = topi.testing.dilate_python(w_np, (1, 1, dilation, dilation))
-        b_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
-        c_np = np.maximum(b_np, 0)
+        c_np = topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding)
+        if add_bias:
+            b_np = np.random.uniform(size=bias_shape).astype(dtype)
+            c_np += b_np
+        if add_relu:
+            c_np = np.maximum(c_np, 0)
         return a_np, w_np, b_np, c_np
 
     a_np, w_np, b_np, c_np = get_ref_data()
@@ -38,66 +48,103 @@ def check_device(device):
         print("Running on target: %s" % device)
         with tvm.target.create(device):
             dW = topi.nn.dilate(W, (1, 1, dilation, dilation))
-            B = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW')
-            C = topi.nn.relu(B)
-            s1 = topi.generic.schedule_conv2d_nchw([B])
-            s2 = topi.generic.schedule_conv2d_nchw([C])
+            C = topi.nn.conv2d(A, dW, stride, padding, layout='NCHW', out_dtype=dtype)
+            if add_bias:
+                C = topi.add(C, bias)
+            if add_relu:
+                C = topi.nn.relu(C)
+            s = topi.generic.schedule_conv2d_nchw([C])
+
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
-        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
+        b = tvm.nd.array(b_np, ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        no_unroll_explicit = device in ["cuda", "nvptx", "rocm"]
-        with tvm.build_config(auto_unroll_max_step=1400,
-                              unroll_explicit=not no_unroll_explicit):
-            func1 = tvm.build(s1, [A, W, B], device, name="conv2d_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func2 = tvm.build(s2, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
-
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+        if add_bias:
+            func = tvm.build(s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, b, c)
+        else:
+            func = tvm.build(s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation))
+            func(a, w, c)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
         check_device(device)
 
 
 def test_conv2d_nchw():
+    autotvm.DispatchContext.current.silent = True
+
     # ResNet18 workloads
-    verify_conv2d_nchw(1, 3, 224, 64, 7, 2, 3)
-    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 64, 56, 128, 3, 2, 1)
-    verify_conv2d_nchw(1, 64, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 128, 3, 1, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 3, 2, 1)
-    verify_conv2d_nchw(1, 128, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 256, 3, 1, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 3, 2, 1)
-    verify_conv2d_nchw(1, 256, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 512, 3, 1, 1)
-    # ResNet50 workloads
-    verify_conv2d_nchw(1, 64, 56, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 64, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 128, 1, 2, 0)
-    verify_conv2d_nchw(1, 128, 28, 512, 1, 1, 0)
-    verify_conv2d_nchw(1, 256, 56, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 28, 128, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 256, 1, 2, 0)
-    verify_conv2d_nchw(1, 256, 14, 1024, 1, 1, 0)
-    verify_conv2d_nchw(1, 512, 28, 1024, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 256, 1, 1, 0)
-    verify_conv2d_nchw(1, 1024, 14, 512, 1, 2, 0)
-    verify_conv2d_nchw(1, 512, 7, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 1024, 14, 2048, 1, 2, 0)
-    verify_conv2d_nchw(1, 2048, 7, 512, 1, 1, 0)
-    # Vgg16 workloads
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1)
-    # Super resolution workloads
-    verify_conv2d_nchw(1, 1, 224, 64, 5, 1, 2)
-    verify_conv2d_nchw(1, 64, 224, 64, 3, 1, 1)
-    verify_conv2d_nchw(1, 64, 224, 32, 3, 1, 1)
-    verify_conv2d_nchw(1, 32, 224, 9, 3, 1, 1)
+    verify_conv2d_nchw(1,   3, 224,  64, 7, 2, 3)
+    verify_conv2d_nchw(1,  64,  56,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,  64,  56,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  64,  56, 128, 3, 2, 1)
+    verify_conv2d_nchw(1,  64,  56, 128, 1, 2, 0)
+    verify_conv2d_nchw(1, 128,  28, 128, 3, 1, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 3, 2, 1)
+    verify_conv2d_nchw(1, 128,  28, 256, 1, 2, 0)
+    verify_conv2d_nchw(1, 256,  14, 256, 3, 1, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 3, 2, 1)
+    verify_conv2d_nchw(1, 256,  14, 512, 1, 2, 0)
+    verify_conv2d_nchw(1, 512,   7, 512, 3, 1, 1)
+
+    # bias, relu
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_relu=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, add_bias=True, add_relu=True)
+
     # dilation = 2
-    verify_conv2d_nchw(1, 128, 122, 128, 3, 1, 1, dilation=2)
+    verify_conv2d_nchw(1, 64, 56, 64, 3, 1, 1, dilation=2)
+
+    # weird workloads
+    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=1)
+    verify_conv2d_nchw(1, 1, 1, 1, 1, 1, 1, dilation=2)
+
+    # inception v3 workloads
+    verify_conv2d_nchw(1,    3, 299,  32, 3, 2, 0)
+    verify_conv2d_nchw(1,   32, 149,  32, 3, 1, 0)
+    verify_conv2d_nchw(1,   32, 147,  64, 3, 1, 1)
+    verify_conv2d_nchw(1,   64,  73,  80, 1, 1, 0)
+    verify_conv2d_nchw(1,   80,  73, 192, 3, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  192,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,   48,  35,  64, 5, 1, 2)
+    verify_conv2d_nchw(1,   64,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,   96,  35,  96, 3, 1, 1)
+    verify_conv2d_nchw(1,  192,  35,  32, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  256,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  64, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35,  48, 1, 1, 0)
+    verify_conv2d_nchw(1,  288,  35, 384, 3, 2, 0)
+    # verify_conv2d_nchw(1,   96,  35,  96, 3, 2, 0)
+    # verify_conv2d_nchw(1,  768,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  768,  17, 128, 1, 1, 0)
+    # verify_conv2d_nchw(1,  128,  17, 128, 1, 1, 0)
+    # verify_conv2d_nchw(1,  128,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  128,  17, 128, 7, 1, 3)
+    # verify_conv2d_nchw(1,  128,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  768,  17, 160, 1, 1, 0)
+    # verify_conv2d_nchw(1,  160,  17, 160, 1, 1, 0)
+    # verify_conv2d_nchw(1,  160,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  160,  17, 160, 7, 1, 3)
+    # verify_conv2d_nchw(1,  160,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 1, 1, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 7, 1, 3)
+    # verify_conv2d_nchw(1,  192,  17, 320, 3, 2, 0)
+    # verify_conv2d_nchw(1,  192,  17, 192, 3, 2, 0)
+    verify_conv2d_nchw(1, 1280,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 1280,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1,  384,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1,  448,   8, 384, 3, 1, 1)
+    verify_conv2d_nchw(1, 1280,   8, 192, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 320, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 384, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 448, 1, 1, 0)
+    verify_conv2d_nchw(1, 2048,   8, 192, 1, 1, 0)
+
 
 if __name__ == "__main__":
     test_conv2d_nchw()
diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
index 0c985400031a..5f65c038be60 100644
--- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py
+++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py
@@ -6,14 +6,13 @@
 from tvm.contrib.pickle_memoize import memoize
 from topi.util import get_const_tuple
 
+from common import get_all_backend
 
 def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, stride, padding):
     in_height = in_width = in_size
 
     A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A')
     W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W')
-    B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype)
-    C = topi.nn.relu(B)
 
     a_shape = get_const_tuple(A.shape)
     w_shape = get_const_tuple(W.shape)
@@ -36,22 +35,23 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], [padding, padding], A.dtype)
+            C = topi.nn.relu(B)
             s1 = topi.generic.schedule_conv2d_transpose_nchw([B])
             s2 = topi.generic.schedule_conv2d_transpose_nchw([C])
         a = tvm.nd.array(a_np, ctx)
         w = tvm.nd.array(w_np, ctx)
         b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
         c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
-        with tvm.build_config(auto_unroll_max_step=128,
-                              unroll_explicit=(device != "cuda")):
-            func1 = tvm.build(s1, [A, W, B], device)
-            func2 = tvm.build(s2, [A, W, C], device)
-            func1(a, w, b)
-            func2(a, w, c)
-            np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
-            np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+        func1 = tvm.build(s1, [A, W, B], device)
+        func2 = tvm.build(s2, [A, W, C], device)
+        func1(a, w, b)
+        func2(a, w, c)
+        np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
+        np.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_dense.py b/topi/tests/python/test_topi_dense.py
index 2df43eb30887..92f95f3e0497 100644
--- a/topi/tests/python/test_topi_dense.py
+++ b/topi/tests/python/test_topi_dense.py
@@ -6,13 +6,12 @@
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
 
+from common import get_all_backend
 
 def verify_dense(batch, in_dim, out_dim, use_bias=True):
     A = tvm.placeholder((batch, in_dim), name='A')
     B = tvm.placeholder((out_dim, in_dim), name='B')
     C = tvm.placeholder((out_dim,), name='C')
-    D = topi.nn.dense(A, B, C if use_bias else None)
-    D = topi.nn.relu(D)
     dtype = A.dtype
 
     # use memoize to pickle the test data for next time use
@@ -36,6 +35,8 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            D = topi.nn.dense(A, B, C if use_bias else None)
+            D = topi.nn.relu(D)
             s = topi.generic.schedule_dense(D)
         a = tvm.nd.array(a_np, ctx)
         b = tvm.nd.array(b_np, ctx)
@@ -45,13 +46,15 @@ def check_device(device):
         f(a, b, c, d)
         np.testing.assert_allclose(d.asnumpy(), d_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_dense():
     verify_dense(1, 1024, 1000, use_bias=True)
     verify_dense(1, 1024, 1000, use_bias=False)
 
+    verify_dense(2, 1024, 1000, use_bias=True)
+
 
 if __name__ == "__main__":
     test_dense()
diff --git a/topi/tests/python/test_topi_depthwise_conv2d.py b/topi/tests/python/test_topi_depthwise_conv2d.py
index 3086054ba487..8c27af8390fe 100644
--- a/topi/tests/python/test_topi_depthwise_conv2d.py
+++ b/topi/tests/python/test_topi_depthwise_conv2d.py
@@ -2,11 +2,10 @@
 import topi
 import topi.testing
 import numpy as np
-from scipy import signal
 from topi.util import get_const_tuple
 from tvm.contrib.pickle_memoize import memoize
-from topi.cuda.depthwise_conv2d import schedule_depthwise_conv2d_nhwc
 
+from common import get_all_backend
 
 def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1):
     in_width = in_height
@@ -18,10 +17,6 @@ def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height, channel_mu
     DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
-    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -30,6 +25,10 @@ def check_device(device):
             return
         print("Running on target: %s" % device)
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, DilatedFilter, stride=stride, padding=padding)
+            ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
             # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nchw(ScaleShift)
@@ -88,12 +87,8 @@ def get_ref_data():
         np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
+    for device in get_all_backend():
+        check_device(device)
 
 
 def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1):
@@ -107,11 +102,6 @@ def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_mu
     DilatedFilter = topi.nn.dilate(Filter, (1, 1, dilation, dilation), name='DilatedFilter')
     Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
     Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
-    # declare
-    DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
-    ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
-    Relu = topi.nn.relu(ScaleShift)
-    # schedule
 
     def check_device(device):
         ctx = tvm.context(device, 0)
@@ -121,6 +111,11 @@ def check_device(device):
         print("Running on target: %s" % device)
 
         with tvm.target.create(device):
+            # declare
+            DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, DilatedFilter, stride=[stride_h, stride_w], padding=padding)
+            ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift)
+            Relu = topi.nn.relu(ScaleShift)
+            # schedule
             s1 = topi.generic.schedule_depthwise_conv2d_nhwc(DepthwiseConv2d)
             s2 = topi.generic.schedule_depthwise_conv2d_nhwc(ScaleShift)
             s3 = topi.generic.schedule_depthwise_conv2d_nhwc(Relu)
@@ -180,12 +175,9 @@ def get_ref_data():
         np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
         np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
 
-    check_device("opencl")
-    check_device("cuda")
-    check_device("metal")
-    check_device("rocm")
-    check_device("vulkan")
-    check_device("nvptx")
+    for device in get_all_backend():
+        check_device(device)
+
 
 def test_depthwise_conv2d():
     print("testing nchw")
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index f3d1c62bdaf2..e85786037477 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -312,7 +312,9 @@ def tune_and_evaluate():
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key, timeout=10000)
+        remote = autotvm.measure.request_remote(device_key,
+                                                tracker_addr=('localhost', 9190),
+                                                timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
 
@@ -333,7 +335,6 @@ def tune_and_evaluate():
 
 # We do not run the tuning in our webpage server since it takes too long.
 # Uncomment the following line to run by yourself.
-
 # tune_and_evaluate()
 
 ######################################################################

From ca7681097d4fb483e24bfbd670b091064f4c4df2 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 09:23:07 -0700
Subject: [PATCH 46/77] [TOPI][ARM CPU] fuse bias to depthwise conv2d (#1631)

---
 topi/python/topi/arm_cpu/depthwise_conv2d.py | 25 ++++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py
index e066a1e29435..c341d1a5b325 100644
--- a/topi/python/topi/arm_cpu/depthwise_conv2d.py
+++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py
@@ -9,11 +9,11 @@
 from ..util import traverse_inline
 
 # register original implementation of depthwise_conv2d_nchw since we don't need to change this part
-autotvm.task.register_topi_compute(depthwise_conv2d_nchw, 'arm_cpu', 'direct',
+autotvm.task.register_topi_compute(depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct',
                                    depthwise_conv2d_nchw.fdefault)
 
 # register customized schedule for arm cpu.
-@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct')
+@autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, ['arm_cpu', 'cpu'], 'direct')
 def schedule_depthwise_conv2d_nchw_arm(cfg, outs):
     """Schedule depthwise conv2d
 
@@ -44,15 +44,15 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         cfg.define_split('tile_w', w, num_outputs=2)
 
         if cfg.is_fallback:
-            cfg.fallback_split('tile_c', [-1, 8])
+            cfg.fallback_split('tile_c', [-1, 4])
             cfg.fallback_split('tile_h', [-1, 2])
-            cfg.fallback_split('tile_w', [-1, 8])
+            cfg.fallback_split('tile_w', [-1, 4])
 
         # park data to vector form  [n, c, h, w] -> [n, C, h, w, VC]
         A0 = s.cache_read(data_pad, "global", C)
-        _, c, h, w = s[A0].op.axis
+        n, c, h, w = s[A0].op.axis
         c, vc = cfg['tile_c'].apply(s, A0, c)
-        s[A0].reorder(c, h, w, vc)
+        s[A0].reorder(n, c, h, w, vc)
         A1 = s.cache_write(A0, 'global')
         s[A0].compute_inline()
 
@@ -64,9 +64,9 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
         B1 = s.cache_write(B0, 'global')
         s[B0].compute_inline()
 
-        _, c, h, w = s[C].op.axis
+        n, c, h, w = s[C].op.axis
         c, vc, = cfg['tile_c'].apply(s, C, c)
-        s[C].reorder(c, h, w, vc)
+        s[C].reorder(n, c, h, w, vc)
 
         # depthwise conv
         C0 = s.cache_write(C, 'global')
@@ -86,9 +86,14 @@ def _schedule(cfg, s, data, data_pad, kernel, output):
                          max_unroll=16,
                          cfg=cfg)
 
+        # fusion
+        if C.op not in s.outputs:
+            s[C].compute_inline()
+
         # mark parallel
-        n, c, h, w = s[C].op.axis
-        s[C].parallel(c)
+        last = outs[0]
+        n, c, h, w = s[last].op.axis
+        s[last].parallel(c)
 
         n, c, h, w, vc = s[C0].op.axis
         s[C0].parallel(c)

From d060e9195752b2bb0cd4b58bcc56b4a9729a3b80 Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Wed, 22 Aug 2018 21:53:55 +0530
Subject: [PATCH 47/77] [FRONTEND]minor bug fixes (#1632)

---
 nnvm/python/nnvm/frontend/tensorflow.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 092b8fa20219..65dd3619b5b2 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -430,7 +430,6 @@ def _impl(inputs, attr, params):
 
 def _lrn():
     def _impl(inputs, attr, params):
-        new_inputs = []
         attr_new = {}
         depth_radius = attr.get('depth_radius', 5)
         size = (depth_radius * 2) + 1
@@ -439,7 +438,7 @@ def _impl(inputs, attr, params):
         attr_new['bias'] = attr.get('bias', 1)
         attr_new['alpha'] = attr.get('alpha', 1) * size
         attr_new['beta'] = attr.get('beta', 0.5)
-        return AttrCvt(op_name='lrn')(new_inputs, attr_new)
+        return AttrCvt(op_name='lrn')(inputs, attr_new)
     return _impl
 
 def _sum():
@@ -613,7 +612,7 @@ def _impl(inputs, in_state_c, in_state_h, attr, params):
         ixh = _sym.concatenate(*[in_data, in_state_h], axis=1)
         in_weight = _sym.transpose(in_weight)
         gates = _sym.dense(ixh, in_weight, in_bias, use_bias=True,
-                           units=num_hidden_layers, name="dense")
+                           units=num_hidden_layers)
         gate_list = _sym.split(gates, indices_or_sections=4, axis=1)
         in_gate = _sym.sigmoid(gate_list[0])
         in_transform = _sym.tanh(gate_list[1])

From b11f2a0495541cb348ae89093fd233d78eefec6e Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Aug 2018 11:18:05 -0700
Subject: [PATCH 48/77] [ATTRS] change AttrFieldInfo->Node (#1634)

---
 include/tvm/attrs.h | 45 +++++++++++++++++++++++++++++----------------
 src/lang/attrs.cc   |  2 +-
 2 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index aed6b1ff722f..3e5169ba02b8 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -69,15 +69,27 @@ struct AttrError : public dmlc::Error {
 /*!
  * \brief Information about attribute fields in string representations.
  */
-struct AttrFieldInfo {
+class AttrFieldInfoNode : public Node {
+ public:
   /*! \brief name of the field */
   std::string name;
   /*! \brief type docstring information in str. */
   std::string type_info;
   /*! \brief detailed description of the type */
   std::string description;
+
+  void VisitAttrs(AttrVisitor* v) final {
+    v->Visit("name", &name);
+    v->Visit("type_info", &type_info);
+    v->Visit("description", &description);
+  }
+  static constexpr const char* _type_key = "AttrFieldInfo";
+  TVM_DECLARE_NODE_TYPE_INFO(AttrFieldInfoNode, Node);
 };
 
+/*! \brief AttrFieldInfo */
+TVM_DEFINE_NODE_REF(AttrFieldInfo, AttrFieldInfoNode);
+
 /*!
  * \brief Base class of all attribute class
  * \note Do not subclass AttrBaseNode directly,
@@ -104,7 +116,7 @@ class BaseAttrsNode : public Node {
    * \brief Get the field information about the
    * \note This function throws when the required a field is not present.
    */
-  TVM_DLL virtual std::vector<AttrFieldInfo> ListFieldInfo() const = 0;
+  TVM_DLL virtual Array<AttrFieldInfo> ListFieldInfo() const = 0;
   /*!
    * \brief Initialize the attributes by arguments.
    * \param kwargs The key value pairs for initialization.
@@ -159,7 +171,7 @@ class DictAttrsNode : public BaseAttrsNode {
   // implementations
   void VisitAttrs(AttrVisitor* v) final;
   void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final;
-  std::vector<AttrFieldInfo> ListFieldInfo() const final;
+  Array<AttrFieldInfo> ListFieldInfo() const final;
   // type info
   static constexpr const char* _type_key = "DictAttrs";
   TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode);
@@ -430,7 +442,7 @@ class AttrDocEntry {
  public:
   using TSelf = AttrDocEntry;
 
-  explicit AttrDocEntry(AttrFieldInfo* info)
+  explicit AttrDocEntry(std::shared_ptr<AttrFieldInfoNode> info)
       : info_(info) {
   }
   TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) {
@@ -454,21 +466,22 @@ class AttrDocEntry {
   }
 
  private:
-  AttrFieldInfo* info_;
+  std::shared_ptr<AttrFieldInfoNode> info_;
 };
 
 class AttrDocVisitor {
  public:
   template<typename T>
   AttrDocEntry operator()(const char* key, T* v) {
-    AttrFieldInfo info;
-    info.name = key;
-    info.type_info = TypeName<T>::value;
-    fields_.emplace_back(std::move(info));
-    return AttrDocEntry(&(fields_.back()));
+    std::shared_ptr<AttrFieldInfoNode> info
+        = std::make_shared<AttrFieldInfoNode>();
+    info->name = key;
+    info->type_info = TypeName<T>::value;
+    fields_.push_back(AttrFieldInfo(info));
+    return AttrDocEntry(info);
   }
 
-  std::vector<AttrFieldInfo> fields_;
+  Array<AttrFieldInfo> fields_;
 };
 
 class AttrExistVisitor {
@@ -557,7 +570,7 @@ class AttrsNode : public BaseAttrsNode {
     }
   }
 
-  std::vector<AttrFieldInfo> ListFieldInfo() const final {
+  Array<AttrFieldInfo> ListFieldInfo() const final {
     detail::AttrDocVisitor visitor;
     self()->__VisitAttrs__(visitor);
     return visitor.fields_;
@@ -580,11 +593,11 @@ inline void BaseAttrsNode::InitBySeq(Args&& ...args) {
 }
 
 inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*)
-  std::vector<AttrFieldInfo> entry = this->ListFieldInfo();
+  Array<AttrFieldInfo> entry = this->ListFieldInfo();
   for (AttrFieldInfo info : entry) {
-    os << info.name << " : " << info.type_info << '\n';
-    if (info.description.length() != 0) {
-      os << "    " << info.description << '\n';
+    os << info->name << " : " << info->type_info << '\n';
+    if (info->description.length() != 0) {
+      os << "    " << info->description << '\n';
     }
   }
 }
diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 49a91983e79d..0d8d1f3c9ece 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -25,7 +25,7 @@ void DictAttrsNode::InitByPackedArgs(
   }
 }
 
-std::vector<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
+Array<AttrFieldInfo> DictAttrsNode::ListFieldInfo() const {
   return {};
 }
 

From d90c1e450ada4593e12890cc01d25eb333a3eb69 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 07:36:09 +0900
Subject: [PATCH 49/77] [NNVM][KERAS] Add cropping support (#1636)

---
 nnvm/python/nnvm/frontend/keras.py              | 17 ++++++++++++++++-
 .../tests/python/frontend/keras/test_forward.py | 15 +++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 15493d18e7bb..3cfa481762e2 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -311,6 +311,21 @@ def _convert_upsample(insym, keras_layer, _):
     return _sym.upsampling(insym, **params)
 
 
+def _convert_cropping(insym, keras_layer, _):
+    """Lower a Keras Cropping2D layer to strided_slice on the last two axes."""
+    _check_data_format(keras_layer)
+    layer_kind = type(keras_layer).__name__
+    if layer_kind == "Cropping1D":
+        raise NotImplementedError("Cropping1D not implemented")
+    if layer_kind != "Cropping2D":
+        raise TypeError("Unrecognized cropping type : {}".format(layer_kind))
+    _, rows, cols, _ = keras_layer.input_shape
+    (top, bottom), (left, right) = keras_layer.cropping
+    unbounded = np.iinfo(np.int32).max  # "to the end" for the two uncropped axes
+    return _sym.strided_slice(insym, begin=[0, 0, top, left],
+                              end=[unbounded, unbounded, rows - bottom, cols - right])
+
+
 def _convert_batchnorm(insym, keras_layer, symtab):
     params = {'scale': False,
               'center': False,
@@ -409,6 +424,7 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     'Multiply'                 : _convert_merge,
     'ZeroPadding2D'            : _convert_padding,
     'UpSampling2D'             : _convert_upsample,
+    'Cropping2D'               : _convert_cropping,
 
     # 'ZeroPadding1D'          : _convert_padding,
     # 'AveragePooling1D'       : _convert_pooling,
@@ -416,7 +432,6 @@ def _default_skip(insym, keras_layer, _): # pylint: disable=unused-argument
     # 'GlobalAveragePooling1D' : _convert_pooling,
     # 'GlobalMaxPooling1D'     : _convert_pooling,
     # 'Cropping1D'             : _convert_cropping,
-    # 'Cropping2D'             : _convert_cropping,
     # 'UpSampling1D'           : _convert_upsample,
     # 'UpSampling3D'           : _convert_upsample,
     # 'Conv1D'                 : _convert_convolution1d,
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 17c9fc1329d7..5c27a3e38099 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -110,6 +110,20 @@ def test_forward_reshape():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_crop():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Cropping2D(cropping=((1, 1), (1, 1)))(data)  # pair-of-pairs form
+    x = keras.layers.Cropping2D(cropping=(1, 1))(x)  # pair-of-ints form
+    x = keras.layers.Cropping2D(cropping=1)(x)  # single-int form
+    x = keras.layers.Cropping2D(cropping=((0, 1), (1, 0)))(x)  # unequal amounts per side
+    x = keras.layers.Cropping2D(cropping=(1, 0))(x)  # one of the two amounts is zero
+    x = keras.layers.Cropping2D(cropping=0)(x)  # zero crop
+    x = keras.layers.Add()([x, x])
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_vgg16():
     keras_model = keras.applications.vgg16.VGG16(include_top=True, weights=None,
         input_shape=(224,224,3), classes=1000)
@@ -196,6 +210,7 @@ def test_forward_reuse_layers():
     test_forward_separable_conv()
     test_forward_upsample()
     test_forward_reshape()
+    test_forward_crop()
     test_forward_vgg16()
     test_forward_xception()
     test_forward_resnet50()

From 58b2395d2d09036852e12ee7f751b26997c63741 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 07:48:01 +0900
Subject: [PATCH 50/77] [NNVM][KERAS] Fixed padding in pooling (#1635)

---
 nnvm/python/nnvm/frontend/keras.py               |  4 +---
 nnvm/tests/python/frontend/keras/test_forward.py | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 3cfa481762e2..3a0a25aa4979 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -269,14 +269,12 @@ def _convert_pooling(insym, keras_layer, symtab):
                   'padding': [0, 0]}
         if keras_layer.padding == 'valid':
             pass
-        # we insert a separate pad operator
         elif keras_layer.padding == 'same':
             in_h = keras_layer.input_shape[1]
             in_w = keras_layer.input_shape[2]
             pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h)
             pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w)
-            insym = _sym.pad(data=insym, pad_width=(
-                (0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)))
+            params['padding'] = [pad_t, pad_l, pad_b, pad_r]
         else:
             raise TypeError("Unsupported padding type : {}".format(keras_layer.padding))
         if pool_type == 'MaxPooling2D':
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 5c27a3e38099..3e80c74399cc 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -38,7 +38,7 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
         out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
         return out.asnumpy()
 
-    xs = [np.random.uniform(size=shape) for shape in in_shapes]
+    xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
     for target, ctx in ctx_list():
         tvm_out = get_tvm_output([x.transpose([0,3,1,2]) for x in xs], target, ctx)
@@ -74,6 +74,18 @@ def test_forward_dense():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_pool():
+    data = keras.layers.Input(shape=(2,2,1))
+    # maxpool: the 3x3 window is larger than the 2x2 input, so 'same' pads every window
+    x = keras.layers.MaxPooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, x)
+    verify_keras_frontend(keras_model)
+    # avgpool: same window, stride and padding as the max-pool case, on the same input
+    y = keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(data)
+    keras_model = keras.models.Model(data, y)
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_transpose_conv():
     data = keras.layers.Input(shape=(32,32,3))
     x = keras.layers.Conv2D(filters=10, kernel_size=(3,3), strides=(2,2), padding='same')(data)
@@ -206,6 +218,7 @@ def test_forward_reuse_layers():
     test_forward_elemwise_add()
     test_forward_activations()
     test_forward_dense()
+    test_forward_pool()
     test_forward_transpose_conv()
     test_forward_separable_conv()
     test_forward_upsample()

From 00038567cba3e181338b28f27cda69a455f40107 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Wed, 22 Aug 2018 20:17:02 -0700
Subject: [PATCH 51/77] [TEAM] New reviewer: nishi-t (#1637)

---
 CONTRIBUTORS.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9db50b02b11a..faac3386f245 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -23,6 +23,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
 ## Reviewers
 - [Masahiro Masuda](https://github.com/masahi)
 - [Kazutaka Morita](https://github.com/kazum)
+- [Tatsuya Nishiyama](https://github.com/nishi-t)
 - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909)
 - [Siva](https://github.com/srkreddy1238)
 - [Alex Weaver](https://github.com/alex-weaver)
@@ -35,9 +36,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h
   - To contributors: please add your name to the list.
 - [Qiao Zhang](https://github.com/zhangqiaorjc)
 - [Jian Weng](https://github.com/were)
-- [Masahiro Masuda](https://github.com/masahi)
 - [Haolong Zhang](https://github.com/haolongzhangm)
 - [Cody Hao Yu](https://github.com/comaniac)
 - [Chris Nuernberger](https://github.com/cnuernber)
-- [Tatsuya Nishiyama](https://github.com/nishi-t)
-- [Kazutaka Morita](https://github.com/kazum)

From c98ba601180ba80cb7d6484fa8dfdddb4742a5cb Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Thu, 23 Aug 2018 08:47:36 +0530
Subject: [PATCH 52/77] [FRONTEND][COREML]More ops are added (#1619)

---
 nnvm/python/nnvm/frontend/coreml.py           |  40 +++++-
 .../python/frontend/coreml/test_forward.py    | 121 ++++++++++++++++++
 2 files changed, 160 insertions(+), 1 deletion(-)

diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py
index 3ca76bb0b20e..7dfd54317b55 100644
--- a/nnvm/python/nnvm/frontend/coreml.py
+++ b/nnvm/python/nnvm/frontend/coreml.py
@@ -269,6 +269,40 @@ def UpsampleLayerParams(op, insym, symtab):
 def L2NormalizeLayerParams(op, insym, symtab):
     return _sym.l2_normalize(insym, eps=op.epsilon, axis=1)
 
+def LRNLayerParams(op, insym, symtab):
+    """Convert a CoreML LRN layer into an nnvm lrn op over axis 1."""
+    lrn_attrs = dict(size=op.localSize,
+                     bias=op.k,
+                     alpha=op.alpha,
+                     beta=op.beta,
+                     axis=1)  # default layout is nchw
+    return _sym.lrn(data=insym, **lrn_attrs)
+
+def AverageLayerParams(op, insyms, symtab):
+    """Mean of two or more input symbols: broadcast_add them, divide by their count."""
+    if not (isinstance(insyms, list) and len(insyms) >= 2):
+        raise ValueError("Expect minimum 2 inputs")
+    total = insyms[0]
+    for term in insyms[1:]:
+        total = _sym.broadcast_add(total, term)
+    return total / len(insyms)
+
+def MaxLayerParams(op, insyms, symtab):
+    if not (isinstance(insyms, list) and len(insyms) >= 2):
+        raise ValueError("Expect minimum 2 inputs")
+    running = insyms[0]  # broadcast_max of every input folded in so far
+    for candidate in insyms[1:]:
+        running = _sym.broadcast_max(running, candidate)
+    return running
+
+def MinLayerParams(op, insyms, symtab):
+    if not (isinstance(insyms, list) and len(insyms) >= 2):
+        raise ValueError("Expect minimum 2 inputs")
+    running = insyms[0]  # broadcast_min of every input folded in so far
+    for candidate in insyms[1:]:
+        running = _sym.broadcast_min(running, candidate)
+    return running
+
 _convert_map = {
     'NeuralNetworkMeanImage': NeuralNetworkMeanImage,
     'NeuralNetworkImageScaler': NeuralNetworkImageScaler,
@@ -286,7 +320,11 @@ def L2NormalizeLayerParams(op, insym, symtab):
     'PaddingLayerParams':PaddingLayerParams,
     'PermuteLayerParams':PermuteLayerParams,
     'UpsampleLayerParams':UpsampleLayerParams,
-    'L2NormalizeLayerParams':L2NormalizeLayerParams
+    'L2NormalizeLayerParams':L2NormalizeLayerParams,
+    'LRNLayerParams':LRNLayerParams,
+    'AverageLayerParams':AverageLayerParams,
+    'MaxLayerParams':MaxLayerParams,
+    'MinLayerParams':MinLayerParams,
 }
 
 def coreml_op_to_nnvm(op, inname, outname, symtab):
diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py
index 27ae28c20ab9..a33a7c5a5ed7 100644
--- a/nnvm/tests/python/frontend/coreml/test_forward.py
+++ b/nnvm/tests/python/frontend/coreml/test_forward.py
@@ -223,6 +223,123 @@ def verify_l2_normalize(input_dim, eps):
 def test_forward_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001)
 
+def verify_lrn(input_dim, size, bias, alpha, beta):
+    dtype = "float32"
+    axis=1
+    a_np = np.random.uniform(size=input_dim).astype(dtype)
+    b_np = topi.testing.lrn_python(a_np, size, axis, bias, alpha, beta)  # reference result
+    # One-layer CoreML model holding a single LRN layer.
+    input = [('input', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(input, output)
+    builder.add_lrn(name='LRN',
+                    input_name='input',
+                    output_name='output',
+                    alpha=alpha,
+                    beta=beta,
+                    k=bias,
+                    local_size=size)
+    # Run the converted model and compare it with the reference.
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():  # NOTE(review): target/ctx are never used in the body
+        out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_lrn():
+    verify_lrn((1, 3, 10, 20), 3, 1.0, 1.0, 0.5)  # size=3, bias=1.0, alpha=1.0, beta=0.5
+
+def verify_average(input_dim1, input_dim2, axis=0):
+    dtype = 'float32'
+    # Two random inputs, one per shape argument.
+    a_np1 = np.random.uniform(size=input_dim1).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim2).astype(dtype)
+    # NOTE(review): with unequal shapes this hands numpy a ragged pair; confirm it is accepted.
+    b_np = np.mean((a_np1, a_np2), axis=axis)
+    # One-layer CoreML model applying add_elementwise in 'AVE' mode.
+    inputs = [('input1', datatypes.Array(*input_dim1)),
+              ('input2', datatypes.Array(*input_dim2))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='MEAN',
+                            input_names=['input1', 'input2'],
+                            output_name='output',
+                            mode='AVE')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():  # NOTE(review): target/ctx are never used in the body
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2],
+                           ['input1', 'input2'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_average():
+    verify_average((1, 3, 20, 20), (1, 3, 20, 20))  # equal shapes
+    verify_average((3, 20, 20), (1, 3, 20, 20))  # first input has one fewer leading axis
+    verify_average((20, 20), (1, 3, 20, 20))  # first input has two fewer leading axes
+
+def verify_max(input_dim):
+    dtype = 'float32'
+    # Three random inputs of identical shape.
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+    # Reference: maximum across the three inputs (axis 0 of the stacked tuple).
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+    # One-layer CoreML model applying add_elementwise in 'MAX' mode.
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Max',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MAX')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():  # NOTE(review): target/ctx are never used in the body
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))  # 4-d inputs
+    verify_max((20, 20))  # 2-d inputs
+
+def verify_min(input_dim):
+    dtype = 'float32'
+    # Three random inputs of identical shape.
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+    # Reference: minimum across the three inputs (axis 0 of the stacked tuple).
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+    # One-layer CoreML model applying add_elementwise in 'MIN' mode.
+    inputs = [('input1', datatypes.Array(*input_dim)),
+              ('input2', datatypes.Array(*input_dim)),
+              ('input3', datatypes.Array(*input_dim))]
+    output = [('output', datatypes.Array(*b_np.shape))]
+    builder = NeuralNetworkBuilder(inputs, output)
+    builder.add_elementwise(name='Min',
+                            input_names=['input1', 'input2', 'input3'],
+                            output_name='output',
+                            mode='MIN')
+    model = cm.models.MLModel(builder.spec)
+    for target, ctx in ctx_list():  # NOTE(review): target/ctx are never used in the body
+        out = run_tvm_graph(model,
+                           [a_np1, a_np2, a_np3],
+                           ['input1', 'input2', 'input3'],
+                           b_np.shape,
+                           dtype)
+        np.testing.assert_allclose(out, b_np, rtol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))  # 4-d inputs
+    verify_min((20, 20))  # 2-d inputs
+
 if __name__ == '__main__':
     test_mobilenet_checkonly()
     test_resnet50_checkonly()
@@ -231,3 +348,7 @@ def test_forward_l2_normalize():
     test_forward_MultiplyLayerParams()
     test_forward_UpsampleLayerParams()
     test_forward_l2_normalize()
+    test_forward_lrn()
+    test_forward_average()
+    test_forward_max()
+    test_forward_min()

From 56c50d2d072d976b058ba7e29a276e4e9b1e8839 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 20:19:43 -0700
Subject: [PATCH 53/77] trigger ci (#1620)

---
 topi/python/topi/x86/nn.py | 43 +++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 03e07222c420..6802d4c01e60 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -2,8 +2,9 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
+
 from .. import generic
-from .. import tag
+from ..util import traverse_inline
 
 @generic.schedule_softmax.register(["cpu"])
 def schedule_softmax(outs):
@@ -53,44 +54,38 @@ def schedule_dense(outs):
 
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
-    scheduled_ops = []
-
-    def traverse(op):
-        """Traverse operators from computation graph"""
-        # inline all one-to-one-mapping operators except the last stage (output)
-        if tag.is_broadcast(op.tag):
-            if op not in s.outputs:
-                s[op].compute_inline()
-            for tensor in op.input_tensors:
-                if tensor.op.input_tensors and tensor.op not in scheduled_ops:
-                    traverse(tensor.op)
 
+    def _callback(op):
         if 'dense' in op.tag:
-            C = op.output(0)
-            x, y = C.op.axis
+            output = outs[0]
+            dense = op.output(0)
 
             # Write cache for blocks
-            CC = s.cache_write(C, 'global')
+            if dense.op in s.outputs:
+                CC = s.cache_write(dense, 'local')
+            else:
+                CC = dense
 
             # Tile
             bnx = 1
             bny = 4
-            _, yo, _, yi = s[C].tile(x, y, bnx, bny)
-            s[CC].compute_at(s[C], yo)
+            x, y = output.op.axis
+            xo, yo, xi, yi = s[output].tile(x, y, bnx, bny)
+
             xc, yc = s[CC].op.axis
             k, = s[CC].op.reduce_axis
             ko, ki = s[CC].split(k, factor=4)
             s[CC].reorder(ko, xc, ki, yc)
+
             s[CC].unroll(ki)
             s[CC].vectorize(yc)
 
-            # Vectorization
-            s[C].vectorize(yi)
-
-            # Parallelization
-            s[C].parallel(yo)
+            s[output].unroll(xi)
+            s[output].vectorize(yi)
 
-        scheduled_ops.append(op)
+            fused = s[output].fuse(xo, yo)
+            s[output].parallel(fused)
+            s[CC].compute_at(s[output], fused)
 
-    traverse(outs[0].op)
+    traverse_inline(s, outs[0].op, _callback)
     return s

From cfafd212c093982daec5295f69cf288ff72c9b0f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 22 Aug 2018 20:21:15 -0700
Subject: [PATCH 54/77] [AUTOTVM] Simplify TopHub (#1630)

---
 apps/benchmark/arm_cpu_imagenet_bench.py      |  44 +++----
 apps/benchmark/util.py                        |  72 ++++++++++++
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/mobilenet_v2.py      |  51 ++++++++
 python/tvm/autotvm/tophub.py                  | 109 ++++++++----------
 python/tvm/exec/tophub.py                     |  37 ------
 python/tvm/rpc/server.py                      |   2 +-
 python/tvm/target.py                          |   5 -
 .../integration/test_benchmark_topi_conv2d.py |   2 +-
 vta/tutorials/resnet.py                       |   2 +-
 10 files changed, 190 insertions(+), 135 deletions(-)
 create mode 100644 apps/benchmark/util.py
 create mode 100644 nnvm/python/nnvm/testing/mobilenet_v2.py
 delete mode 100644 python/tvm/exec/tophub.py

diff --git a/apps/benchmark/arm_cpu_imagenet_bench.py b/apps/benchmark/arm_cpu_imagenet_bench.py
index 7baf244e0dae..f5057299920c 100644
--- a/apps/benchmark/arm_cpu_imagenet_bench.py
+++ b/apps/benchmark/arm_cpu_imagenet_bench.py
@@ -1,45 +1,26 @@
-"""Benchmark script for performance on ARM CPU.
+"""Benchmark script for ARM CPU.
 see README.md for the usage and results of this script.
 """
-
 import argparse
-import time
 
 import numpy as np
 
-import nnvm.testing
-import nnvm.compiler
 import tvm
-from tvm import autotvm
 from tvm.contrib.util import tempdir
 import tvm.contrib.graph_runtime as runtime
+import nnvm.compiler
+import nnvm.testing
 
-def get_network(name, batch_size):
-    """Get the symbol definition and random weight of a network"""
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-
-    if name == 'resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18,
-                                                       batch_size=batch_size, image_shape=(3, 224, 224))
-    elif name == 'mobilenet':
-        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name == 'squeezenet v1.1':
-        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size,
-                                                           version='1.1')
-    elif name == 'vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(batch_size=batch_size, num_layers=16)
-    else:
-        raise RuntimeError("Unsupported network: " + name)
-
-    return net, params, input_shape, output_shape
+from util import get_network, print_progress
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--network", type=str, choices=['resnet-18', 'mobilenet', 'squeezenet v1.1', 'vgg-16'])
-    parser.add_argument("--device", type=str, required=True, choices=['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro', 
-                                                                      'pixel2', 'rasp3b', 'pynq'])
+    parser.add_argument("--network", type=str, choices=
+                        ['resnet-18', 'resnet-34', 'vgg-16', 'mobilenet', 'squeezenet v1.1', ])
+    parser.add_argument("--device", type=str, required=True, choices=
+                        ['rk3399', 'mate10', 'mate10pro', 'p20', 'p20pro',
+                         'pixel2', 'rasp3b', 'pynq'])
     parser.add_argument("--host", type=str, default='localhost')
     parser.add_argument("--port", type=int, default=9190)
     parser.add_argument("--rpc-key", type=str, required=True)
@@ -49,7 +30,7 @@ def get_network(name, batch_size):
     dtype = 'float32'
 
     if args.network is None:
-        networks = ['squeezenet v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
+        networks = ['squeezenet_v1.1', 'mobilenet', 'resnet-18', 'vgg-16']
     else:
         networks = [args.network]
 
@@ -63,8 +44,10 @@ def get_network(name, batch_size):
     print("%-20s %-20s" % ("Network Name", "Mean Inference Time (std dev)"))
     print("--------------------------------------------------")
     for network in networks:
+        print_progress(network)
         net, params, input_shape, output_shape = get_network(network, batch_size=1)
 
+        print_progress("%-20s building..." % network)
         with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
             graph, lib, params = nnvm.compiler.build(
                 net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
@@ -79,6 +62,7 @@ def get_network(name, batch_size):
             lib.export_library(tmp.relpath(filename))
 
         # upload library and params
+        print_progress("%-20s uploading..." % network)
         ctx = remote.context(str(target), 0)
         remote.upload(tmp.relpath(filename))
         rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()}
@@ -90,7 +74,7 @@ def get_network(name, batch_size):
         module.set_input(**rparams)
 
         # evaluate
+        print_progress("%-20s evaluating..." % network)
         ftimer = module.module.time_evaluator("run", ctx, number=args.number, repeat=3)
         prof_res = np.array(ftimer().results) * 1000  # multiply 1000 for converting to millisecond
         print("%-20s %-19s (%s)" % (network, "%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))
-
diff --git a/apps/benchmark/util.py b/apps/benchmark/util.py
new file mode 100644
index 000000000000..bd4a3d04a1d1
--- /dev/null
+++ b/apps/benchmark/util.py
@@ -0,0 +1,72 @@
+"""Utility for benchmark"""
+
+import sys
+import nnvm
+
+def get_network(name, batch_size):
+    """Get the symbol definition and random weight of a network
+
+    Parameters
+    ----------
+    name: str
+        'resnet-N', 'vgg-N', 'mobilenet', 'squeezenet_vX', 'inception_v3', 'custom' or 'mxnet'
+    batch_size: int
+        The batch size; it is the leading dimension of both returned shapes
+
+    Returns
+    -------
+    net: nnvm.symbol
+        The NNVM symbol of network definition
+    params: dict
+        The random parameters for benchmark
+    input_shape: tuple
+        The shape of input tensor
+    output_shape: tuple
+        The shape of output tensor
+    """
+    input_shape = (batch_size, 3, 224, 224)
+    output_shape = (batch_size, 1000)
+
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
+        net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
+    elif "squeezenet" in name:
+        version = name.replace(" ", "_").split("_v")[1]  # 'squeezenet v1.1' works as well
+        net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version=version)
+    elif name == 'inception_v3':
+        input_shape = (batch_size, 3, 299, 299)  # keep the batch dimension; only H/W differ
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
+        # an example for custom network
+        from nnvm.testing import utils
+        net = nnvm.sym.Variable('data')
+        net = nnvm.sym.conv2d(net, channels=4, kernel_size=(3,3), padding=(1,1))
+        net = nnvm.sym.flatten(net)
+        net = nnvm.sym.dense(net, units=1000)
+        net, params = utils.create_workload(net, batch_size, (3, 224, 224))
+    elif name == 'mxnet':
+        # an example for mxnet model
+        from mxnet.gluon.model_zoo.vision import get_model
+        block = get_model('resnet18_v1', pretrained=True)
+        net, params = nnvm.frontend.from_mxnet(block)
+        net = nnvm.sym.softmax(net)
+    else:
+        raise ValueError("Unsupported network: " + name)
+
+    return net, params, input_shape, output_shape
+
+def print_progress(msg):
+    """Write a progress message to stdout without starting a new line
+
+    Parameters
+    ----------
+    msg: str
+        The message to print; a carriage return (not a newline) is appended to it
+    """
+    sys.stdout.write(msg + "\r")
+    sys.stdout.flush()
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index 4a879047ec7e..a04d2bc83587 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -4,6 +4,7 @@
 from .config import ctx_list
 from .utils import create_workload
 from . import mobilenet
+from . import mobilenet_v2
 from . import mlp
 from . import resnet
 from . import vgg
diff --git a/nnvm/python/nnvm/testing/mobilenet_v2.py b/nnvm/python/nnvm/testing/mobilenet_v2.py
new file mode 100644
index 000000000000..dc3c7cd85660
--- /dev/null
+++ b/nnvm/python/nnvm/testing/mobilenet_v2.py
@@ -0,0 +1,51 @@
+"""
+MobileNetV2, load model from gluon model zoo
+
+Reference:
+Inverted Residuals and Linear Bottlenecks:
+Mobile Networks for Classification, Detection and Segmentation
+https://arxiv.org/abs/1801.04381
+"""
+
+from .utils import create_workload
+from ..frontend.mxnet import _from_mxnet_impl
+
+def get_workload(batch_size, num_classes=1000, multiplier=1.0, dtype="float32"):
+    """Get benchmark workload for MobileNetV2
+
+    Parameters
+    ----------
+    batch_size : int
+        The batch size used in the model
+
+    num_classes : int, optional
+        Number of classes
+
+    multiplier : float, optional
+        The width multiplier passed to gluon's MobileNetV2 to control the model size
+
+    dtype : str, optional
+        The data type
+
+    Returns
+    -------
+    net : nnvm.Symbol
+        The computational graph
+
+    params : dict of str to NDArray
+        The parameters.
+    """
+    import mxnet as mx
+    from mxnet.gluon.model_zoo.vision.mobilenet import MobileNetV2
+
+    image_shape = (1, 3, 224, 224)
+
+    block = MobileNetV2(multiplier=multiplier, classes=num_classes)
+
+    data = mx.sym.Variable('data')
+    sym = block(data)
+    sym = mx.sym.SoftmaxOutput(sym)
+
+    net = _from_mxnet_impl(sym, {})
+
+    return create_workload(net, batch_size, image_shape[1:], dtype)
diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py
index 3d7b249df905..4982455038fc 100644
--- a/python/tvm/autotvm/tophub.py
+++ b/python/tvm/autotvm/tophub.py
@@ -8,16 +8,22 @@
 
 import logging
 import os
-import json
 import sys
 
 from .task import ApplyHistoryBest
 from .. import target as _target
-from ..contrib.util import tempdir
 from ..contrib.download import download
 
+# root path to store TopHub files
 AUTOTVM_TOPHUB_ROOT_PATH = os.path.join(os.path.expanduser('~'), ".tvm", "tophub")
 
+# the version of each package
+PACKAGE_VERSION = {
+    'vta':     "v0.01",
+    'arm_cpu': "v0.01",
+    'cuda':    "v0.01",
+}
+
 logger = logging.getLogger('autotvm')
 
 def _alias(name):
@@ -30,7 +36,8 @@ def _alias(name):
 
 def context(target, extra_files=None):
     """Return the dispatch context with pre-tuned parameters.
-    The corresponding downloaded *.log files under tophub root path will be loaded.
+    This function will load the corresponding *.log files in AUTOTVM_TOPHUB_ROOT_PATH.
+    If they cannot be found, it will download them from the TopHub GitHub repo.
     Users can also add their own files in argument `extra_files`.
 
     Parameters
@@ -40,21 +47,24 @@ def context(target, extra_files=None):
     extra_files: list of str, optional
         Extra log files to load
     """
-    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
     best_context = ApplyHistoryBest([])
 
     if isinstance(target, str):
         target = _target.create(target)
 
-    big_target = str(target).split()[0]
-    if os.path.isfile(os.path.join(rootpath, big_target + ".log")):
-        best_context.load(os.path.join(rootpath, big_target + ".log"))
-
+    possible_names = [str(target).split()[0]]
     for opt in target.options:
         if opt.startswith("-device"):
-            model = _alias(opt[8:])
-            if os.path.isfile(os.path.join(rootpath, model) + ".log"):
-                best_context.load(os.path.join(rootpath, model) + ".log")
+            device = _alias(opt[8:])
+            possible_names.append(device)
+
+    all_packages = list(PACKAGE_VERSION.keys())
+    for name in possible_names:
+        if name in all_packages:
+            check_backend(name)
+
+            filename = "%s_%s.log" % (name, PACKAGE_VERSION[name])
+            best_context.load(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, filename))
 
     if extra_files:
         for filename in extra_files:
@@ -63,42 +73,21 @@ def context(target, extra_files=None):
     return best_context
 
 
-def download_package(backend):
-    """Download pre-tuned parameters of operators for a backend
-
-    Parameters
-    ----------
-    backend: str
-        The name of package
-    """
-    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
-
-    if not os.path.isdir(rootpath):
-        # make directory
-        splits = os.path.split(rootpath)
-        for j in range(1, len(splits)+1):
-            path = os.path.join(*splits[:j])
-            if not os.path.isdir(path):
-                os.mkdir(path)
-
-    backend = _alias(backend)
-    logger.info("Download pre-tuned parameters for %s", backend)
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s.log" % backend,
-             os.path.join(rootpath, backend + ".log"), True, verbose=0)
-
-
-def check_package(backend):
+def check_backend(backend):
     """Check whether have pre-tuned parameters of the certain target.
     If not, will download it.
 
     Parameters
     ----------
     backend: str
-        The name of package
+        The name of backend.
     """
     backend = _alias(backend)
+    assert backend in PACKAGE_VERSION, 'Cannot find backend "%s" in TopHub' % backend
 
-    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")):
+    version = PACKAGE_VERSION[backend]
+    package_name = "%s_%s.log" % (backend, version)
+    if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, package_name)):
         return
 
     if sys.version_info >= (3,):
@@ -106,29 +95,29 @@ def check_package(backend):
     else:
         import urllib2
     try:
-        download_package(backend)
-    except urllib2.URLError:
-        logging.warning("Failed to download tophub package for %s", backend)
+        download_package(package_name)
+    except urllib2.URLError as e:
+        logging.warning("Failed to download tophub package for %s: %s", backend, e)
 
 
-def list_packages():
-    """List all available pre-tuned op parameters for targets
+def download_package(package_name):
+    """Download pre-tuned parameters of operators for a backend
 
-    Returns
-    -------
-    ret: List
-        All available packets
+    Parameters
+    ----------
+    package_name: str
+        The name of package
     """
-    path = tempdir()
-    filename = path.relpath("info.json")
-    logger.info("Download meta info for pre-tuned parameters")
-    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/info.json",
-             filename, True, verbose=0)
-
-    with open(filename, "r") as fin:
-        text = "".join(fin.readlines())
-    info = json.loads(text)
-    keys = list(info.keys())
-    keys.sort()
-
-    return [(k, info[k]) for k in keys]
+    rootpath = AUTOTVM_TOPHUB_ROOT_PATH
+
+    if not os.path.isdir(rootpath):
+        # make directory
+        splits = os.path.split(rootpath)
+        for j in range(1, len(splits)+1):
+            path = os.path.join(*splits[:j])
+            if not os.path.isdir(path):
+                os.mkdir(path)
+
+    logger.info("Download pre-tuned parameters package %s", package_name)
+    download("https://raw.githubusercontent.com/uwsaml/tvm-distro/master/tophub/%s"
+             % package_name, os.path.join(rootpath, package_name), True, verbose=0)
diff --git a/python/tvm/exec/tophub.py b/python/tvm/exec/tophub.py
deleted file mode 100644
index 9bfd6866506d..000000000000
--- a/python/tvm/exec/tophub.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# pylint: disable=invalid-name
-"""Download pre-tuned parameters of ops"""
-
-import argparse
-import logging
-
-from ..autotvm.tophub import list_packages, download_package
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--download", type=str, nargs='+',
-                        help="The targets to download. Use 'all' to download for all targets")
-    parser.add_argument("-l", "--list", action='store_true', help="List available packages")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.INFO)
-
-    if args.list:
-        info = list_packages()
-        print("\n%-20s %-20s" % ("Target", "Size"))
-        print("-" * 41)
-        for target, info in info:
-            print("%-20s %-20s" % (target, "%.2f MB" % (info['size']/1000000)))
-    elif args.download:
-        info = list_packages()
-        all_targets = [x[0] for x in info]
-        if 'all' in args.download:
-            targets = all_targets
-        else:
-            targets = args.download
-
-        for t in targets:
-            if t not in all_targets:
-                print("Warning : cannot find tuned parameters of " + t + ". (ignored)")
-            download_package(t)
-    else:
-        parser.print_help()
diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 792685b94a18..d65e21c794df 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -313,7 +313,7 @@ def __init__(self,
         self.use_popen = use_popen
 
         if silent:
-            logger.setLevel(logging.WARN)
+            logger.setLevel(logging.ERROR)
 
         if use_popen:
             cmd = [sys.executable,
diff --git a/python/tvm/target.py b/python/tvm/target.py
index 9d5200661c6c..40f9e099b3a6 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -425,8 +425,6 @@ def arm_cpu(model='unknown', options=None):
     options : str or list of str
         Additional options
     """
-    from . import autotvm
-
     trans_table = {
         "pixel2":    ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"],
         "mate10":    ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"],
@@ -439,9 +437,6 @@ def arm_cpu(model='unknown', options=None):
     }
     pre_defined_opt = trans_table.get(model, ["-model=%s" % model])
 
-    # download pre-tuned parameters for arm_cpu if there is not any.
-    autotvm.tophub.check_package('arm_cpu')
-
     opts = ["-device=arm_cpu"] + pre_defined_opt
     opts = _merge_opts(opts, options)
     return _api_internal._TargetCreate("llvm", *opts)
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index ca2451dec614..0661d292f4e5 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -128,7 +128,7 @@ def _run(env, remote):
                 run_cpu_conv2d(env, remote, key, batch_size, wl)
 
     # load pre-tuned operator parameters for ARM CPU
-    autotvm.tophub.check_package('vta')
+    autotvm.tophub.check_backend('vta')
     with autotvm.tophub.context('llvm -device=vtacpu'):
         vta.testing.run(_run)
 
diff --git a/vta/tutorials/resnet.py b/vta/tutorials/resnet.py
index 8d33a91d5691..8ca5eb7375b3 100644
--- a/vta/tutorials/resnet.py
+++ b/vta/tutorials/resnet.py
@@ -154,7 +154,7 @@ def generate_graph(graph_fn, params_fn, device="vta"):
 synset = eval(open(os.path.join(data_dir, categ_fn)).read())
 
 # Download pre-tuned op parameters of conv2d for ARM CPU used in VTA
-autotvm.tophub.check_package('vta')
+autotvm.tophub.check_backend('vta')
 
 
 ######################################################################

From b95b5958913927b90463dddb61eb18ef6e1556f6 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 23 Aug 2018 14:11:12 +0900
Subject: [PATCH 55/77] Remove leading "./" from include paths (#1640)

---
 include/tvm/api_registry.h                       |  6 +++---
 include/tvm/arithmetic.h                         |  2 +-
 include/tvm/attrs.h                              |  6 +++---
 include/tvm/base.h                               |  2 +-
 include/tvm/buffer.h                             |  4 ++--
 include/tvm/build_module.h                       |  6 +++---
 include/tvm/c_dsl_api.h                          |  2 +-
 include/tvm/codegen.h                            | 10 +++++-----
 include/tvm/expr.h                               |  4 ++--
 include/tvm/ir.h                                 |  6 +++---
 include/tvm/ir_functor_ext.h                     |  2 +-
 include/tvm/ir_mutator.h                         |  4 ++--
 include/tvm/ir_operator.h                        |  4 ++--
 include/tvm/ir_pass.h                            |  8 ++++----
 include/tvm/ir_visitor.h                         |  2 +-
 include/tvm/lowered_func.h                       |  6 +++---
 include/tvm/operation.h                          | 12 ++++++------
 include/tvm/packed_func_ext.h                    |  8 ++++----
 include/tvm/runtime/c_backend_api.h              |  2 +-
 include/tvm/runtime/device_api.h                 |  4 ++--
 include/tvm/runtime/module.h                     |  4 ++--
 include/tvm/runtime/ndarray.h                    |  4 ++--
 include/tvm/runtime/packed_func.h                |  6 +++---
 include/tvm/runtime/registry.h                   |  2 +-
 include/tvm/runtime/serializer.h                 |  4 ++--
 include/tvm/runtime/util.h                       |  2 +-
 include/tvm/schedule.h                           |  8 ++++----
 include/tvm/schedule_pass.h                      |  4 ++--
 include/tvm/target_info.h                        |  4 ++--
 include/tvm/tensor.h                             |  6 +++---
 include/tvm/tensor_intrin.h                      |  4 ++--
 include/tvm/tvm.h                                | 12 ++++++------
 nnvm/include/nnvm/graph.h                        |  6 +++---
 nnvm/include/nnvm/graph_attr_types.h             |  4 ++--
 nnvm/include/nnvm/node.h                         |  6 +++---
 nnvm/include/nnvm/op.h                           |  4 ++--
 nnvm/include/nnvm/op_attr_types.h                |  8 ++++----
 nnvm/include/nnvm/pass.h                         |  4 ++--
 nnvm/include/nnvm/pass_functions.h               |  6 +++---
 nnvm/include/nnvm/symbolic.h                     |  4 ++--
 nnvm/include/nnvm/top/nn.h                       |  2 +-
 nnvm/include/nnvm/tuple.h                        |  2 +-
 nnvm/src/c_api/c_api_error.cc                    |  2 +-
 nnvm/src/c_api/c_api_graph.cc                    |  2 +-
 nnvm/src/c_api/c_api_symbolic.cc                 |  2 +-
 nnvm/src/compiler/alter_op_layout.cc             |  4 ++--
 nnvm/src/compiler/compile_engine.cc              |  7 +++++--
 nnvm/src/compiler/compile_engine.h               |  2 +-
 nnvm/src/compiler/fold_scale_axis.cc             |  4 ++--
 nnvm/src/compiler/graph_fuse.cc                  |  7 ++++---
 nnvm/src/compiler/graph_hash.cc                  |  6 ++++--
 nnvm/src/compiler/graph_runtime.cc               |  2 +-
 nnvm/src/compiler/packed_func_ext.cc             |  2 +-
 nnvm/src/compiler/simplify_inference.cc          |  4 ++--
 nnvm/src/pass/plan_memory.cc                     |  2 +-
 nnvm/src/top/elemwise_op_common.h                |  2 +-
 nnvm/src/top/nn/convolution.cc                   |  2 +-
 nnvm/src/top/nn/nn.cc                            |  2 +-
 nnvm/src/top/nn/pooling.cc                       |  2 +-
 nnvm/src/top/nn/upsampling.cc                    |  2 +-
 src/arithmetic/canonical.cc                      |  8 ++++++--
 src/arithmetic/detect_linear_equation.cc         |  2 +-
 src/arithmetic/int_set.cc                        |  4 ++--
 src/arithmetic/modular.cc                        |  2 +-
 src/codegen/codegen_aocl.cc                      |  4 ++--
 src/codegen/codegen_c.cc                         |  2 +-
 src/codegen/codegen_c.h                          |  2 +-
 src/codegen/codegen_cuda.cc                      |  2 +-
 src/codegen/codegen_cuda.h                       |  2 +-
 src/codegen/codegen_metal.cc                     |  5 +++--
 src/codegen/codegen_metal.h                      |  2 +-
 src/codegen/codegen_opencl.cc                    |  4 ++--
 src/codegen/codegen_opencl.h                     |  2 +-
 src/codegen/codegen_opengl.cc                    |  4 ++--
 src/codegen/codegen_opengl.h                     |  2 +-
 src/codegen/codegen_source_base.cc               |  2 +-
 src/codegen/codegen_vhls.cc                      |  4 ++--
 src/codegen/codegen_vhls.h                       |  2 +-
 src/codegen/intrin_rule.cc                       |  2 +-
 src/codegen/intrin_rule_cuda.cc                  |  2 +-
 src/codegen/intrin_rule_metal.cc                 |  2 +-
 src/codegen/intrin_rule_opencl.cc                |  2 +-
 src/codegen/intrin_rule_opengl.cc                |  2 +-
 src/codegen/intrin_rule_vhls.cc                  |  2 +-
 src/codegen/llvm/codegen_amdgpu.cc               |  2 +-
 src/codegen/llvm/codegen_arm.cc                  |  2 +-
 src/codegen/llvm/codegen_cpu.cc                  |  2 +-
 src/codegen/llvm/codegen_cpu.h                   |  2 +-
 src/codegen/llvm/codegen_llvm.cc                 |  4 ++--
 src/codegen/llvm/codegen_llvm.h                  |  2 +-
 src/codegen/llvm/codegen_nvptx.cc                |  2 +-
 src/codegen/llvm/intrin_rule_llvm.cc             |  2 +-
 src/codegen/llvm/intrin_rule_llvm.h              |  2 +-
 src/codegen/llvm/intrin_rule_rocm.cc             |  2 +-
 src/codegen/llvm/llvm_common.cc                  |  2 +-
 src/codegen/llvm/llvm_module.cc                  |  4 ++--
 src/codegen/source_module.cc                     |  2 +-
 src/codegen/spirv/build_vulkan.cc                |  2 +-
 src/codegen/spirv/codegen_spirv.cc               |  3 ++-
 src/codegen/spirv/codegen_spirv.h                |  2 +-
 src/codegen/spirv/ir_builder.cc                  |  2 +-
 src/codegen/stackvm/codegen_stackvm.cc           |  2 +-
 src/codegen/verilog/codegen_verilog.cc           |  2 +-
 src/codegen/verilog/codegen_verilog.h            |  2 +-
 src/codegen/verilog/verilog_ir.cc                |  3 ++-
 src/codegen/verilog/verilog_module.cc            |  2 +-
 src/codegen/verilog/vpi_device_api.cc            |  2 +-
 src/codegen/verilog/vpi_session.cc               |  2 +-
 src/contrib/nnpack/convolution.cc                |  2 +-
 src/contrib/nnpack/fully_connected.cc            |  2 +-
 src/contrib/nnpack/nnpack_utils.cc               |  2 +-
 src/contrib/random/random.cc                     |  4 ++--
 src/op/compute_op.cc                             |  5 +++--
 src/op/cross_thread_reduction.cc                 |  4 ++--
 src/op/extern_op.cc                              |  2 +-
 src/op/op_util.cc                                |  3 ++-
 src/op/scan_op.cc                                |  2 +-
 src/op/tensorize.cc                              |  4 ++--
 src/pass/arg_binder.cc                           |  4 ++--
 src/pass/coproc_sync.cc                          |  4 ++--
 src/pass/inject_double_buffer.cc                 |  2 +-
 src/pass/ir_mutator.cc                           |  2 +-
 src/pass/ir_util.cc                              |  2 +-
 src/pass/lift_attr_scope.cc                      |  2 +-
 src/pass/lower_intrin.cc                         |  2 +-
 src/pass/lower_thread_allreduce.cc               |  2 +-
 src/pass/lower_tvm_builtin.cc                    |  2 +-
 src/pass/lower_warp_memory.cc                    |  2 +-
 src/pass/make_api.cc                             |  4 ++--
 src/pass/narrow_channel_access.cc                |  2 +-
 src/pass/split_pipeline.cc                       |  2 +-
 src/pass/storage_access.cc                       |  5 +++--
 src/pass/storage_flatten.cc                      |  4 ++--
 src/pass/storage_rewrite.cc                      |  2 +-
 src/pass/storage_sync.cc                         |  4 ++--
 src/runtime/c_dsl_api.cc                         |  4 ++--
 src/runtime/c_runtime_api.cc                     |  2 +-
 src/runtime/cpu_device_api.cc                    |  2 +-
 src/runtime/cuda/cuda_device_api.cc              |  2 +-
 src/runtime/cuda/cuda_module.cc                  |  4 ++--
 src/runtime/dso_module.cc                        |  2 +-
 src/runtime/file_util.cc                         |  3 ++-
 src/runtime/file_util.h                          |  2 +-
 src/runtime/graph/graph_runtime.cc               |  5 ++++-
 src/runtime/meta_data.h                          |  2 +-
 src/runtime/metal/metal_device_api.mm            |  2 +-
 src/runtime/metal/metal_module.mm                |  4 ++--
 src/runtime/module.cc                            |  2 +-
 src/runtime/module_util.cc                       |  3 ++-
 src/runtime/ndarray.cc                           |  2 +-
 src/runtime/opencl/aocl/aocl_device_api.cc       |  2 +-
 src/runtime/opencl/aocl/aocl_module.cc           |  4 ++--
 src/runtime/opencl/opencl_device_api.cc          |  2 +-
 src/runtime/opencl/opencl_module.cc              |  4 ++--
 src/runtime/opencl/sdaccel/sdaccel_device_api.cc |  2 +-
 src/runtime/opencl/sdaccel/sdaccel_module.cc     |  4 ++--
 src/runtime/opengl/opengl_device_api.cc          |  4 ++--
 src/runtime/opengl/opengl_module.cc              |  4 ++--
 src/runtime/registry.cc                          |  2 +-
 src/runtime/rocm/rocm_device_api.cc              |  2 +-
 src/runtime/rocm/rocm_module.cc                  |  4 ++--
 src/runtime/rpc/rpc_device_api.cc                |  2 +-
 src/runtime/rpc/rpc_event_impl.cc                |  2 +-
 src/runtime/rpc/rpc_module.cc                    |  2 +-
 src/runtime/rpc/rpc_session.cc                   |  4 +++-
 src/runtime/rpc/rpc_socket_impl.cc               |  2 +-
 src/runtime/sgx/trusted/runtime.cc               |  6 +++---
 src/runtime/stackvm/stackvm.cc                   |  2 +-
 src/runtime/stackvm/stackvm_module.cc            |  2 +-
 src/runtime/stackvm/stackvm_module.h             |  2 +-
 src/runtime/system_lib_module.cc                 |  2 +-
 src/runtime/vulkan/vulkan_device_api.cc          |  2 +-
 src/runtime/vulkan/vulkan_module.cc              |  4 ++--
 src/runtime/workspace_pool.cc                    |  2 +-
 src/schedule/bound.cc                            |  4 ++--
 src/schedule/graph.cc                            |  2 +-
 src/schedule/message_passing.cc                  |  2 +-
 src/schedule/schedule_dataflow_rewrite.cc        |  2 +-
 src/schedule/schedule_lang.cc                    |  2 +-
 src/schedule/schedule_ops.cc                     |  2 +-
 verilog/tvm_vpi.cc                               |  4 +++-
 vta/hardware/xilinx/src/vta.cc                   |  2 +-
 vta/include/vta/runtime.h                        |  2 +-
 vta/src/pynq/pynq_driver.cc                      |  2 +-
 vta/tests/hardware/common/test_lib.cc            |  2 +-
 185 files changed, 310 insertions(+), 285 deletions(-)

diff --git a/include/tvm/api_registry.h b/include/tvm/api_registry.h
index 93bff2762481..e12ef423ed32 100644
--- a/include/tvm/api_registry.h
+++ b/include/tvm/api_registry.h
@@ -7,9 +7,9 @@
 #ifndef TVM_API_REGISTRY_H_
 #define TVM_API_REGISTRY_H_
 
-#include "./base.h"
-#include "./packed_func_ext.h"
-#include "./runtime/registry.h"
+#include "base.h"
+#include "packed_func_ext.h"
+#include "runtime/registry.h"
 
 /*!
  * \brief Register an API function globally.
diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h
index 6a3c395fd404..54875bbbf474 100644
--- a/include/tvm/arithmetic.h
+++ b/include/tvm/arithmetic.h
@@ -9,7 +9,7 @@
 #include <vector>
 #include <unordered_map>
 #include <memory>
-#include "./expr.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h
index 3e5169ba02b8..7cd77a92d0dd 100644
--- a/include/tvm/attrs.h
+++ b/include/tvm/attrs.h
@@ -31,9 +31,9 @@
 #include <vector>
 #include <type_traits>
 #include <string>
-#include "./ir.h"
-#include "./base.h"
-#include "./packed_func_ext.h"
+#include "ir.h"
+#include "base.h"
+#include "packed_func_ext.h"
 
 namespace tvm {
 /*!
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 1d7cf8add3ca..2c5c5ad54875 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -12,7 +12,7 @@
 #include <string>
 #include <memory>
 #include <functional>
-#include "./runtime/registry.h"
+#include "runtime/registry.h"
 
 namespace tvm {
 
diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h
index 41fa1fa804a8..0f591299718e 100644
--- a/include/tvm/buffer.h
+++ b/include/tvm/buffer.h
@@ -9,8 +9,8 @@
 #include <tvm/container.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/build_module.h b/include/tvm/build_module.h
index 96b876fe92f0..5dc832041410 100644
--- a/include/tvm/build_module.h
+++ b/include/tvm/build_module.h
@@ -9,9 +9,9 @@
 #include <string>
 #include <vector>
 #include <utility>
-#include "./runtime/packed_func.h"
-#include "./schedule_pass.h"
-#include "./lowered_func.h"
+#include "runtime/packed_func.h"
+#include "schedule_pass.h"
+#include "lowered_func.h"
 
 namespace tvm {
 using namespace tvm::runtime;
diff --git a/include/tvm/c_dsl_api.h b/include/tvm/c_dsl_api.h
index 6f15ef9a3e80..027a3952d9d4 100644
--- a/include/tvm/c_dsl_api.h
+++ b/include/tvm/c_dsl_api.h
@@ -14,7 +14,7 @@
 #ifndef TVM_C_DSL_API_H_
 #define TVM_C_DSL_API_H_
 
-#include "./runtime/c_runtime_api.h"
+#include "runtime/c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/tvm/codegen.h b/include/tvm/codegen.h
index 6b5116a143cc..fca88de6a238 100644
--- a/include/tvm/codegen.h
+++ b/include/tvm/codegen.h
@@ -7,11 +7,11 @@
 #define TVM_CODEGEN_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./lowered_func.h"
-#include "./api_registry.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "lowered_func.h"
+#include "api_registry.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 /*! \brief namespace for lowlevel IR pass and codegen */
diff --git a/include/tvm/expr.h b/include/tvm/expr.h
index 8c789f8df1dc..64a112d05518 100644
--- a/include/tvm/expr.h
+++ b/include/tvm/expr.h
@@ -11,8 +11,8 @@
 #include <ir/IRPrinter.h>
 #include <string>
 #include <algorithm>
-#include "./base.h"
-#include "./runtime/c_runtime_api.h"
+#include "base.h"
+#include "runtime/c_runtime_api.h"
 
 namespace tvm {
 
diff --git a/include/tvm/ir.h b/include/tvm/ir.h
index 646824332902..f73533439dba 100644
--- a/include/tvm/ir.h
+++ b/include/tvm/ir.h
@@ -10,9 +10,9 @@
 #include <ir/IR.h>
 #include <type_traits>
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./runtime/util.h"
+#include "base.h"
+#include "expr.h"
+#include "runtime/util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_functor_ext.h b/include/tvm/ir_functor_ext.h
index 3784608c8da1..a9845fdfc898 100644
--- a/include/tvm/ir_functor_ext.h
+++ b/include/tvm/ir_functor_ext.h
@@ -7,7 +7,7 @@
 #define TVM_IR_FUNCTOR_EXT_H_
 
 #include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_mutator.h b/include/tvm/ir_mutator.h
index b8aae3638149..35c82e9f16c1 100644
--- a/include/tvm/ir_mutator.h
+++ b/include/tvm/ir_mutator.h
@@ -8,8 +8,8 @@
 
 #include <tvm/ir_functor.h>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir.h"
+#include "expr.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_operator.h b/include/tvm/ir_operator.h
index 947c3b736d80..e809b06e49b5 100644
--- a/include/tvm/ir_operator.h
+++ b/include/tvm/ir_operator.h
@@ -7,8 +7,8 @@
 #define TVM_IR_OPERATOR_H_
 
 #include <algorithm>
-#include "./expr.h"
-#include "./ir.h"
+#include "expr.h"
+#include "ir.h"
 
 namespace tvm {
 
diff --git a/include/tvm/ir_pass.h b/include/tvm/ir_pass.h
index d875621a3f5e..cf20dfa1e9f3 100644
--- a/include/tvm/ir_pass.h
+++ b/include/tvm/ir_pass.h
@@ -14,10 +14,10 @@
 #include <unordered_map>
 #include <vector>
 #include <string>
-#include "./expr.h"
-#include "./buffer.h"
-#include "./schedule.h"
-#include "./lowered_func.h"
+#include "expr.h"
+#include "buffer.h"
+#include "schedule.h"
+#include "lowered_func.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/ir_visitor.h b/include/tvm/ir_visitor.h
index 8919b0f7a5c2..4b2887b28885 100644
--- a/include/tvm/ir_visitor.h
+++ b/include/tvm/ir_visitor.h
@@ -7,7 +7,7 @@
 #define TVM_IR_VISITOR_H_
 
 #include <tvm/ir_functor.h>
-#include "./ir.h"
+#include "ir.h"
 
 namespace tvm {
 namespace ir {
diff --git a/include/tvm/lowered_func.h b/include/tvm/lowered_func.h
index 19f7e27f1c75..acb9813339f8 100644
--- a/include/tvm/lowered_func.h
+++ b/include/tvm/lowered_func.h
@@ -11,9 +11,9 @@
 #include <ir/FunctionBase.h>
 #include <string>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
 
 namespace tvm {
 
diff --git a/include/tvm/operation.h b/include/tvm/operation.h
index ed8be6e4a7c0..c11242c0a55d 100644
--- a/include/tvm/operation.h
+++ b/include/tvm/operation.h
@@ -9,12 +9,12 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./schedule.h"
-#include "./arithmetic.h"
-#include "./buffer.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "schedule.h"
+#include "arithmetic.h"
+#include "buffer.h"
 
 namespace tvm {
 
diff --git a/include/tvm/packed_func_ext.h b/include/tvm/packed_func_ext.h
index 95964547ef8e..78351e094e69 100644
--- a/include/tvm/packed_func_ext.h
+++ b/include/tvm/packed_func_ext.h
@@ -12,10 +12,10 @@
 #include <memory>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./runtime/packed_func.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "runtime/packed_func.h"
 
 namespace tvm {
 using runtime::TVMArgs;
diff --git a/include/tvm/runtime/c_backend_api.h b/include/tvm/runtime/c_backend_api.h
index 60e284610494..f55748e38289 100644
--- a/include/tvm/runtime/c_backend_api.h
+++ b/include/tvm/runtime/c_backend_api.h
@@ -10,7 +10,7 @@
 #ifndef TVM_RUNTIME_C_BACKEND_API_H_
 #define TVM_RUNTIME_C_BACKEND_API_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 3458c143e662..0b91deafd9c0 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -7,8 +7,8 @@
 #define TVM_RUNTIME_DEVICE_API_H_
 
 #include <string>
-#include "./packed_func.h"
-#include "./c_runtime_api.h"
+#include "packed_func.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index de0b02500b6d..3a98820b76f3 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -13,7 +13,7 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
@@ -173,5 +173,5 @@ inline const ModuleNode* Module::operator->() const {
 }  // namespace runtime
 }  // namespace tvm
 
-#include "./packed_func.h"
+#include "packed_func.h"
 #endif  // TVM_RUNTIME_MODULE_H_
diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index c288ce5f3adb..0b7c3b49ccac 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -9,8 +9,8 @@
 #include <atomic>
 #include <vector>
 #include <utility>
-#include "./c_runtime_api.h"
-#include "./serializer.h"
+#include "c_runtime_api.h"
+#include "serializer.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 758d03b5b18b..1c873a5ebccc 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -14,9 +14,9 @@
 #include <limits>
 #include <memory>
 #include <type_traits>
-#include "./c_runtime_api.h"
-#include "./module.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "module.h"
+#include "ndarray.h"
 
 namespace HalideIR {
 // Forward declare type for extensions
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index 2a328c8086e0..c10a03e0604a 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -27,7 +27,7 @@
 
 #include <string>
 #include <vector>
-#include "./packed_func.h"
+#include "packed_func.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/runtime/serializer.h b/include/tvm/runtime/serializer.h
index b2ab5483a22d..e9a7d1db50ec 100644
--- a/include/tvm/runtime/serializer.h
+++ b/include/tvm/runtime/serializer.h
@@ -9,8 +9,8 @@
 
 #include <dmlc/io.h>
 #include <dmlc/serializer.h>
-#include "./c_runtime_api.h"
-#include "./ndarray.h"
+#include "c_runtime_api.h"
+#include "ndarray.h"
 
 namespace dmlc {
 namespace serializer {
diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h
index 7fa62be912be..5fd130737158 100644
--- a/include/tvm/runtime/util.h
+++ b/include/tvm/runtime/util.h
@@ -6,7 +6,7 @@
 #ifndef TVM_RUNTIME_UTIL_H_
 #define TVM_RUNTIME_UTIL_H_
 
-#include "./c_runtime_api.h"
+#include "c_runtime_api.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/include/tvm/schedule.h b/include/tvm/schedule.h
index deaf74ccf222..b72eb7105faa 100644
--- a/include/tvm/schedule.h
+++ b/include/tvm/schedule.h
@@ -7,10 +7,10 @@
 #define TVM_SCHEDULE_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
-#include "./tensor.h"
-#include "./tensor_intrin.h"
+#include "base.h"
+#include "expr.h"
+#include "tensor.h"
+#include "tensor_intrin.h"
 
 namespace tvm {
 
diff --git a/include/tvm/schedule_pass.h b/include/tvm/schedule_pass.h
index cd248f8b9b96..e2b4462b8d73 100644
--- a/include/tvm/schedule_pass.h
+++ b/include/tvm/schedule_pass.h
@@ -10,8 +10,8 @@
 #ifndef TVM_SCHEDULE_PASS_H_
 #define TVM_SCHEDULE_PASS_H_
 
-#include "./base.h"
-#include "./schedule.h"
+#include "base.h"
+#include "schedule.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/include/tvm/target_info.h b/include/tvm/target_info.h
index 8569f188a4ab..338749cf832e 100644
--- a/include/tvm/target_info.h
+++ b/include/tvm/target_info.h
@@ -7,8 +7,8 @@
 #define TVM_TARGET_INFO_H_
 
 #include <string>
-#include "./base.h"
-#include "./expr.h"
+#include "base.h"
+#include "expr.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h
index 1a6338d9058c..ddccfce2fefb 100644
--- a/include/tvm/tensor.h
+++ b/include/tvm/tensor.h
@@ -12,9 +12,9 @@
 #include <vector>
 #include <type_traits>
 
-#include "./base.h"
-#include "./expr.h"
-#include "./arithmetic.h"
+#include "base.h"
+#include "expr.h"
+#include "arithmetic.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tensor_intrin.h b/include/tvm/tensor_intrin.h
index bd3fd11021b4..fa8c895ccb08 100644
--- a/include/tvm/tensor_intrin.h
+++ b/include/tvm/tensor_intrin.h
@@ -7,8 +7,8 @@
 #define TVM_TENSOR_INTRIN_H_
 
 #include <string>
-#include "./tensor.h"
-#include "./buffer.h"
+#include "tensor.h"
+#include "buffer.h"
 
 namespace tvm {
 
diff --git a/include/tvm/tvm.h b/include/tvm/tvm.h
index 7e9c4305ffbb..645c68357f13 100644
--- a/include/tvm/tvm.h
+++ b/include/tvm/tvm.h
@@ -6,11 +6,11 @@
 #ifndef TVM_TVM_H_
 #define TVM_TVM_H_
 
-#include "./base.h"
-#include "./expr.h"
-#include "./ir_operator.h"
-#include "./tensor.h"
-#include "./operation.h"
-#include "./packed_func_ext.h"
+#include "base.h"
+#include "expr.h"
+#include "ir_operator.h"
+#include "tensor.h"
+#include "operation.h"
+#include "packed_func_ext.h"
 
 #endif  // TVM_TVM_H_
diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h
index 1d3b662ff0b8..3f8a2a3642b1 100644
--- a/nnvm/include/nnvm/graph.h
+++ b/nnvm/include/nnvm/graph.h
@@ -12,9 +12,9 @@
 #include <algorithm>
 #include <unordered_map>
 #include <unordered_set>
-#include "./base.h"
-#include "./node.h"
-#include "./symbolic.h"
+#include "base.h"
+#include "node.h"
+#include "symbolic.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/graph_attr_types.h b/nnvm/include/nnvm/graph_attr_types.h
index 2bd998fedfbb..2fe82c9a7de0 100644
--- a/nnvm/include/nnvm/graph_attr_types.h
+++ b/nnvm/include/nnvm/graph_attr_types.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <string>
-#include "./tuple.h"
-#include "./layout.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/node.h b/nnvm/include/nnvm/node.h
index 57afb0c5587a..ae782f04965e 100644
--- a/nnvm/include/nnvm/node.h
+++ b/nnvm/include/nnvm/node.h
@@ -10,9 +10,9 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./base.h"
-#include "./op.h"
-#include "./c_api.h"
+#include "base.h"
+#include "op.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h
index 5bdfcaca169d..9d171bbdb2bc 100644
--- a/nnvm/include/nnvm/op.h
+++ b/nnvm/include/nnvm/op.h
@@ -13,8 +13,8 @@
 #include <typeinfo>
 #include <limits>
 #include <functional>
-#include "./base.h"
-#include "./c_api.h"
+#include "base.h"
+#include "c_api.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/op_attr_types.h b/nnvm/include/nnvm/op_attr_types.h
index b7f6be408a16..abed19f9bc7d 100644
--- a/nnvm/include/nnvm/op_attr_types.h
+++ b/nnvm/include/nnvm/op_attr_types.h
@@ -10,10 +10,10 @@
 #include <string>
 #include <utility>
 #include <functional>
-#include "./base.h"
-#include "./node.h"
-#include "./tuple.h"
-#include "./layout.h"
+#include "base.h"
+#include "node.h"
+#include "tuple.h"
+#include "layout.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass.h b/nnvm/include/nnvm/pass.h
index 016d5ee2a763..2e8db6111887 100644
--- a/nnvm/include/nnvm/pass.h
+++ b/nnvm/include/nnvm/pass.h
@@ -8,8 +8,8 @@
 
 #include <vector>
 #include <functional>
-#include "./base.h"
-#include "./graph.h"
+#include "base.h"
+#include "graph.h"
 
 namespace nnvm {
 
diff --git a/nnvm/include/nnvm/pass_functions.h b/nnvm/include/nnvm/pass_functions.h
index 4c29e09d813a..5a98dd456fb2 100644
--- a/nnvm/include/nnvm/pass_functions.h
+++ b/nnvm/include/nnvm/pass_functions.h
@@ -13,9 +13,9 @@
 #include <string>
 #include <memory>
 #include <vector>
-#include "./base.h"
-#include "./pass.h"
-#include "./graph_attr_types.h"
+#include "base.h"
+#include "pass.h"
+#include "graph_attr_types.h"
 
 namespace nnvm {
 namespace pass {
diff --git a/nnvm/include/nnvm/symbolic.h b/nnvm/include/nnvm/symbolic.h
index ebb2ab5d30d0..42cf5dd775c2 100644
--- a/nnvm/include/nnvm/symbolic.h
+++ b/nnvm/include/nnvm/symbolic.h
@@ -15,8 +15,8 @@
 #include <tuple>
 #include <utility>
 
-#include "./base.h"
-#include "./node.h"
+#include "base.h"
+#include "node.h"
 
 namespace nnvm {
 /*!
diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h
index 865024733494..143a9548f18a 100644
--- a/nnvm/include/nnvm/top/nn.h
+++ b/nnvm/include/nnvm/top/nn.h
@@ -11,7 +11,7 @@
 #include <nnvm/tuple.h>
 #include <nnvm/layout.h>
 #include <string>
-#include "./tensor.h"
+#include "tensor.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h
index 7e83aecc11f0..36b8ef13c74a 100644
--- a/nnvm/include/nnvm/tuple.h
+++ b/nnvm/include/nnvm/tuple.h
@@ -12,7 +12,7 @@
 #include <utility>
 #include <iostream>
 #include <string>
-#include "./base.h"
+#include "base.h"
 
 namespace nnvm {
 
diff --git a/nnvm/src/c_api/c_api_error.cc b/nnvm/src/c_api/c_api_error.cc
index 399268667ddd..fd91bfb8b306 100644
--- a/nnvm/src/c_api/c_api_error.cc
+++ b/nnvm/src/c_api/c_api_error.cc
@@ -4,7 +4,7 @@
  * \brief C error handling
  */
 #include <dmlc/thread_local.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 struct ErrorEntry {
   std::string last_error;
diff --git a/nnvm/src/c_api/c_api_graph.cc b/nnvm/src/c_api/c_api_graph.cc
index 831aaec33e8c..a0e84aef4482 100644
--- a/nnvm/src/c_api/c_api_graph.cc
+++ b/nnvm/src/c_api/c_api_graph.cc
@@ -9,7 +9,7 @@
 #include <nnvm/graph.h>
 #include <nnvm/pass.h>
 #include <dmlc/json.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/c_api/c_api_symbolic.cc b/nnvm/src/c_api/c_api_symbolic.cc
index 9f62dbd80b0c..e175cfc7da25 100644
--- a/nnvm/src/c_api/c_api_symbolic.cc
+++ b/nnvm/src/c_api/c_api_symbolic.cc
@@ -6,7 +6,7 @@
 #include <nnvm/c_api.h>
 #include <nnvm/op.h>
 #include <nnvm/symbolic.h>
-#include "./c_api_common.h"
+#include "c_api_common.h"
 
 using namespace nnvm;
 
diff --git a/nnvm/src/compiler/alter_op_layout.cc b/nnvm/src/compiler/alter_op_layout.cc
index bf28df3d04f8..b02655fc8925 100644
--- a/nnvm/src/compiler/alter_op_layout.cc
+++ b/nnvm/src/compiler/alter_op_layout.cc
@@ -12,8 +12,8 @@
 #include <tvm/tvm.h>
 #include <algorithm>
 #include <functional>
-#include "./compile_engine.h"
-#include "./graph_transform.h"
+#include "compile_engine.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc
index a9d4aa2d016a..b9b27621840c 100644
--- a/nnvm/src/compiler/compile_engine.cc
+++ b/nnvm/src/compiler/compile_engine.cc
@@ -11,8 +11,11 @@
 #include <nnvm/pass_functions.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <mutex>
-#include "./graph_hash.h"
-#include "./compile_engine.h"
+#include <tuple>
+#include <vector>
+#include <limits>
+#include "graph_hash.h"
+#include "compile_engine.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/compile_engine.h b/nnvm/src/compiler/compile_engine.h
index d84fe2facbd3..7696b3b5f4eb 100644
--- a/nnvm/src/compiler/compile_engine.h
+++ b/nnvm/src/compiler/compile_engine.h
@@ -18,7 +18,7 @@
 #include <tvm/lowered_func.h>
 #include <string>
 #include <utility>
-#include "./graph_hash.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/fold_scale_axis.cc b/nnvm/src/compiler/fold_scale_axis.cc
index e38082b69916..639aba602589 100644
--- a/nnvm/src/compiler/fold_scale_axis.cc
+++ b/nnvm/src/compiler/fold_scale_axis.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./pattern_util.h"
-#include "./graph_transform.h"
+#include "pattern_util.h"
+#include "graph_transform.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc
index 4999d93d1861..c9ea58affb2c 100644
--- a/nnvm/src/compiler/graph_fuse.cc
+++ b/nnvm/src/compiler/graph_fuse.cc
@@ -14,10 +14,11 @@
 #include <nnvm/tuple.h>
 #include <tvm/lowered_func.h>
 #include <tvm/runtime/packed_func.h>
+#include <limits>
 
-#include "./graph_fuse.h"
-#include "./graph_runtime.h"
-#include "./pattern_util.h"
+#include "graph_fuse.h"
+#include "graph_runtime.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc
index ccd2e3ce433f..ca68727ea067 100644
--- a/nnvm/src/compiler/graph_hash.cc
+++ b/nnvm/src/compiler/graph_hash.cc
@@ -10,8 +10,10 @@
 #include <tvm/ir.h>
 #include <tvm/runtime/packed_func.h>
 #include <functional>
-#include "./node_attr.h"
-#include "./graph_hash.h"
+#include <vector>
+#include <algorithm>
+#include "node_attr.h"
+#include "graph_hash.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index 7301fd74117e..bc4a803681e4 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -4,7 +4,7 @@
  * \brief Interface code with TVM graph runtime.
 */
 #include <dmlc/memory_io.h>
-#include "./graph_runtime.h"
+#include "graph_runtime.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc
index d549f9e2004f..64846fc8e247 100644
--- a/nnvm/src/compiler/packed_func_ext.cc
+++ b/nnvm/src/compiler/packed_func_ext.cc
@@ -9,7 +9,7 @@
 #include <nnvm/compiler/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./node_attr.h"
+#include "node_attr.h"
 #include "compile_engine.h"
 
 namespace tvm {
diff --git a/nnvm/src/compiler/simplify_inference.cc b/nnvm/src/compiler/simplify_inference.cc
index a0782222aa06..bf00bcb5a894 100644
--- a/nnvm/src/compiler/simplify_inference.cc
+++ b/nnvm/src/compiler/simplify_inference.cc
@@ -9,8 +9,8 @@
 #include <nnvm/pass.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./graph_transform.h"
-#include "./pattern_util.h"
+#include "graph_transform.h"
+#include "pattern_util.h"
 
 namespace nnvm {
 namespace compiler {
diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc
index 51448bcf1065..6c2fc0d087ea 100644
--- a/nnvm/src/pass/plan_memory.cc
+++ b/nnvm/src/pass/plan_memory.cc
@@ -8,7 +8,7 @@
 #include <nnvm/graph_attr_types.h>
 #include <nnvm/op_attr_types.h>
 #include <memory>
-#include "./graph_algorithm.h"
+#include "graph_algorithm.h"
 
 namespace nnvm {
 namespace pass {
diff --git a/nnvm/src/top/elemwise_op_common.h b/nnvm/src/top/elemwise_op_common.h
index e5bb0adcb078..ad8fc3d54ba8 100644
--- a/nnvm/src/top/elemwise_op_common.h
+++ b/nnvm/src/top/elemwise_op_common.h
@@ -12,7 +12,7 @@
 #include <vector>
 #include <utility>
 #include <functional>
-#include "./op_common.h"
+#include "op_common.h"
 
 namespace nnvm {
 namespace top {
diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc
index 229d4ac30f78..d5c9c18f68a6 100644
--- a/nnvm/src/top/nn/convolution.cc
+++ b/nnvm/src/top/nn/convolution.cc
@@ -12,7 +12,7 @@
 #include <tvm/packed_func_ext.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <tvm/tvm.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn.h"
diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index 322d77b6d032..0b5a11fdd096 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -12,7 +12,7 @@
 #include <nnvm/op_attr_types.h>
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/dense.h"
diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc
index 8b9b7a64aa0d..6a53e1994fc1 100644
--- a/nnvm/src/top/nn/pooling.cc
+++ b/nnvm/src/top/nn/pooling.cc
@@ -10,7 +10,7 @@
 #include <nnvm/compiler/op_attr_types.h>
 #include <nnvm/compiler/util.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/nn/pooling.h"
diff --git a/nnvm/src/top/nn/upsampling.cc b/nnvm/src/top/nn/upsampling.cc
index 6c5e13441406..f4bbeb62aa29 100644
--- a/nnvm/src/top/nn/upsampling.cc
+++ b/nnvm/src/top/nn/upsampling.cc
@@ -11,7 +11,7 @@
 #include <nnvm/node.h>
 #include <nnvm/op_attr_types.h>
 #include <nnvm/top/nn.h>
-#include "./nn_common.h"
+#include "nn_common.h"
 #include "../op_common.h"
 #include "../elemwise_op_common.h"
 #include "topi/elemwise.h"
diff --git a/src/arithmetic/canonical.cc b/src/arithmetic/canonical.cc
index ed6239961a3b..7acf79ff5308 100644
--- a/src/arithmetic/canonical.cc
+++ b/src/arithmetic/canonical.cc
@@ -6,8 +6,12 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/ir_pass.h>
-#include "./canonical.h"
-#include "./compute_expr.h"
+#include <algorithm>
+#include <map>
+#include <limits>
+#include <vector>
+#include "canonical.h"
+#include "compute_expr.h"
 #include "arithmetic/Simplify.h"
 
 namespace tvm {
diff --git a/src/arithmetic/detect_linear_equation.cc b/src/arithmetic/detect_linear_equation.cc
index 642a866866d2..109cdc6d9146 100644
--- a/src/arithmetic/detect_linear_equation.cc
+++ b/src/arithmetic/detect_linear_equation.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_functor_ext.h>
 #include <tvm/arithmetic.h>
-#include "./compute_expr.h"
+#include "compute_expr.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc
index c004b9666a58..c1b68fddd0e9 100644
--- a/src/arithmetic/int_set.cc
+++ b/src/arithmetic/int_set.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_functor_ext.h>
 #include <arithmetic/Interval.h>
 #include <unordered_map>
-#include "./compute_expr.h"
-#include "./int_set_internal.h"
+#include "compute_expr.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc
index c0eee45cc395..1c03d0f97485 100644
--- a/src/arithmetic/modular.cc
+++ b/src/arithmetic/modular.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/arithmetic.h>
 #include <limits>
-#include "./int_set_internal.h"
+#include "int_set_internal.h"
 
 namespace tvm {
 namespace arith {
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 8830588758ef..506a4f7ed92c 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -5,8 +5,8 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/opencl/aocl/aocl_module.h"
 #include "../runtime/file_util.h"
 
diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc
index ec27f41cc702..09a6c7e6ab4c 100644
--- a/src/codegen/codegen_c.cc
+++ b/src/codegen/codegen_c.cc
@@ -4,7 +4,7 @@
  */
 #include <iomanip>
 #include <cctype>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h
index 0f14415f2af6..b36e37da54fe 100644
--- a/src/codegen/codegen_c.h
+++ b/src/codegen/codegen_c.h
@@ -14,7 +14,7 @@
 #include <vector>
 #include <unordered_map>
 #include <unordered_set>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc
index 44c02830d0fc..7c8399cfc7b5 100644
--- a/src/codegen/codegen_cuda.cc
+++ b/src/codegen/codegen_cuda.cc
@@ -7,7 +7,7 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_cuda.h"
+#include "codegen_cuda.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index f5d9861ec6b2..cef2c77f9901 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_metal.cc b/src/codegen/codegen_metal.cc
index 37121ccb755c..3bbe98289439 100644
--- a/src/codegen/codegen_metal.cc
+++ b/src/codegen/codegen_metal.cc
@@ -5,8 +5,9 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_metal.h"
-#include "./build_common.h"
+#include <algorithm>
+#include "codegen_metal.h"
+#include "build_common.h"
 #include "../runtime/metal/metal_module.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/codegen/codegen_metal.h b/src/codegen/codegen_metal.h
index 6f8bef64bbcf..9779fb800ff9 100644
--- a/src/codegen/codegen_metal.h
+++ b/src/codegen/codegen_metal.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opencl.cc b/src/codegen/codegen_opencl.cc
index 2d5026e827e2..3d3de5e3bcf4 100644
--- a/src/codegen/codegen_opencl.cc
+++ b/src/codegen/codegen_opencl.cc
@@ -5,8 +5,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opencl.h"
-#include "./build_common.h"
+#include "codegen_opencl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 #include "../runtime/opencl/opencl_module.h"
 
diff --git a/src/codegen/codegen_opencl.h b/src/codegen/codegen_opencl.h
index 424bfa5ae2b3..90569d176a0b 100644
--- a/src/codegen/codegen_opencl.h
+++ b/src/codegen/codegen_opencl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc
index 5e750a39e598..7fd85d35409d 100644
--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
@@ -8,8 +8,8 @@
 #include <tvm/packed_func_ext.h>
 #include <vector>
 #include <string>
-#include "./codegen_opengl.h"
-#include "./build_common.h"
+#include "codegen_opengl.h"
+#include "build_common.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_opengl.h b/src/codegen/codegen_opengl.h
index 3cae1e323ec4..aa1552dfcff7 100644
--- a/src/codegen/codegen_opengl.h
+++ b/src/codegen/codegen_opengl.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 #include "../runtime/opengl/opengl_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_source_base.cc b/src/codegen/codegen_source_base.cc
index cf3a6ec5ab04..39a573ceec68 100644
--- a/src/codegen/codegen_source_base.cc
+++ b/src/codegen/codegen_source_base.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file codegen_source_base.cc
  */
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc
index b9f9f7505978..5776b895b4b3 100644
--- a/src/codegen/codegen_vhls.cc
+++ b/src/codegen/codegen_vhls.cc
@@ -5,8 +5,8 @@
 #include <tvm/build_module.h>
 #include <vector>
 #include <string>
-#include "./codegen_vhls.h"
-#include "./build_common.h"
+#include "codegen_vhls.h"
+#include "build_common.h"
 #include "../runtime/opencl/sdaccel/sdaccel_module.h"
 
 namespace tvm {
diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h
index bcb7d6f49d8c..c0faefc75837 100644
--- a/src/codegen/codegen_vhls.h
+++ b/src/codegen/codegen_vhls.h
@@ -9,7 +9,7 @@
 #include <tvm/codegen.h>
 #include <tvm/packed_func_ext.h>
 #include <string>
-#include "./codegen_c.h"
+#include "codegen_c.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule.cc b/src/codegen/intrin_rule.cc
index 5f15a879c2ed..822d515fb8a5 100644
--- a/src/codegen/intrin_rule.cc
+++ b/src/codegen/intrin_rule.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_default.cc
  * \brief Default intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_cuda.cc b/src/codegen/intrin_rule_cuda.cc
index 43461a15932d..ee98a54329ab 100644
--- a/src/codegen/intrin_rule_cuda.cc
+++ b/src/codegen/intrin_rule_cuda.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_cuda.cc
  * \brief CUDA intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_metal.cc b/src/codegen/intrin_rule_metal.cc
index 3c210919132e..8b499fb9ea9b 100644
--- a/src/codegen/intrin_rule_metal.cc
+++ b/src/codegen/intrin_rule_metal.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_metal.cc
  * \brief Metal intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_opencl.cc b/src/codegen/intrin_rule_opencl.cc
index d91deaeda5fe..1cb1aed01102 100644
--- a/src/codegen/intrin_rule_opencl.cc
+++ b/src/codegen/intrin_rule_opencl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_opengl.cc b/src/codegen/intrin_rule_opengl.cc
index e9728a25b40c..c9aa21c1a883 100644
--- a/src/codegen/intrin_rule_opengl.cc
+++ b/src/codegen/intrin_rule_opengl.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_opencl.cc
  * \brief OpenCL intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/intrin_rule_vhls.cc b/src/codegen/intrin_rule_vhls.cc
index b360142cd985..996c45707364 100644
--- a/src/codegen/intrin_rule_vhls.cc
+++ b/src/codegen/intrin_rule_vhls.cc
@@ -3,7 +3,7 @@
  * \file intrin_rule_vhls.cc
  * \brief VHLS intrinsic rules.
  */
-#include "./intrin_rule.h"
+#include "intrin_rule.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc
index 9d1decb43227..dd2cf6714251 100644
--- a/src/codegen/llvm/codegen_amdgpu.cc
+++ b/src/codegen/llvm/codegen_amdgpu.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/registry.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../codegen_source_base.h"
 #include "../../pass/ir_util.h"
diff --git a/src/codegen/llvm/codegen_arm.cc b/src/codegen/llvm/codegen_arm.cc
index 18a0eb54e182..9f19fa1f47f0 100644
--- a/src/codegen/llvm/codegen_arm.cc
+++ b/src/codegen/llvm/codegen_arm.cc
@@ -4,7 +4,7 @@
  * \brief ARM specific code generator
  */
 #ifdef TVM_LLVM_VERSION
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_cpu.cc b/src/codegen/llvm/codegen_cpu.cc
index a8a2127febde..436c727f86f0 100644
--- a/src/codegen/llvm/codegen_cpu.cc
+++ b/src/codegen/llvm/codegen_cpu.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/ir_pass.h>
-#include "./codegen_cpu.h"
+#include "codegen_cpu.h"
 #include "../../pass/ir_util.h"
 
 namespace tvm {
diff --git a/src/codegen/llvm/codegen_cpu.h b/src/codegen/llvm/codegen_cpu.h
index 5027dab911bd..b7a95a835d89 100644
--- a/src/codegen/llvm/codegen_cpu.h
+++ b/src/codegen/llvm/codegen_cpu.h
@@ -9,7 +9,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index ae576c981395..c1b1fe24f0a8 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -7,8 +7,8 @@
 
 #include <tvm/runtime/device_api.h>
 #include <tvm/runtime/c_runtime_api.h>
-#include "./codegen_llvm.h"
-#include "./codegen_cpu.h"
+#include "codegen_llvm.h"
+#include "codegen_cpu.h"
 #include "../codegen_common.h"
 #include "../../pass/ir_util.h"
 #include "../../arithmetic/compute_expr.h"
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index 4e61247f4acf..d0cee581a0b6 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -15,7 +15,7 @@
 #include <utility>
 #include <vector>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc
index 1cca1eacfe85..fc5ad99119ae 100644
--- a/src/codegen/llvm/codegen_nvptx.cc
+++ b/src/codegen/llvm/codegen_nvptx.cc
@@ -6,7 +6,7 @@
 #ifdef TVM_LLVM_VERSION
 
 #include <tvm/runtime/device_api.h>
-#include "./codegen_llvm.h"
+#include "codegen_llvm.h"
 #include "../build_common.h"
 #include "../../pass/ir_util.h"
 #include "../../runtime/cuda/cuda_module.h"
diff --git a/src/codegen/llvm/intrin_rule_llvm.cc b/src/codegen/llvm/intrin_rule_llvm.cc
index 4b2a3ca5bd02..307f0a3bc412 100644
--- a/src/codegen/llvm/intrin_rule_llvm.cc
+++ b/src/codegen/llvm/intrin_rule_llvm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_llvm.h b/src/codegen/llvm/intrin_rule_llvm.h
index 85641cb178e7..30e7674c3297 100644
--- a/src/codegen/llvm/intrin_rule_llvm.h
+++ b/src/codegen/llvm/intrin_rule_llvm.h
@@ -11,7 +11,7 @@
 #include <tvm/api_registry.h>
 #include <tvm/codegen.h>
 #include <string>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc
index b9bee94e9c24..092eb77f8f59 100644
--- a/src/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/codegen/llvm/intrin_rule_rocm.cc
@@ -4,7 +4,7 @@
  */
 #ifdef TVM_LLVM_VERSION
 
-#include "./intrin_rule_llvm.h"
+#include "intrin_rule_llvm.h"
 #include <tvm/ir.h>
 #include <tvm/expr.h>
 #include <tvm/api_registry.h>
diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc
index 01f2c8869dc1..9d1ba6b1068f 100644
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -6,7 +6,7 @@
 
 #include <tvm/base.h>
 #include <mutex>
-#include "./llvm_common.h"
+#include "llvm_common.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 1b0e43f9c23a..54f986d628d3 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -7,8 +7,8 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./llvm_common.h"
-#include "./codegen_llvm.h"
+#include "llvm_common.h"
+#include "codegen_llvm.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/module_util.h"
 
diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc
index 69dbda49976b..c7100e18735e 100644
--- a/src/codegen/source_module.cc
+++ b/src/codegen/source_module.cc
@@ -4,7 +4,7 @@
  * \brief Source code module, only for viewing
  */
 #include <tvm/runtime/packed_func.h>
-#include "./codegen_source_base.h"
+#include "codegen_source_base.h"
 #include "../runtime/file_util.h"
 #include "../runtime/meta_data.h"
 
diff --git a/src/codegen/spirv/build_vulkan.cc b/src/codegen/spirv/build_vulkan.cc
index 3cd1b56cda43..f5ec5628545a 100644
--- a/src/codegen/spirv/build_vulkan.cc
+++ b/src/codegen/spirv/build_vulkan.cc
@@ -8,7 +8,7 @@
 #include <dmlc/memory_io.h>
 #include <tvm/ir_pass.h>
 
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 #include "../build_common.h"
 #include "../../runtime/vulkan/vulkan_module.h"
 
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc
index 395bdff1477d..812fee4a114e 100644
--- a/src/codegen/spirv/codegen_spirv.cc
+++ b/src/codegen/spirv/codegen_spirv.cc
@@ -5,8 +5,9 @@
  */
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
+#include <string>
 #include "../codegen_common.h"
-#include "./codegen_spirv.h"
+#include "codegen_spirv.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h
index a6c09362ddf7..6a43182f7f2e 100644
--- a/src/codegen/spirv/codegen_spirv.h
+++ b/src/codegen/spirv/codegen_spirv.h
@@ -12,7 +12,7 @@
 
 #include <vector>
 
-#include "./ir_builder.h"
+#include "ir_builder.h"
 #include "../../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/codegen/spirv/ir_builder.cc b/src/codegen/spirv/ir_builder.cc
index eb7a67228e60..41cb48c5854b 100644
--- a/src/codegen/spirv/ir_builder.cc
+++ b/src/codegen/spirv/ir_builder.cc
@@ -3,7 +3,7 @@
  * \file ir_builder.cc
  * \brief IRBuilder for SPIRV block
  */
-#include "./ir_builder.h"
+#include "ir_builder.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/stackvm/codegen_stackvm.cc b/src/codegen/stackvm/codegen_stackvm.cc
index 517793ff14a3..0bede2dc0751 100644
--- a/src/codegen/stackvm/codegen_stackvm.cc
+++ b/src/codegen/stackvm/codegen_stackvm.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/packed_func_ext.h>
 #include <limits>
-#include "./codegen_stackvm.h"
+#include "codegen_stackvm.h"
 #include "../../runtime/stackvm/stackvm_module.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/codegen_verilog.cc b/src/codegen/verilog/codegen_verilog.cc
index a4887390ad5d..d7e149257fdb 100644
--- a/src/codegen/verilog/codegen_verilog.cc
+++ b/src/codegen/verilog/codegen_verilog.cc
@@ -6,7 +6,7 @@
 #include <cctype>
 #include <sstream>
 #include <iostream>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/codegen_verilog.h b/src/codegen/verilog/codegen_verilog.h
index 7c8b811c2fa6..a38640ac3799 100644
--- a/src/codegen/verilog/codegen_verilog.h
+++ b/src/codegen/verilog/codegen_verilog.h
@@ -14,7 +14,7 @@
 #include <string>
 #include <vector>
 #include <unordered_map>
-#include "./verilog_ir.h"
+#include "verilog_ir.h"
 #include "../codegen_source_base.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc
index 1a03fc881665..b7576c83dfa8 100644
--- a/src/codegen/verilog/verilog_ir.cc
+++ b/src/codegen/verilog/verilog_ir.cc
@@ -5,7 +5,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./verilog_ir.h"
+#include <utility>
+#include "verilog_ir.h"
 #include "../../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/codegen/verilog/verilog_module.cc b/src/codegen/verilog/verilog_module.cc
index 0319d6e6556c..0670a02e34ac 100644
--- a/src/codegen/verilog/verilog_module.cc
+++ b/src/codegen/verilog/verilog_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <tvm/codegen.h>
 #include <mutex>
-#include "./codegen_verilog.h"
+#include "codegen_verilog.h"
 #include "../../runtime/file_util.h"
 #include "../../runtime/meta_data.h"
 
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
index d53a12962fd7..656630351cf5 100644
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -10,7 +10,7 @@
 #include <unordered_map>
 #include <map>
 #include <queue>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/codegen/verilog/vpi_session.cc b/src/codegen/verilog/vpi_session.cc
index 6fbbbc01d32b..ac2861e8f74f 100644
--- a/src/codegen/verilog/vpi_session.cc
+++ b/src/codegen/verilog/vpi_session.cc
@@ -4,7 +4,7 @@
  * \brief IPC session call to verilog simulator via VPI.
  */
 #include <tvm/api_registry.h>
-#include "./vpi_session.h"
+#include "vpi_session.h"
 
 namespace tvm {
 namespace codegen {
diff --git a/src/contrib/nnpack/convolution.cc b/src/contrib/nnpack/convolution.cc
index 9ca02118aeb3..f658a1fe96d4 100644
--- a/src/contrib/nnpack/convolution.cc
+++ b/src/contrib/nnpack/convolution.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/nnpack/fully_connected.cc b/src/contrib/nnpack/fully_connected.cc
index df6356d933aa..ad2569e1f2e1 100644
--- a/src/contrib/nnpack/fully_connected.cc
+++ b/src/contrib/nnpack/fully_connected.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/util.h>
 #include <dmlc/logging.h>
 #include <nnpack.h>
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/nnpack/nnpack_utils.cc b/src/contrib/nnpack/nnpack_utils.cc
index 631f25b36647..3220d7af339f 100644
--- a/src/contrib/nnpack/nnpack_utils.cc
+++ b/src/contrib/nnpack/nnpack_utils.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file Use external nnpack library call.
  */
-#include "./nnpack_utils.h"
+#include "nnpack_utils.h"
 
 namespace tvm {
 namespace contrib {
diff --git a/src/contrib/random/random.cc b/src/contrib/random/random.cc
index 27e2b065a01b..68821fe04124 100644
--- a/src/contrib/random/random.cc
+++ b/src/contrib/random/random.cc
@@ -8,9 +8,9 @@
 #include <dmlc/thread_local.h>
 #include <algorithm>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./mt_random_engine.cc"
+#include "mt_random_engine.cc"
 #else
-#include "./sgx_random_engine.cc"
+#include "sgx_random_engine.cc"
 #endif
 
 #define DLPACK_INTEGER_TYPE_SWITCH(type, DType, ...)    \
diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc
index 366ea2c78fe6..267a25ff372b 100644
--- a/src/op/compute_op.cc
+++ b/src/op/compute_op.cc
@@ -9,8 +9,9 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include <string>
+#include "compute_op.h"
+#include "op_util.h"
 #include "../schedule/message_passing.h"
 
 namespace tvm {
diff --git a/src/op/cross_thread_reduction.cc b/src/op/cross_thread_reduction.cc
index eb320388860a..c4599dee9bd8 100644
--- a/src/op/cross_thread_reduction.cc
+++ b/src/op/cross_thread_reduction.cc
@@ -4,8 +4,8 @@
  * \file cross_thread_reduction.cc
  */
 #include <tvm/ir_pass.h>
-#include "./compute_op.h"
-#include "./op_util.h"
+#include "compute_op.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
diff --git a/src/op/extern_op.cc b/src/op/extern_op.cc
index 759e258e90ef..86c1d5e74527 100644
--- a/src/op/extern_op.cc
+++ b/src/op/extern_op.cc
@@ -7,7 +7,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <unordered_set>
-#include "./op_util.h"
+#include "op_util.h"
 
 namespace tvm {
 using namespace ir;
diff --git a/src/op/op_util.cc b/src/op/op_util.cc
index 4f34d8d972ce..ba83997a0a16 100644
--- a/src/op/op_util.cc
+++ b/src/op/op_util.cc
@@ -7,7 +7,8 @@
 #include <tvm/ir_pass.h>
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
-#include "./op_util.h"
+#include <string>
+#include "op_util.h"
 #include "../schedule/message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/op/scan_op.cc b/src/op/scan_op.cc
index 626c8eba46b4..d03601709ab4 100644
--- a/src/op/scan_op.cc
+++ b/src/op/scan_op.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./op_util.h"
+#include "op_util.h"
 #include "../schedule/graph.h"
 
 namespace tvm {
diff --git a/src/op/tensorize.cc b/src/op/tensorize.cc
index 148ad0f90fe7..6423c4e942e4 100644
--- a/src/op/tensorize.cc
+++ b/src/op/tensorize.cc
@@ -7,8 +7,8 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
-#include "./op_util.h"
-#include "./compute_op.h"
+#include "op_util.h"
+#include "compute_op.h"
 #include "../schedule/message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/pass/arg_binder.cc b/src/pass/arg_binder.cc
index 390c918d9692..0fac313c079b 100644
--- a/src/pass/arg_binder.cc
+++ b/src/pass/arg_binder.cc
@@ -6,8 +6,8 @@
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
 #include <tvm/runtime/device_api.h>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/coproc_sync.cc b/src/pass/coproc_sync.cc
index b3e64a989702..13dfef107e87 100644
--- a/src/pass/coproc_sync.cc
+++ b/src/pass/coproc_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/inject_double_buffer.cc b/src/pass/inject_double_buffer.cc
index 03ffdb01e107..1384ea1a89ac 100644
--- a/src/pass/inject_double_buffer.cc
+++ b/src/pass/inject_double_buffer.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/ir_mutator.cc b/src/pass/ir_mutator.cc
index 9ca9ccd190ff..e82c4f554be0 100644
--- a/src/pass/ir_mutator.cc
+++ b/src/pass/ir_mutator.cc
@@ -5,7 +5,7 @@
 #include <tvm/ir.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/packed_func_ext.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/ir_util.cc b/src/pass/ir_util.cc
index 579706ca9964..d06839beca33 100644
--- a/src/pass/ir_util.cc
+++ b/src/pass/ir_util.cc
@@ -3,7 +3,7 @@
  * \file ir_util.cc
  * \brief Helper functions to construct and compose IR nodes.
  */
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lift_attr_scope.cc b/src/pass/lift_attr_scope.cc
index a3a60aaac4d1..d5fd53812b99 100644
--- a/src/pass/lift_attr_scope.cc
+++ b/src/pass/lift_attr_scope.cc
@@ -7,7 +7,7 @@
  */
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc
index 33ac6a94ecf7..b38051326d1d 100644
--- a/src/pass/lower_intrin.cc
+++ b/src/pass/lower_intrin.cc
@@ -8,7 +8,7 @@
 #include <tvm/ir_pass.h>
 #include <tvm/api_registry.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/lower_thread_allreduce.cc b/src/pass/lower_thread_allreduce.cc
index 8c0eb037d953..4d7f086d0534 100644
--- a/src/pass/lower_thread_allreduce.cc
+++ b/src/pass/lower_thread_allreduce.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/lower_tvm_builtin.cc b/src/pass/lower_tvm_builtin.cc
index a63fef07bd12..46686a65803a 100644
--- a/src/pass/lower_tvm_builtin.cc
+++ b/src/pass/lower_tvm_builtin.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/lower_warp_memory.cc b/src/pass/lower_warp_memory.cc
index 8f153fd61188..85ae365f2a82 100644
--- a/src/pass/lower_warp_memory.cc
+++ b/src/pass/lower_warp_memory.cc
@@ -13,7 +13,7 @@
 #include <tvm/ir_visitor.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc
index 206bd95010ce..8113c58f3f78 100644
--- a/src/pass/make_api.cc
+++ b/src/pass/make_api.cc
@@ -12,8 +12,8 @@
 #include <utility>
 #include <unordered_set>
 
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/narrow_channel_access.cc b/src/pass/narrow_channel_access.cc
index 733eeffb632e..7faf7d1b173e 100644
--- a/src/pass/narrow_channel_access.cc
+++ b/src/pass/narrow_channel_access.cc
@@ -11,7 +11,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/arithmetic.h>
 #include <tvm/channel.h>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/split_pipeline.cc b/src/pass/split_pipeline.cc
index 38bd5f86fd68..0dd5bd65106f 100644
--- a/src/pass/split_pipeline.cc
+++ b/src/pass/split_pipeline.cc
@@ -11,7 +11,7 @@
 #include <tvm/channel.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
+#include "ir_util.h"
 
 namespace tvm {
 namespace ir {
diff --git a/src/pass/storage_access.cc b/src/pass/storage_access.cc
index 09be1a53da42..e7adcc75854f 100644
--- a/src/pass/storage_access.cc
+++ b/src/pass/storage_access.cc
@@ -5,8 +5,9 @@
 #include <tvm/ir_pass.h>
 #include <tvm/ir_mutator.h>
 #include <tvm/target_info.h>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include <string>
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc
index f5cb98495ff9..28a6ace9bfa6 100644
--- a/src/pass/storage_flatten.cc
+++ b/src/pass/storage_flatten.cc
@@ -14,8 +14,8 @@
 #include <tvm/target_info.h>
 #include <tvm/runtime/device_api.h>
 #include <unordered_map>
-#include "./ir_util.h"
-#include "./arg_binder.h"
+#include "ir_util.h"
+#include "arg_binder.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc
index 877216ed7656..58b62f291d39 100644
--- a/src/pass/storage_rewrite.cc
+++ b/src/pass/storage_rewrite.cc
@@ -12,7 +12,7 @@
 #include <map>
 #include <unordered_set>
 #include <unordered_map>
-#include "./ir_util.h"
+#include "ir_util.h"
 #include "../arithmetic/compute_expr.h"
 #include "../runtime/thread_storage_scope.h"
 
diff --git a/src/pass/storage_sync.cc b/src/pass/storage_sync.cc
index 6e2d1020a6b5..43f3b94d114f 100644
--- a/src/pass/storage_sync.cc
+++ b/src/pass/storage_sync.cc
@@ -8,8 +8,8 @@
 #include <tvm/ir_visitor.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./ir_util.h"
-#include "./storage_access.h"
+#include "ir_util.h"
+#include "storage_access.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/runtime/c_dsl_api.cc b/src/runtime/c_dsl_api.cc
index 6ae8b9911a4c..ae39a1266d06 100644
--- a/src/runtime/c_dsl_api.cc
+++ b/src/runtime/c_dsl_api.cc
@@ -5,8 +5,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <tvm/c_dsl_api.h>
-#include "./dsl_api.h"
-#include "./runtime_base.h"
+#include "dsl_api.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 916dfadecb4c..a081a4c1df11 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -17,7 +17,7 @@
 #include <algorithm>
 #include <string>
 #include <cstdlib>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index e3434e01813e..d166a3a43dfa 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -8,7 +8,7 @@
 #include <tvm/runtime/device_api.h>
 #include <cstdlib>
 #include <cstring>
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 98accdf1b0aa..8309b45a7963 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -8,7 +8,7 @@
 #include <dmlc/thread_local.h>
 #include <tvm/runtime/registry.h>
 #include <cuda_runtime.h>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index a0e613107bae..4984517b16c6 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -2,7 +2,7 @@
  *  Copyright (c) 2017 by Contributors
  * \file cuda_module.cc
  */
-#include "./cuda_module.h"
+#include "cuda_module.h"
 
 #include <tvm/runtime/registry.h>
 #include <cuda.h>
@@ -11,7 +11,7 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./cuda_common.h"
+#include "cuda_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/dso_module.cc b/src/runtime/dso_module.cc
index 60fdb427c246..fe7c362472d1 100644
--- a/src/runtime/dso_module.cc
+++ b/src/runtime/dso_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/packed_func.h>
-#include "./module_util.h"
+#include "module_util.h"
 
 #if defined(_WIN32)
 #include <windows.h>
diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc
index 7606bf89cd92..4df335a54f25 100644
--- a/src/runtime/file_util.cc
+++ b/src/runtime/file_util.cc
@@ -6,8 +6,9 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/serializer.h>
 #include <fstream>
+#include <vector>
 
-#include "./file_util.h"
+#include "file_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h
index b3357271856e..de520fa3158c 100644
--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -7,7 +7,7 @@
 #define TVM_RUNTIME_FILE_UTIL_H_
 
 #include <string>
-#include "./meta_data.h"
+#include "meta_data.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc
index 7a75771af23b..34bde9a89e36 100644
--- a/src/runtime/graph/graph_runtime.cc
+++ b/src/runtime/graph/graph_runtime.cc
@@ -8,7 +8,10 @@
 #include <dmlc/memory_io.h>
 #include <dmlc/json.h>
 #include <numeric>
-#include "./graph_runtime.h"
+#include <algorithm>
+#include <vector>
+#include <functional>
+#include "graph_runtime.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 381bf9f60c79..40d08015e8cd 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -11,7 +11,7 @@
 #include <tvm/runtime/packed_func.h>
 #include <string>
 #include <vector>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 47c2899cea71..fcdbf13138a8 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./metal_common.h"
+#include "metal_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm
index c79e2cf11ac5..c538957ca561 100644
--- a/src/runtime/metal/metal_module.mm
+++ b/src/runtime/metal/metal_module.mm
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./metal_module.h"
-#include "./metal_common.h"
+#include "metal_module.h"
+#include "metal_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index dbddfde44733..80dc1f3172f8 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -9,7 +9,7 @@
 #include <unordered_set>
 #include <cstring>
 #ifndef _LIBCPP_SGX_CONFIG
-#include "./file_util.h"
+#include "file_util.h"
 #endif
 
 namespace tvm {
diff --git a/src/runtime/module_util.cc b/src/runtime/module_util.cc
index 95da78d23f09..0c6d8ae4058d 100644
--- a/src/runtime/module_util.cc
+++ b/src/runtime/module_util.cc
@@ -8,7 +8,8 @@
 #endif
 #include <tvm/runtime/module.h>
 #include <tvm/runtime/registry.h>
-#include "./module_util.h"
+#include <string>
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 424a2b09cb15..04c178f25dfa 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -7,7 +7,7 @@
 #include <tvm/runtime/ndarray.h>
 #include <tvm/runtime/c_runtime_api.h>
 #include <tvm/runtime/device_api.h>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 // deleter for arrays used by DLPack exporter
 extern "C" void NDArrayDLPackDeleter(DLManagedTensor* tensor);
diff --git a/src/runtime/opencl/aocl/aocl_device_api.cc b/src/runtime/opencl/aocl/aocl_device_api.cc
index e9cbc6b4cda0..61f636df6039 100644
--- a/src/runtime/opencl/aocl/aocl_device_api.cc
+++ b/src/runtime/opencl/aocl/aocl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./aocl_common.h"
+#include "aocl_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/aocl/aocl_module.cc b/src/runtime/opencl/aocl/aocl_module.cc
index a056c5cee671..bbf2828fbd79 100644
--- a/src/runtime/opencl/aocl/aocl_module.cc
+++ b/src/runtime/opencl/aocl/aocl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./aocl_common.h"
-#include "./aocl_module.h"
+#include "aocl_common.h"
+#include "aocl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index ac9373f1375b..f1e224e5a9d1 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./opencl_common.h"
+#include "opencl_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 3efd789513ba..6d392036cc56 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./opencl_common.h"
-#include "./opencl_module.h"
+#include "opencl_common.h"
+#include "opencl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
index 4b057b7e009a..bc98759b9b3f 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_device_api.cc
@@ -4,7 +4,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
-#include "./sdaccel_common.h"
+#include "sdaccel_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opencl/sdaccel/sdaccel_module.cc b/src/runtime/opencl/sdaccel/sdaccel_module.cc
index c99e78c8e347..de9a710fbfe8 100644
--- a/src/runtime/opencl/sdaccel/sdaccel_module.cc
+++ b/src/runtime/opencl/sdaccel/sdaccel_module.cc
@@ -7,8 +7,8 @@
 #include <vector>
 #include <string>
 #include <unordered_map>
-#include "./sdaccel_common.h"
-#include "./sdaccel_module.h"
+#include "sdaccel_common.h"
+#include "sdaccel_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index 3a21ed6e6d07..191b64b6ce0a 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <cstring>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/opengl/opengl_module.cc b/src/runtime/opengl/opengl_module.cc
index d800af95f053..976227a2924b 100644
--- a/src/runtime/opengl/opengl_module.cc
+++ b/src/runtime/opengl/opengl_module.cc
@@ -4,8 +4,8 @@
  */
 #include <tvm/runtime/registry.h>
 #include <utility>
-#include "./opengl_common.h"
-#include "./opengl_module.h"
+#include "opengl_common.h"
+#include "opengl_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../file_util.h"
diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc
index 3f72828390ee..d7bbc3ce9996 100644
--- a/src/runtime/registry.cc
+++ b/src/runtime/registry.cc
@@ -10,7 +10,7 @@
 #include <mutex>
 #include <memory>
 #include <array>
-#include "./runtime_base.h"
+#include "runtime_base.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 6aff5e56c715..355200a0cbb0 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -10,7 +10,7 @@
 #include <tvm/runtime/registry.h>
 #include <hip/hip_runtime_api.h>
 #include <hsa/hsa.h>
-#include "./rocm_common.h"
+#include "rocm_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rocm/rocm_module.cc b/src/runtime/rocm/rocm_module.cc
index 503b04872c82..da3b04f66c49 100644
--- a/src/runtime/rocm/rocm_module.cc
+++ b/src/runtime/rocm/rocm_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./rocm_module.h"
-#include "./rocm_common.h"
+#include "rocm_module.h"
+#include "rocm_common.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index 5740a393c253..4242f8e1ae58 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -5,7 +5,7 @@
 #include <dmlc/logging.h>
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/device_api.h>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_event_impl.cc b/src/runtime/rpc/rpc_event_impl.cc
index fc5ecca1f421..e553c6fad4a0 100644
--- a/src/runtime/rpc/rpc_event_impl.cc
+++ b/src/runtime/rpc/rpc_event_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc
index d6c56e1b7cf4..80a8cc93ce19 100644
--- a/src/runtime/rpc/rpc_module.cc
+++ b/src/runtime/rpc/rpc_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/registry.h>
 #include <memory>
 #include <cstring>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index 6bb01b9bd459..0e2d637ab475 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -11,7 +11,9 @@
 #include <array>
 #include <string>
 #include <chrono>
-#include "./rpc_session.h"
+#include <vector>
+#include <utility>
+#include "rpc_session.h"
 #include "../../common/ring_buffer.h"
 
 namespace tvm {
diff --git a/src/runtime/rpc/rpc_socket_impl.cc b/src/runtime/rpc/rpc_socket_impl.cc
index 22f221d46526..6b2fa6c1f608 100644
--- a/src/runtime/rpc/rpc_socket_impl.cc
+++ b/src/runtime/rpc/rpc_socket_impl.cc
@@ -5,7 +5,7 @@
  */
 #include <tvm/runtime/registry.h>
 #include <memory>
-#include "./rpc_session.h"
+#include "rpc_session.h"
 #include "../../common/socket.h"
 
 namespace tvm {
diff --git a/src/runtime/sgx/trusted/runtime.cc b/src/runtime/sgx/trusted/runtime.cc
index a863327f956c..b7f66efbc97c 100644
--- a/src/runtime/sgx/trusted/runtime.cc
+++ b/src/runtime/sgx/trusted/runtime.cc
@@ -12,9 +12,9 @@
 #include "../../system_lib_module.cc"
 #include "../../thread_pool.cc"
 #include "../../workspace_pool.cc"
-#include "./ecall_registry.h"
-#include "./runtime.h"
-#include "./threading_backend.cc"
+#include "ecall_registry.h"
+#include "runtime.h"
+#include "threading_backend.cc"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/stackvm/stackvm.cc b/src/runtime/stackvm/stackvm.cc
index f86bfec087e4..f45d83027467 100644
--- a/src/runtime/stackvm/stackvm.cc
+++ b/src/runtime/stackvm/stackvm.cc
@@ -7,7 +7,7 @@
 #include <tvm/runtime/util.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <algorithm>
-#include "./stackvm.h"
+#include "stackvm.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc
index 71ca9ba6c09a..7256c47862e5 100644
--- a/src/runtime/stackvm/stackvm_module.cc
+++ b/src/runtime/stackvm/stackvm_module.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/module.h>
 #include <dmlc/memory_io.h>
-#include "./stackvm_module.h"
+#include "stackvm_module.h"
 #include "../file_util.h"
 #include "../module_util.h"
 
diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h
index fcd51a64f870..918228faea1f 100644
--- a/src/runtime/stackvm/stackvm_module.h
+++ b/src/runtime/stackvm/stackvm_module.h
@@ -8,7 +8,7 @@
 
 #include <tvm/runtime/packed_func.h>
 #include <string>
-#include "./stackvm.h"
+#include "stackvm.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/system_lib_module.cc b/src/runtime/system_lib_module.cc
index 01ff99d7da87..ed48cb1a9d44 100644
--- a/src/runtime/system_lib_module.cc
+++ b/src/runtime/system_lib_module.cc
@@ -6,7 +6,7 @@
 #include <tvm/runtime/registry.h>
 #include <tvm/runtime/c_backend_api.h>
 #include <mutex>
-#include "./module_util.h"
+#include "module_util.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index 45f8549d54f2..cc89804806d2 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -5,7 +5,7 @@
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 #include <cstring>
-#include "./vulkan_common.h"
+#include "vulkan_common.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/runtime/vulkan/vulkan_module.cc b/src/runtime/vulkan/vulkan_module.cc
index b5425dd8fbc5..134c5fa45ba4 100644
--- a/src/runtime/vulkan/vulkan_module.cc
+++ b/src/runtime/vulkan/vulkan_module.cc
@@ -8,8 +8,8 @@
 #include <array>
 #include <string>
 #include <mutex>
-#include "./vulkan_common.h"
-#include "./vulkan_module.h"
+#include "vulkan_common.h"
+#include "vulkan_module.h"
 #include "../pack_args.h"
 #include "../thread_storage_scope.h"
 #include "../meta_data.h"
diff --git a/src/runtime/workspace_pool.cc b/src/runtime/workspace_pool.cc
index c903a8621206..d43b4641192c 100644
--- a/src/runtime/workspace_pool.cc
+++ b/src/runtime/workspace_pool.cc
@@ -3,7 +3,7 @@
  * \file workspace_pool.h
  * \brief Workspace pool utility.
  */
-#include "./workspace_pool.h"
+#include "workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc
index 7929969a8502..05c04834e78c 100644
--- a/src/schedule/bound.cc
+++ b/src/schedule/bound.cc
@@ -9,8 +9,8 @@
 #include <tvm/ir_pass.h>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
-#include "./message_passing.h"
+#include "graph.h"
+#include "message_passing.h"
 #include "../runtime/thread_storage_scope.h"
 
 namespace tvm {
diff --git a/src/schedule/graph.cc b/src/schedule/graph.cc
index da0aeb0eccaa..d92e7730b313 100644
--- a/src/schedule/graph.cc
+++ b/src/schedule/graph.cc
@@ -8,7 +8,7 @@
 #include <tvm/operation.h>
 #include <unordered_set>
 #include <unordered_map>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 namespace schedule {
diff --git a/src/schedule/message_passing.cc b/src/schedule/message_passing.cc
index b13dcefb1b9f..622e0b698902 100644
--- a/src/schedule/message_passing.cc
+++ b/src/schedule/message_passing.cc
@@ -6,7 +6,7 @@
 #include <tvm/arithmetic.h>
 #include <tvm/ir.h>
 #include <tvm/ir_pass.h>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../arithmetic/compute_expr.h"
 
 namespace tvm {
diff --git a/src/schedule/schedule_dataflow_rewrite.cc b/src/schedule/schedule_dataflow_rewrite.cc
index e9fbcba088fe..fa26aea51a2b 100644
--- a/src/schedule/schedule_dataflow_rewrite.cc
+++ b/src/schedule/schedule_dataflow_rewrite.cc
@@ -7,7 +7,7 @@
 #include <tvm/ir_mutator.h>
 #include <tvm/ir_pass.h>
 #include <unordered_set>
-#include "./message_passing.h"
+#include "message_passing.h"
 #include "../pass/ir_util.h"
 #include "../arithmetic/compute_expr.h"
 
diff --git a/src/schedule/schedule_lang.cc b/src/schedule/schedule_lang.cc
index eea8aa1aae80..1490c85ff786 100644
--- a/src/schedule/schedule_lang.cc
+++ b/src/schedule/schedule_lang.cc
@@ -6,7 +6,7 @@
 #include <tvm/operation.h>
 #include <tvm/ir_mutator.h>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 
 namespace tvm {
 
diff --git a/src/schedule/schedule_ops.cc b/src/schedule/schedule_ops.cc
index 6fd2496aeabe..242423695464 100644
--- a/src/schedule/schedule_ops.cc
+++ b/src/schedule/schedule_ops.cc
@@ -11,7 +11,7 @@
 #include <utility>
 #include <unordered_map>
 #include <unordered_set>
-#include "./graph.h"
+#include "graph.h"
 #include "../op/op_util.h"
 #include "../pass/ir_util.h"
 
diff --git a/verilog/tvm_vpi.cc b/verilog/tvm_vpi.cc
index c663f7df51be..949b660ce447 100644
--- a/verilog/tvm_vpi.cc
+++ b/verilog/tvm_vpi.cc
@@ -8,7 +8,9 @@
 #include <cstdlib>
 #include <memory>
 #include <queue>
-#include "./tvm_vpi.h"
+#include <string>
+#include <vector>
+#include "tvm_vpi.h"
 #include "../src/common/pipe.h"
 
 namespace tvm {
diff --git a/vta/hardware/xilinx/src/vta.cc b/vta/hardware/xilinx/src/vta.cc
index 8d0432477486..5ca2cec6575d 100644
--- a/vta/hardware/xilinx/src/vta.cc
+++ b/vta/hardware/xilinx/src/vta.cc
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./vta.h"
+#include "vta.h"
 
 void fetch(
   uint32_t insn_count,
diff --git a/vta/include/vta/runtime.h b/vta/include/vta/runtime.h
index 6d77067be931..e58d45486282 100644
--- a/vta/include/vta/runtime.h
+++ b/vta/include/vta/runtime.h
@@ -11,7 +11,7 @@
 extern "C" {
 #endif
 
-#include "./driver.h"
+#include "driver.h"
 
 #define VTA_MEMCPY_H2D 1
 #define VTA_MEMCPY_D2H 2
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc
index e2630b14acde..1909ed35c562 100644
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
@@ -6,7 +6,7 @@
 
 #include <vta/driver.h>
 #include <thread>
-#include "./pynq_driver.h"
+#include "pynq_driver.h"
 
 
 void* VTAMemAlloc(size_t size, int cached) {
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index 6c6d28ec0c69..95b793ea3ba1 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -4,7 +4,7 @@
  * \brief Test library for the VTA design simulation and driver tests.
  */
 
-#include "./test_lib.h"
+#include "test_lib.h"
 
 #ifdef NO_SIM
 #ifdef VTA_TARGET_PYNQ

From 56ab0adbe63305164d43062cb76451815ebf5d45 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 08:28:55 -0700
Subject: [PATCH 56/77] [RUNTIME][PYTHON] Switch to use __new__ for
 constructing node. (#1644)

---
 python/tvm/_ffi/_ctypes/node.py  | 18 +++++++-----------
 python/tvm/_ffi/_cython/base.pxi |  4 ++--
 python/tvm/_ffi/_cython/node.pxi | 12 ++++++------
 python/tvm/_ffi/node.py          |  9 ++++++++-
 python/tvm/target.py             | 12 +++++++-----
 5 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index 01244519532b..925aa93f8f96 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -24,7 +24,13 @@ def _return_node(x):
         handle = NodeHandle(handle)
     tindex = ctypes.c_int()
     check_call(_LIB.TVMNodeGetTypeIndex(handle, ctypes.byref(tindex)))
-    return NODE_TYPE.get(tindex.value, NodeBase)(handle)
+    cls = NODE_TYPE.get(tindex.value, NodeBase)
+    # Avoid calling __init__ of cls, instead directly call __new__
+    # This allows child class to implement their own __init__
+    node = cls.__new__(cls)
+    node.handle = handle
+    return node
+
 
 RETURN_SWITCH[TypeCode.NODE_HANDLE] = _return_node
 C_TO_PY_ARG_SWITCH[TypeCode.NODE_HANDLE] = _wrap_arg_func(
@@ -34,16 +40,6 @@ def _return_node(x):
 class NodeBase(object):
     __slots__ = ["handle"]
     # pylint: disable=no-member
-    def __init__(self, handle):
-        """Initialize the function with handle
-
-        Parameters
-        ----------
-        handle : SymbolHandle
-            the handle to the underlying C++ Symbol
-        """
-        self.handle = handle
-
     def __del__(self):
         if _LIB is not None:
             check_call(_LIB.TVMNodeFree(self.handle))
diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi
index 00173c431bb7..ac5532835c47 100644
--- a/python/tvm/_ffi/_cython/base.pxi
+++ b/python/tvm/_ffi/_cython/base.pxi
@@ -106,8 +106,8 @@ cdef extern from "tvm/runtime/c_runtime_api.h":
 
 cdef extern from "tvm/c_dsl_api.h":
     int TVMNodeFree(NodeHandle handle)
-    TVMNodeTypeKey2Index(const char* type_key,
-                         int* out_index)
+    int TVMNodeTypeKey2Index(const char* type_key,
+                             int* out_index)
     int TVMNodeGetTypeIndex(NodeHandle handle,
                             int* out_index)
     int TVMNodeGetAttr(NodeHandle handle,
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index a563af5237f9..1ced48878803 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -1,3 +1,4 @@
+from ... import _api_internal
 from ..base import string_types
 from ..node_generic import _set_class_node_base
 
@@ -10,6 +11,7 @@ def _register_node(int index, object cls):
         NODE_TYPE.append(None)
     NODE_TYPE[index] = cls
 
+
 cdef inline object make_ret_node(void* chandle):
     global NODE_TYPE
     cdef int tindex
@@ -20,14 +22,15 @@ cdef inline object make_ret_node(void* chandle):
     if tindex < len(node_type):
         cls = node_type[tindex]
         if cls is not None:
-            obj = cls(None)
+            obj = cls.__new__(cls)
         else:
-            obj = NodeBase(None)
+            obj = NodeBase.__new__(NodeBase)
     else:
-        obj = NodeBase(None)
+        obj = NodeBase.__new__(NodeBase)
     (<NodeBase>obj).chandle = chandle
     return obj
 
+
 cdef class NodeBase:
     cdef void* chandle
 
@@ -49,9 +52,6 @@ cdef class NodeBase:
         def __set__(self, value):
             self._set_handle(value)
 
-    def __init__(self, handle):
-        self._set_handle(handle)
-
     def __dealloc__(self):
         CALL(TVMNodeFree(self.chandle))
 
diff --git a/python/tvm/_ffi/node.py b/python/tvm/_ffi/node.py
index d9e7397ae71f..98ece19f77f2 100644
--- a/python/tvm/_ffi/node.py
+++ b/python/tvm/_ffi/node.py
@@ -21,6 +21,12 @@
     # pylint: disable=wrong-import-position
     from ._ctypes.node import _register_node, NodeBase as _NodeBase
 
+
+def _new_object(cls):
+    """Helper function for pickle"""
+    return cls.__new__(cls)
+
+
 class NodeBase(_NodeBase):
     """NodeBase is the base class of all TVM language AST object."""
     def __repr__(self):
@@ -46,7 +52,8 @@ def __ne__(self, other):
         return not self.__eq__(other)
 
     def __reduce__(self):
-        return (type(self), (None,), self.__getstate__())
+        cls = type(self)
+        return (_new_object, (cls, ), self.__getstate__())
 
     def __getstate__(self):
         handle = self.handle
diff --git a/python/tvm/target.py b/python/tvm/target.py
index 40f9e099b3a6..07200058a021 100644
--- a/python/tvm/target.py
+++ b/python/tvm/target.py
@@ -79,11 +79,13 @@ class Target(NodeBase):
     - :any:`tvm.target.mali` create Mali target
     - :any:`tvm.target.intel_graphics` create Intel Graphics target
     """
-    def __init__(self, handle):
-        super(Target, self).__init__(handle)
-        self._keys = None
-        self._options = None
-        self._libs = None
+    def __new__(cls):
+        # Override __new__, not __init__: the FFI builds nodes via cls.__new__(cls).
+        obj = NodeBase.__new__(cls)
+        obj._keys = None
+        obj._options = None
+        obj._libs = None
+        return obj
 
     @property
     def keys(self):

From 7afe6ba84e73205265150c1b91aa332a4e9aab01 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 23 Aug 2018 13:43:23 -0700
Subject: [PATCH 57/77] fix CO CI problem (#1641)

---
 tutorials/autotvm/tune_conv2d_cuda.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 375d1a9b755e..3ff26a05064d 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -64,7 +64,7 @@
 #
 
 @autotvm.template
-def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
+def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
     assert N == 1, "Only consider batch_size = 1 in this template"
 
     data = tvm.placeholder((N, CI, H, W), name='data')
@@ -206,8 +206,8 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding):
 
 np.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
 
-# Evaluate running time. Here we choose a large repeat number (200) to reduce the noise
+# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
 # and the overhead of kernel launch. You can also use nvprof to validate the result.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=200)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
 print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)
 

From 3dbc0d647640492e9fe1bb2c597682d447bcb73a Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Fri, 24 Aug 2018 05:44:02 +0900
Subject: [PATCH 58/77] [RUNTIME] [OPENCL] Fix access modifiers (#1643)

---
 src/runtime/opencl/opencl_common.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index c37dbaa94d7a..d42cc669e742 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -260,14 +260,12 @@ class OpenCLModuleNode : public ModuleNode {
                           const std::string& func_name,
                           const KTRefEntry& e);
 
- protected:
+ private:
   // The workspace, need to keep reference to use it in destructor.
   // In case of static destruction order problem.
   std::shared_ptr<cl::OpenCLWorkspace> workspace_;
   // the binary data
   std::string data_;
-
- private:
   // The format
   std::string fmt_;
   // function information table.

From e33654452de9aa6c40a9c14164aacb30da0ce0a5 Mon Sep 17 00:00:00 2001
From: Jared Roesch <roeschinc@gmail.com>
Date: Thu, 23 Aug 2018 14:57:53 -0700
Subject: [PATCH 59/77] [FIX] Fix issue with TypedPackedFunc template
 instantiation (#1649)

---
 include/tvm/runtime/packed_func.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 1c873a5ebccc..b7351274a350 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -180,9 +180,7 @@ class TypedPackedFunc<R(Args...)> {
    *
    * \param packed The packed function
    */
-  explicit TypedPackedFunc(PackedFunc packed)
-      : packed_(packed) {
-  }
+  inline explicit TypedPackedFunc(PackedFunc packed);
   /*!
    * \brief construct from a lambda function with the same signature.
    *
@@ -925,6 +923,8 @@ inline PackedFunc::FType PackedFunc::body() const {
   return body_;
 }
 
+
+
 // internal namespace
 namespace detail {
 
@@ -1131,6 +1131,10 @@ struct typed_packed_call_dispatcher<void> {
 };
 }  // namespace detail
 
+template<typename R, typename ...Args>
+TypedPackedFunc<R(Args...)>::TypedPackedFunc(PackedFunc packed)
+  : packed_(packed) {}
+
 template<typename R, typename ...Args>
 template<typename FType>
 inline void TypedPackedFunc<R(Args...)>::AssignTypedLambda(FType flambda) {

From 62d34ca5746fc4e7e07ce14c811ead2c2c221723 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Fri, 24 Aug 2018 07:57:28 +0900
Subject: [PATCH 60/77] [NNVM][KERAS] Support multiple outputs (#1648)

---
 nnvm/python/nnvm/frontend/keras.py            | 10 ++++-----
 .../python/frontend/keras/test_forward.py     | 21 ++++++++++++++++---
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py
index 3a0a25aa4979..bb2ad783000c 100644
--- a/nnvm/python/nnvm/frontend/keras.py
+++ b/nnvm/python/nnvm/frontend/keras.py
@@ -532,15 +532,15 @@ def from_keras(model):
                 # they are named uniquely to input_1, input_2, input_3 ... by default.
                 for pred_idx, pred in zip(node.node_indices, node.inbound_layers):
                     if isinstance(pred, keras.engine.InputLayer):
-                        _sym = symtab.get_var(pred.name, must_contain=True)
+                        sym = symtab.get_var(pred.name, must_contain=True)
                     else:
-                        _sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
-                    insym.append(_sym)
+                        sym = symtab.get_var(pred.name + ':' + str(pred_idx), must_contain=True)
+                    insym.append(sym)
 
                 if len(insym) == 1:
                     insym = insym[0]
                 keras_op_to_nnvm(insym, keras_layer, keras_layer.name + ':' + str(my_idx), symtab)
 
-    outsym = symtab.get_var(model._output_layers[0].name + ':0')
+    outsym = [symtab.get_var(layer.name + ':0') for layer in model._output_layers]
     tvmparams = {k:tvm.nd.array(np.array(v, dtype=np.float32)) for k, v in symtab.params.items()}
-    return outsym, tvmparams
+    return _sym.Group(outsym), tvmparams
diff --git a/nnvm/tests/python/frontend/keras/test_forward.py b/nnvm/tests/python/frontend/keras/test_forward.py
index 3e80c74399cc..c8c9b2c784e8 100644
--- a/nnvm/tests/python/frontend/keras/test_forward.py
+++ b/nnvm/tests/python/frontend/keras/test_forward.py
@@ -20,7 +20,9 @@ def verify_keras_frontend(keras_model):
     in_shapes = []
     for layer in keras_model._input_layers:
         in_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.input.shape))
-    out_shape = [dim.value if dim.value is not None else 1 for dim in keras_model._output_layers[0].output.shape]
+    out_shapes = []
+    for layer in keras_model._output_layers:
+        out_shapes.append(tuple(dim.value if dim.value is not None else 1 for dim in layer.output.shape))
 
     def get_keras_output(xs, dtype='float32'):
         return keras_model.predict(xs)
@@ -35,8 +37,10 @@ def get_tvm_output(xs, target, ctx, dtype='float32'):
             m.set_input(name, tvm.nd.array(x.astype(dtype)))
         m.set_input(**params)
         m.run()
-        out = m.get_output(0, tvm.nd.empty(out_shape, dtype))
-        return out.asnumpy()
+
+        out = [m.get_output(i, tvm.nd.empty(shape, dtype)).asnumpy()
+                   for i, shape in enumerate(out_shapes)]
+        return out if len(out) > 1 else out[0]
 
     xs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes]
     keras_out = get_keras_output(xs)
@@ -192,6 +196,16 @@ def test_forward_multi_inputs():
     verify_keras_frontend(keras_model)
 
 
+def test_forward_multi_outputs():
+    data = keras.layers.Input(shape=(32,32,3))
+    x = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    x = keras.layers.GlobalAveragePooling2D()(x)
+    y = keras.layers.Conv2D(8, (3, 3), padding="same")(data)
+    y = keras.layers.GlobalAveragePooling2D()(y)
+    keras_model = keras.models.Model(data, [x, y])
+    verify_keras_frontend(keras_model)
+
+
 def test_forward_reuse_layers():
     # reuse conv2d
     data = keras.layers.Input(shape=(32,32,3))
@@ -230,4 +244,5 @@ def test_forward_reuse_layers():
     test_forward_mobilenet()
 
     test_forward_multi_inputs()
+    test_forward_multi_outputs()
     test_forward_reuse_layers()

From 6eecec92d1f99b6e1229b328ad3bc82d2bdfe7e5 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 16:28:46 -0700
Subject: [PATCH 61/77] [PYTHON] Enable constructors in Node (#1647)

---
 python/tvm/_ffi/_ctypes/function.py           |  19 +
 python/tvm/_ffi/_ctypes/node.py               |  25 +-
 python/tvm/_ffi/_cython/function.pxi          |  44 +-
 python/tvm/_ffi/_cython/node.pxi              |  23 +
 python/tvm/_ffi/function.py                   |  18 +-
 python/tvm/api.py                             |  16 +-
 python/tvm/expr.py                            | 478 ++++++++++++++++--
 python/tvm/make.py                            |  43 +-
 python/tvm/stmt.py                            | 337 +++++++++++-
 src/api/api_ir.cc                             |   1 +
 .../python/unittest/test_lang_constructor.py  | 202 ++++++++
 11 files changed, 1084 insertions(+), 122 deletions(-)
 create mode 100644 tests/python/unittest/test_lang_constructor.py

diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py
index 79f3c6033a1f..61679f0018c0 100644
--- a/python/tvm/_ffi/_ctypes/function.py
+++ b/python/tvm/_ffi/_ctypes/function.py
@@ -17,6 +17,7 @@
 from .types import TVMPackedCFunc, TVMCFuncFinalizer
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
 from .node import NodeBase
+from . import node as _node
 
 FunctionHandle = ctypes.c_void_p
 ModuleHandle = ctypes.c_void_p
@@ -186,6 +187,23 @@ def __call__(self, *args):
         _ = args
         return RETURN_SWITCH[ret_tcode.value](ret_val)
 
+
+def __init_handle_by_constructor__(fconstructor, args):
+    """Initialize handle by constructor"""
+    temp_args = []
+    values, tcodes, num_args = _make_tvm_args(args, temp_args)
+    ret_val = TVMValue()
+    ret_tcode = ctypes.c_int()
+    check_call(_LIB.TVMFuncCall(
+        fconstructor.handle, values, tcodes, ctypes.c_int(num_args),
+        ctypes.byref(ret_val), ctypes.byref(ret_tcode)))
+    _ = temp_args
+    _ = args
+    assert ret_tcode.value == TypeCode.NODE_HANDLE
+    handle = ret_val.v_handle
+    return handle
+
+
 def _return_module(x):
     """Return function"""
     handle = x.v_handle
@@ -202,6 +220,7 @@ def _handle_return_func(x):
 
 
 # setup return handle for function type
+_node.__init_by_constructor__ = __init_handle_by_constructor__
 RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func
 RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module
 RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False)
diff --git a/python/tvm/_ffi/_ctypes/node.py b/python/tvm/_ffi/_ctypes/node.py
index 925aa93f8f96..eb9e930b30eb 100644
--- a/python/tvm/_ffi/_ctypes/node.py
+++ b/python/tvm/_ffi/_ctypes/node.py
@@ -1,5 +1,5 @@
 # pylint: disable=invalid-name, protected-access
-# pylint: disable=no-member, missing-docstring
+# pylint: disable=no-member, missing-docstring, not-callable
 from __future__ import absolute_import
 
 import ctypes
@@ -9,6 +9,7 @@
 from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func
 
 NodeHandle = ctypes.c_void_p
+__init_by_constructor__ = None
 
 """Maps node type to its constructor"""
 NODE_TYPE = {}
@@ -58,4 +59,26 @@ def __getattr__(self, name):
                 "'%s' object has no attribute '%s'" % (str(type(self)), name))
         return RETURN_SWITCH[ret_type_code.value](ret_val)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        handle = __init_by_constructor__(fconstructor, args)
+        if not isinstance(handle, NodeHandle):
+            handle = NodeHandle(handle)
+        self.handle = handle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi
index 989f5b8e7b47..dcbf4c665e66 100644
--- a/python/tvm/_ffi/_cython/function.pxi
+++ b/python/tvm/_ffi/_cython/function.pxi
@@ -196,37 +196,54 @@ cdef inline object make_ret(TVMValue value, int tcode):
     raise ValueError("Unhandled type code %d" % tcode)
 
 
-cdef inline object FuncCall3(void* chandle, tuple args, int nargs):
+cdef inline int FuncCall3(void* chandle,
+                          tuple args,
+                          int nargs,
+                          TVMValue* ret_val,
+                          int* ret_tcode) except -1:
     cdef TVMValue[3] values
     cdef int[3] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     nargs = len(args)
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
 
-cdef inline object FuncCall(void* chandle, tuple args):
+cdef inline int FuncCall(void* chandle,
+                         tuple args,
+                         TVMValue* ret_val,
+                         int* ret_tcode) except -1:
     cdef int nargs
     nargs = len(args)
     if nargs <= 3:
-        return FuncCall3(chandle, args, nargs)
+        FuncCall3(chandle, args, nargs, ret_val, ret_tcode)
+        return 0
 
     cdef vector[TVMValue] values
     cdef vector[int] tcodes
-    cdef TVMValue ret_val
-    cdef int ret_code
     values.resize(max(nargs, 1))
     tcodes.resize(max(nargs, 1))
     temp_args = []
     for i in range(nargs):
         make_arg(args[i], &values[i], &tcodes[i], temp_args)
     CALL(TVMFuncCall(chandle, &values[0], &tcodes[0],
-                     nargs, &ret_val, &ret_code))
-    return make_ret(ret_val, ret_code)
+                     nargs, ret_val, ret_tcode))
+    return 0
+
+
+cdef inline int ConstructorCall(void* constructor_handle,
+                                int type_code,
+                                tuple args,
+                                void** handle) except -1:
+    """Call constructor of a handle function"""
+    cdef TVMValue ret_val
+    cdef int ret_tcode
+    FuncCall(constructor_handle, args, &ret_val, &ret_tcode)
+    assert ret_tcode == type_code
+    handle[0] = ret_val.v_handle
+    return 0
 
 
 cdef class FunctionBase:
@@ -264,7 +281,10 @@ cdef class FunctionBase:
             CALL(TVMFuncFree(self.chandle))
 
     def __call__(self, *args):
-        return FuncCall(self.chandle, args)
+        cdef TVMValue ret_val
+        cdef int ret_tcode
+        FuncCall(self.chandle, args, &ret_val, &ret_tcode)
+        return make_ret(ret_val, ret_tcode)
 
 _CLASS_FUNCTION = None
 _CLASS_MODULE = None
diff --git a/python/tvm/_ffi/_cython/node.pxi b/python/tvm/_ffi/_cython/node.pxi
index 1ced48878803..c62e4ab44cef 100644
--- a/python/tvm/_ffi/_cython/node.pxi
+++ b/python/tvm/_ffi/_cython/node.pxi
@@ -65,4 +65,27 @@ cdef class NodeBase:
                 "'%s' object has no attribute '%s'" % (type(self), name))
         return make_ret(ret_val, ret_type_code)
 
+    def __init_handle_by_constructor__(self, fconstructor, *args):
+        """Initialize the handle by calling constructor function.
+
+        Parameters
+        ----------
+        fconstructor : Function
+            Constructor function.
+
+        args: list of objects
+            The arguments to the constructor
+
+        Note
+        ----
+        We have a special calling convention to call constructor functions.
+        So the return handle is directly set into the Node object
+        instead of creating a new Node.
+        """
+        cdef void* chandle
+        ConstructorCall(
+            (<FunctionBase>fconstructor).chandle,
+            kNodeHandle, args, &chandle)
+        self.chandle = chandle
+
 _set_class_node_base(NodeBase)
diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py
index cfda2a35f9b9..ca1812d4109a 100644
--- a/python/tvm/_ffi/function.py
+++ b/python/tvm/_ffi/function.py
@@ -262,23 +262,7 @@ def _list(name, func):
 def _get_api(f):
     flocal = f
     flocal.is_global = True
-    def my_api_func(*args):
-        """
-
-        This is a type erased API that calls into Global PackedFunc.
-        These APIs corresponds to functions registered from C++ backend
-        and can be used as developer functions.
-
-        args : list
-          The positional arguments to the function call.
-
-        Returns
-        -------
-        value : int, float, None, Node or Function
-        The result of the API function call.
-        """
-        return flocal(*args)
-    return my_api_func
+    return flocal
 
 def _init_api(namespace, target_module_name=None):
     """Initialize api for a given module name
diff --git a/python/tvm/api.py b/python/tvm/api.py
index 75debc33db66..2bcb003ee7e5 100644
--- a/python/tvm/api.py
+++ b/python/tvm/api.py
@@ -134,9 +134,9 @@ def any(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.Or(args[0], args[1])
+    ret = _expr.Or(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.Or(ret, args[i])
+        ret = _expr.Or(ret, args[i])
     return ret
 
 
@@ -158,9 +158,9 @@ def all(*args):
         raise ValueError("Any must take at least 1 argument")
     if len(args) == 1:
         return args[0]
-    ret = _make.And(args[0], args[1])
+    ret = _expr.And(args[0], args[1])
     for i in range(2, len(args)):
-        ret = _make.And(ret, args[i])
+        ret = _expr.And(ret, args[i])
     return ret
 
 
@@ -616,7 +616,7 @@ def select(cond, t, f):
     node : Node
         The tvm.expr.Select node
     """
-    return _make.Select(convert(cond), convert(t), convert(f))
+    return _expr.Select(convert(cond), convert(t), convert(f))
 
 
 def comm_reducer(fcombine, fidentity, name="reduce"):
@@ -699,7 +699,7 @@ def _make_reduce(expr, axis, where=None):
         axis = convert(axis if isinstance(axis, (list, tuple)) else [axis])
         if where is None:
             where = convert(True)
-        outputs = tuple(_make.Reduce(combiner, expr, axis, where, i)
+        outputs = tuple(_expr.Reduce(combiner, expr, axis, where, i)
                         for i in range(size))
         return outputs[0] if size == 1 else outputs
 
@@ -751,5 +751,5 @@ def reducer(expr, axis, where=None, *args):
 _init_api("tvm.api")
 #pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x+y, lambda t: const(0, dtype=t), name="sum")
-min = comm_reducer(lambda x, y: _make.Min(x, y), max_value, name='min')
-max = comm_reducer(lambda x, y: _make.Max(x, y), min_value, name='max')
+min = comm_reducer(lambda x, y: _expr.Min(x, y), max_value, name='min')
+max = comm_reducer(lambda x, y: _expr.Max(x, y), min_value, name='max')
diff --git a/python/tvm/expr.py b/python/tvm/expr.py
index 8bf46b7eee62..1c1c9f82cb97 100644
--- a/python/tvm/expr.py
+++ b/python/tvm/expr.py
@@ -225,127 +225,545 @@ class LogicalExpr(Expr):
 
 @register_node("Variable")
 class Var(Expr):
-    """Symbolic variable."""
-    pass
+    """Symbolic variable.
+
+    Parameters
+    ----------
+    name : str
+        The name
+
+    dtype : str
+        The data type
+    """
+    def __init__(self, name, dtype):
+        self.__init_handle_by_constructor__(
+            _api_internal._Var, name, dtype)
+
 
 @register_node
 class Reduce(Expr):
-    pass
+    """Reduce node.
+
+    Parameters
+    ----------
+    combiner : CommReducer
+        The combiner.
+
+    src : list of Expr
+        The source expression.
+
+    rdom : list of IterVar
+        The iteration domain
+
+    condition : Expr
+        The reduce condition.
+
+    value_index : int
+        The value index.
+    """
+    def __init__(self, combiner, src, rdom, condition, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Reduce, combiner, src, rdom,
+            condition, value_index)
+
 
 @register_node
 class FloatImm(ConstExpr):
-    pass
+    """Float constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : float
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.FloatImm, dtype, value)
 
 @register_node
 class IntImm(ConstExpr):
-    pass
+    """Int constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.IntImm, dtype, value)
+
 
 @register_node
 class UIntImm(ConstExpr):
-    pass
+    """UInt constant.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : int
+        The constant value.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.UIntImm, dtype, value)
+
 
 @register_node
 class StringImm(ConstExpr):
-    pass
+    """String constant.
+
+    Parameters
+    ----------
+    value : str
+        The constant value.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.StringImm, value)
+
 
 @register_node
 class Cast(Expr):
-    pass
+    """Cast expression.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type
+
+    value : Expr
+        The value to be cast.
+    """
+    def __init__(self, dtype, value):
+        self.__init_handle_by_constructor__(
+            _make.Cast, dtype, value)
+
 
 @register_node
 class Add(BinaryOpExpr):
-    pass
+    """Add node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Add, a, b)
+
 
 @register_node
 class Sub(BinaryOpExpr):
-    pass
+    """Sub node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Sub, a, b)
+
 
 @register_node
 class Mul(BinaryOpExpr):
-    pass
+    """Mul node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mul, a, b)
+
 
 @register_node
 class Div(BinaryOpExpr):
-    pass
+    """Div node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Div, a, b)
+
 
 @register_node
 class Mod(BinaryOpExpr):
-    pass
+    """Mod node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Mod, a, b)
+
 
 @register_node
 class Min(BinaryOpExpr):
-    pass
+    """Min node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Min, a, b)
+
 
 @register_node
 class Max(BinaryOpExpr):
-    pass
+    """Max node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Max, a, b)
+
 
 @register_node
 class EQ(CmpExpr):
-    pass
+    """EQ node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.EQ, a, b)
+
 
 @register_node
 class NE(CmpExpr):
-    pass
+    """NE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.NE, a, b)
+
 
 @register_node
 class LT(CmpExpr):
-    pass
+    """LT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LT, a, b)
+
 
 @register_node
 class LE(CmpExpr):
-    pass
+    """LE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.LE, a, b)
+
 
 @register_node
 class GT(CmpExpr):
-    pass
+    """GT node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GT, a, b)
+
 
 @register_node
 class GE(CmpExpr):
-    pass
+    """GE node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.GE, a, b)
+
 
 @register_node
 class And(LogicalExpr):
-    pass
+    """And node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.And, a, b)
+
 
 @register_node
 class Or(LogicalExpr):
-    pass
+    """Or node.
+
+    Parameters
+    ----------
+    a : Expr
+        The left hand operand.
+
+    b : Expr
+        The right hand operand.
+    """
+    def __init__(self, a, b):
+        self.__init_handle_by_constructor__(
+            _make.Or, a, b)
+
 
 @register_node
 class Not(LogicalExpr):
-    pass
+    """Not node.
+
+    Parameters
+    ----------
+    a : Expr
+        The input value
+    """
+    def __init__(self, a):
+        self.__init_handle_by_constructor__(
+            _make.Not, a)
+
 
 @register_node
 class Select(Expr):
-    pass
+    """Select node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The condition expression.
+
+    true_value : Expr
+        The value to take when condition is true.
+
+    false_value : Expr
+        The value to take when condition is false.
+    """
+    def __init__(self, condition, true_value, false_value):
+        self.__init_handle_by_constructor__(
+            _make.Select, condition, true_value, false_value)
+
 
 @register_node
 class Load(Expr):
-    pass
+    """Load node.
+
+    Parameters
+    ----------
+    dtype : str
+        The data type.
+
+    buffer_var : Var
+        The buffer variable in the load expression.
+
+    index : Expr
+        The index in the load.
+
+    predicate : Expr
+        The load predicate.
+    """
+    def __init__(self, dtype, buffer_var, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Load, dtype, buffer_var, index, predicate)
+
 
 @register_node
 class Ramp(Expr):
-    pass
+    """Ramp node.
+
+    Parameters
+    ----------
+    base : Expr
+        The base expression.
+
+    stride : Expr
+        The stride of the ramp.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, base, stride, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Ramp, base, stride, lanes)
+
 
 @register_node
 class Broadcast(Expr):
-    pass
+    """Broadcast node.
+
+    Parameters
+    ----------
+    value : Expr
+        The value of the expression.
+
+    lanes : int
+        The lanes of the expression.
+    """
+    def __init__(self, value, lanes):
+        self.__init_handle_by_constructor__(
+            _make.Broadcast, value, lanes)
+
 
 @register_node
 class Shuffle(Expr):
-    pass
+    """Shuffle node.
+
+    Parameters
+    ----------
+    vectors : Array of Expr
+        The vectors
+
+    indices : Array of indices
+        The indices
+    """
+    def __init__(self, vectors, indices):
+        self.__init_handle_by_constructor__(
+            _make.Shuffle, vectors, indices)
+
 
 @register_node
 class Call(Expr):
+    """Call node.
+
+    Parameters
+    ----------
+    dtype : str
+        The return data type
+
+    name : str
+        The name of the function
+
+    args : list of Expr
+        The input arguments to the call
+
+    call_type : int
+        The type of the call
+
+    func : Operation, optional
+        Operation if call_type is Halide
+
+    value_index : int
+        The output value index
+    """
     Extern = 0
     ExternCPlusPlus = 1
     PureExtern = 2
     Halide = 3
     Intrinsic = 4
     PureIntrinsic = 5
+    def __init__(self, dtype, name, args, call_type, func, value_index):
+        self.__init_handle_by_constructor__(
+            _make.Call, dtype, name, args, call_type, func, value_index)
 
 
 @register_node
 class Let(Expr):
-    pass
+    """Let node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value to be bound.
+
+    body : Expr
+        The body expression.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.Let, var, value, body)
diff --git a/python/tvm/make.py b/python/tvm/make.py
index 19949509778b..6238fd7f1789 100644
--- a/python/tvm/make.py
+++ b/python/tvm/make.py
@@ -6,9 +6,10 @@
 Each api is a PackedFunc that can be called in a positional argument manner.
 You can use make function to build the IR node.
 """
+from __future__ import absolute_import as _abs
 from ._ffi.function import _init_api
 from ._ffi.runtime_ctypes import TVMType
-from . import stmt as _stmt
+
 
 def range_by_min_extent(min_value, extent):
     """Construct a Range by min and extent.
@@ -98,44 +99,4 @@ def node(type_key, **kwargs):
     return _Node(*args)
 
 
-def stmt_seq(*args):
-    """Make sequence of statements
-
-    Parameters
-    ----------
-    args : list of Expr or Var
-        List of statements to be combined as sequence.
-
-    Returns
-    -------
-    stmt : Stmt
-        The combined statement.
-    """
-    ret = None
-    for value in args:
-        if not isinstance(value, _stmt.Stmt):
-            value = Evaluate(value)
-        ret = value if ret is None else Block(ret, value)
-    return ret if ret else Evaluate(0)
-
-
-def stmt_list(stmt):
-    """Make list of stmt from blocks.
-
-    Parameters
-    ----------
-    stmt : A block statement
-
-    Returns
-    -------
-    stmt_list : list of Stmt
-         The unpacked list of statements
-    """
-    if isinstance(stmt, _stmt.Block):
-        return stmt_list(stmt.first) + stmt_list(stmt.rest)
-    elif isinstance(stmt, _stmt.ProducerConsumer):
-        return stmt_list(stmt.body)
-    return [stmt]
-
-
 _init_api("tvm.make")
diff --git a/python/tvm/stmt.py b/python/tvm/stmt.py
index 1f5fea11a472..48d91dfa8044 100644
--- a/python/tvm/stmt.py
+++ b/python/tvm/stmt.py
@@ -15,65 +15,376 @@
 """
 from __future__ import absolute_import as _abs
 from ._ffi.node import NodeBase, register_node
+from . import make as _make
+
 
 class Stmt(NodeBase):
     pass
 
 @register_node
 class LetStmt(Stmt):
-    pass
+    """LetStmt node.
+
+    Parameters
+    ----------
+    var : Var
+        The variable in the binding.
+
+    value : Expr
+        The value to be bound.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, var, value, body):
+        self.__init_handle_by_constructor__(
+            _make.LetStmt, var, value, body)
+
 
 @register_node
 class AssertStmt(Stmt):
-    pass
+    """AssertStmt node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The assert condition.
+
+    message : Expr
+        The error message.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, condition, message, body):
+        self.__init_handle_by_constructor__(
+            _make.AssertStmt, condition, message, body)
+
 
 @register_node
 class ProducerConsumer(Stmt):
-    pass
+    """ProducerConsumer node.
+
+    Parameters
+    ----------
+    func : Operation
+        The Operation.
+
+    is_producer : bool
+        Whether the node is a producer.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, func, is_producer, body):
+        self.__init_handle_by_constructor__(
+            _make.ProducerConsumer, func, is_producer, body)
+
 
 @register_node
 class For(Stmt):
+    """For node.
+
+    Parameters
+    ----------
+    loop_var : Var
+        The loop variable.
+
+    min_val : Expr
+        The beginning value.
+
+    extent : Expr
+        The length of the loop.
+
+    for_type : int
+        The for type.
+
+    device_api : int
+        The device api type.
+
+    body : Stmt
+        The body statement.
+    """
     Serial = 0
     Parallel = 1
     Vectorized = 2
     Unrolled = 3
+    def __init__(self,
+                 loop_var,
+                 min_val,
+                 extent,
+                 for_type,
+                 device_api,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.For, loop_var, min_val, extent,
+            for_type, device_api, body)
+
 
 @register_node
 class Store(Stmt):
-    pass
+    """Store node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer Variable.
+
+    value : Expr
+        The value we want to store.
+
+    index : Expr
+        The index in the store expression.
+
+    predicate : Expr
+        The store predicate.
+    """
+    def __init__(self, buffer_var, value, index, predicate):
+        self.__init_handle_by_constructor__(
+            _make.Store, buffer_var, value, index, predicate)
+
 
 @register_node
 class Provide(Stmt):
-    pass
+    """Provide node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    value : Expr
+        The value to be stored.
+
+    args : list of Expr
+        The index arguments of the Provide.
+    """
+    def __init__(self, func, value_index, value, args):
+        self.__init_handle_by_constructor__(
+            _make.Provide, func, value_index, value, args)
+
 
 @register_node
 class Allocate(Stmt):
-    pass
+    """Allocate node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+
+    dtype : str
+        The data type of the buffer.
+
+    extents : list of Expr
+        The extents of the allocate
+
+    condition : Expr
+        The condition.
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self,
+                 buffer_var,
+                 dtype,
+                 extents,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Allocate, buffer_var, dtype,
+            extents, condition, body)
+
 
 @register_node
 class AttrStmt(Stmt):
-    pass
+    """AttrStmt node.
+
+    Parameters
+    ----------
+    node : Node
+        The node to annotate the attribute
+
+    attr_key : str
+        Attribute type key.
+
+    value : Expr
+        The value of the attribute
+
+    body : Stmt
+        The body statement.
+    """
+    def __init__(self, node, attr_key, value, body):
+        self.__init_handle_by_constructor__(
+            _make.AttrStmt, node, attr_key, value, body)
+
 
 @register_node
 class Free(Stmt):
-    pass
+    """Free node.
+
+    Parameters
+    ----------
+    buffer_var : Var
+        The buffer variable.
+    """
+    def __init__(self, buffer_var):
+        self.__init_handle_by_constructor__(
+            _make.Free, buffer_var)
+
 
 @register_node
 class Realize(Stmt):
-    pass
+    """Realize node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type of the operation.
+
+    bounds : list of Range
+        The bounds of the realize.
+
+    condition : Expr
+        The realize condition.
+
+    body : Stmt
+        The realize body
+    """
+    def __init__(self,
+                 func,
+                 value_index,
+                 dtype,
+                 bounds,
+                 condition,
+                 body):
+        self.__init_handle_by_constructor__(
+            _make.Realize, func, value_index, dtype,
+            bounds, condition, body)
+
 
 @register_node
 class Block(Stmt):
-    pass
+    """Block node.
+
+    Parameters
+    ----------
+    first : Stmt
+        The first statement.
+
+    rest : Stmt
+        The following statement.
+    """
+    def __init__(self, first, rest):
+        self.__init_handle_by_constructor__(
+            _make.Block, first, rest)
+
 
 @register_node
 class IfThenElse(Stmt):
-    pass
+    """IfThenElse node.
+
+    Parameters
+    ----------
+    condition : Expr
+        The condition expression.
+
+    then_case : Stmt
+        The statement to execute if condition is true.
+
+    else_case : Stmt
+        The statement to execute if condition is false.
+    """
+    def __init__(self, condition, then_case, else_case):
+        self.__init_handle_by_constructor__(
+            _make.IfThenElse, condition, then_case, else_case)
+
 
 @register_node
 class Evaluate(Stmt):
-    pass
+    """Evaluate node.
+
+    Parameters
+    ----------
+    value : Expr
+        The expression to be evaluated.
+    """
+    def __init__(self, value):
+        self.__init_handle_by_constructor__(
+            _make.Evaluate, value)
+
 
 @register_node
 class Prefetch(Stmt):
-    pass
+    """Prefetch node.
+
+    Parameters
+    ----------
+    func : Operation
+        The operation to create the function.
+
+    value_index : int
+        The output value index
+
+    dtype : str
+        The data type to be prefetched.
+
+    bounds : list of Range
+        The bounds to be prefetched.
+    """
+    def __init__(self, func, value_index, dtype, bounds):
+        self.__init_handle_by_constructor__(
+            _make.Prefetch, func, value_index, dtype, bounds)
+
+
+def stmt_seq(*args):
+    """Make sequence of statements
+
+    Parameters
+    ----------
+    args : list of Expr or Var
+        List of statements to be combined as sequence.
+
+    Returns
+    -------
+    stmt : Stmt
+        The combined statement.
+    """
+    ret = None
+    for value in args:
+        if not isinstance(value, Stmt):
+            value = Evaluate(value)
+        ret = value if ret is None else Block(ret, value)
+    return ret if ret else Evaluate(0)
+
+
+def stmt_list(stmt):
+    """Make list of stmt from blocks.
+
+    Parameters
+    ----------
+    stmt : A block statement
+
+    Returns
+    -------
+    stmt_list : list of Stmt
+         The unpacked list of statements
+    """
+    if isinstance(stmt, Block):
+        return stmt_list(stmt.first) + stmt_list(stmt.rest)
+    elif isinstance(stmt, ProducerConsumer):
+        return stmt_list(stmt.body)
+    return [stmt]
+
+
+_make.stmt_list = stmt_list
+_make.stmt_seq = stmt_seq
diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc
index bc9293c20b7a..8a65260a0f58 100644
--- a/src/api/api_ir.cc
+++ b/src/api/api_ir.cc
@@ -170,6 +170,7 @@ REGISTER_MAKE3(Select);
 REGISTER_MAKE3(Ramp);
 REGISTER_MAKE2(Cast);
 REGISTER_MAKE2(Broadcast);
+REGISTER_MAKE2(Shuffle);
 REGISTER_MAKE3(Let);
 REGISTER_MAKE3(LetStmt);
 REGISTER_MAKE3(AssertStmt);
diff --git a/tests/python/unittest/test_lang_constructor.py b/tests/python/unittest/test_lang_constructor.py
new file mode 100644
index 000000000000..caca08afa804
--- /dev/null
+++ b/tests/python/unittest/test_lang_constructor.py
@@ -0,0 +1,202 @@
+import tvm
+
+def test_expr_constructor():
+    x = tvm.expr.Var("xx", "float32")
+    assert isinstance(x, tvm.expr.Var)
+    assert x.name == "xx"
+
+    x = tvm.expr.Reduce(None, [1],
+                        [tvm.api._IterVar((0, 1), "x", 2)],
+                        None, 0)
+    assert isinstance(x, tvm.expr.Reduce)
+    assert x.combiner == None
+    assert x.value_index == 0
+
+    x = tvm.expr.FloatImm("float32", 1.0)
+    assert isinstance(x, tvm.expr.FloatImm)
+    assert x.value == 1.0
+    assert x.dtype == "float32"
+
+    x = tvm.expr.IntImm("int64", 2)
+    assert isinstance(x, tvm.expr.IntImm)
+    assert x.value == 2
+    assert x.dtype == "int64"
+
+    x = tvm.expr.UIntImm("uint16", 2)
+    assert isinstance(x, tvm.expr.UIntImm)
+    assert x.value == 2
+    assert x.dtype == "uint16"
+
+    x = tvm.expr.StringImm("xyza")
+    assert isinstance(x, tvm.expr.StringImm)
+    assert x.value == "xyza"
+
+    x = tvm.expr.Cast("float32", tvm.expr.IntImm("int32", 1))
+    assert isinstance(x, tvm.expr.Cast)
+    assert x.dtype == "float32"
+    assert x.value.value == 1
+
+    a = tvm.const(1.0, dtype="float32")
+    b = tvm.var("x", dtype="float32")
+
+    for cls in [tvm.expr.Add,
+                tvm.expr.Sub,
+                tvm.expr.Mul,
+                tvm.expr.Div,
+                tvm.expr.Mod,
+                tvm.expr.Min,
+                tvm.expr.Max,
+                tvm.expr.LT,
+                tvm.expr.LE,
+                tvm.expr.GT,
+                tvm.expr.GE]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+
+    a = tvm.convert(tvm.var("x") > 1)
+    b = tvm.convert(tvm.var("x") == 1)
+
+    for cls in [tvm.expr.And,
+                tvm.expr.Or]:
+        x = cls(a, b)
+        assert isinstance(x, cls)
+        assert x.a == a
+        assert x.b.same_as(b)
+
+    x = tvm.expr.Not(a)
+    assert isinstance(x, tvm.expr.Not)
+    assert x.a == a
+
+    x = tvm.expr.Select(a, a, b)
+    assert isinstance(x, tvm.expr.Select)
+    assert x.true_value == a
+    assert x.false_value == b
+    assert x.condition == a
+
+    buffer_var = tvm.var("x", dtype="handle")
+    x = tvm.expr.Load("float32", buffer_var, 1, a)
+    assert isinstance(x, tvm.expr.Load)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 1
+    assert x.predicate == a
+
+    x = tvm.expr.Ramp(1, 2, 10)
+    assert isinstance(x, tvm.expr.Ramp)
+    assert x.base.value == 1
+    assert x.stride.value == 2
+    assert x.lanes == 10
+
+    x = tvm.expr.Broadcast(a, 10)
+    assert isinstance(x, tvm.expr.Broadcast)
+    assert x.value == a
+    assert x.lanes == 10
+
+    x = tvm.expr.Shuffle([a], [0])
+    assert isinstance(x, tvm.expr.Shuffle)
+    assert x.vectors[0] == a
+    assert x.indices[0].value == 0
+
+    x = tvm.expr.Call("float32", "xyz", [a], tvm.expr.Call.Extern, None, 0)
+    assert isinstance(x, tvm.expr.Call)
+    assert x.dtype == "float32"
+    assert x.name == "xyz"
+    assert x.args[0] == a
+    assert x.call_type == tvm.expr.Call.Extern
+    assert x.func == None
+    assert x.value_index == 0
+
+    v = tvm.var("aa")
+    x = tvm.expr.Let(v, 1, v)
+    assert x.var == v
+    assert x.value.value == 1
+    assert x.body == v
+
+
+def test_stmt_constructor():
+    v = tvm.var("aa")
+    buffer_var = tvm.var("buf", dtype="handle")
+    nop = tvm.stmt.Evaluate(1)
+    x = tvm.stmt.LetStmt(v, 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.LetStmt)
+    assert x.var == v
+    assert x.value.value == 1
+    assert isinstance(x.body, tvm.stmt.Evaluate)
+
+    x = tvm.stmt.AttrStmt(v == 1, "xx", 1, tvm.stmt.Evaluate(1))
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.value.value == 1
+
+    x = tvm.stmt.Block(tvm.stmt.Evaluate(11),
+                       nop)
+    assert isinstance(x, tvm.stmt.Block)
+    assert x.first.value.value == 11
+    assert x.rest == nop
+
+    x = tvm.stmt.AssertStmt(tvm.const(1, "uint1"),
+                            tvm.convert("hellow"),
+                            nop)
+    assert isinstance(x, tvm.stmt.AssertStmt)
+    assert x.body == nop
+
+    x = tvm.stmt.ProducerConsumer(None, True, nop)
+    assert isinstance(x, tvm.stmt.ProducerConsumer)
+    assert x.body == nop
+
+    x = tvm.stmt.For(tvm.var("x"), 0, 10, 0, 0, nop)
+    assert isinstance(x, tvm.stmt.For)
+    assert x.min.value == 0
+    assert x.extent.value == 10
+    assert x.body == nop
+
+    x = tvm.stmt.Store(buffer_var, 1, 10, tvm.const(1, "uint1"))
+    assert isinstance(x, tvm.stmt.Store)
+    assert x.buffer_var == buffer_var
+    assert x.index.value == 10
+    assert x.value.value == 1
+
+    tensor = tvm.placeholder((), dtype="float32")
+    x = tvm.stmt.Provide(tensor.op, 0, 10, [])
+    assert isinstance(x, tvm.stmt.Provide)
+    assert x.value_index == 0
+    assert x.value.value == 10
+
+    x = tvm.stmt.Allocate(buffer_var, "float32", [10],
+                          tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Allocate)
+    assert x.dtype == "float32"
+    assert x.buffer_var == buffer_var
+    assert x.body == nop
+
+    x = tvm.stmt.AttrStmt(buffer_var, "xyz", 1, nop)
+    assert isinstance(x, tvm.stmt.AttrStmt)
+    assert x.node == buffer_var
+    assert x.attr_key == "xyz"
+    assert x.body == nop
+
+    x = tvm.stmt.Free(buffer_var)
+    assert isinstance(x, tvm.stmt.Free)
+    assert x.buffer_var == buffer_var
+
+    x = tvm.stmt.Realize(None, 0, "float", [], tvm.const(1, "uint1"), nop)
+    assert isinstance(x, tvm.stmt.Realize)
+    assert x.body == nop
+
+    x = tvm.stmt.IfThenElse(tvm.const(1, "uint1"),
+                            tvm.stmt.Evaluate(11),
+                            nop)
+    assert isinstance(x, tvm.stmt.IfThenElse)
+    assert x.then_case.value.value == 11
+    assert x.else_case == nop
+
+    x = tvm.stmt.Prefetch(None, 1, "float32", [])
+    assert isinstance(x, tvm.stmt.Prefetch)
+    assert x.value_index == 1
+
+
+if __name__ == "__main__":
+    test_expr_constructor()
+    test_stmt_constructor()

From 0edf87e8548c9e96aa82278afdb6cca93f33cad6 Mon Sep 17 00:00:00 2001
From: Sergei Grechanik <grechanik.sergey@huawei.com>
Date: Fri, 24 Aug 2018 02:49:36 +0300
Subject: [PATCH 62/77] [NNVM][TEST] Test against numerical grad (#1505)

* [NNVM][TEST] Numerical gradient testing

* [NNVM][TEST] Make some tests a little faster

* Fix the failing test_top_level3

* Target exclusion for the check_function

* Try to ignore singularities

* grad_input_vars now can't contain shapes

* Don't pass unnecessary grad_input_vars to check_function

* Multiple outputs; fixes; testing of check_function

* Use numerical_grads_params to pass parameters to numgrad checker

* Fail when no action is requested explicitly

* Pass additional params to functions

* Silence the linter issue

* Simplified numgrad checking

* Improved docs for check_function

* Fixed the error message when no dtype is provided

* Several fixes

* Tests with shape/dtype inference for inputs

* Don't check dense's grads on cuda

* Raise an error if output dtypes haven't been inferred

* Moved shape/dtype inference into a separate function; use float32 as fallback

* Remove redundant dtype=float32

* Fix multiple outputs

* Use check_function in the rest of the test_top_level1
---
 docs/api/python/nnvm/index.rst                |   1 +
 docs/api/python/nnvm/testing.rst              |  14 +
 docs/api/python/nnvm/testing_new_ops.rst      | 135 ++++
 nnvm/python/nnvm/testing/__init__.py          |   1 +
 nnvm/python/nnvm/testing/check_computation.py | 641 ++++++++++++++++++
 nnvm/tests/python/compiler/test_top_level1.py | 468 +++++++------
 nnvm/tests/python/compiler/test_top_level3.py |   9 +-
 nnvm/tests/python/compiler/test_top_level4.py | 153 ++---
 8 files changed, 1095 insertions(+), 327 deletions(-)
 create mode 100644 docs/api/python/nnvm/testing.rst
 create mode 100644 docs/api/python/nnvm/testing_new_ops.rst
 create mode 100644 nnvm/python/nnvm/testing/check_computation.py

diff --git a/docs/api/python/nnvm/index.rst b/docs/api/python/nnvm/index.rst
index c0e5912c76be..64447bb793fb 100644
--- a/docs/api/python/nnvm/index.rst
+++ b/docs/api/python/nnvm/index.rst
@@ -11,3 +11,4 @@ This document contains the python API to NNVM compiler toolchain.
    symbol
    graph
    top
+   testing
diff --git a/docs/api/python/nnvm/testing.rst b/docs/api/python/nnvm/testing.rst
new file mode 100644
index 000000000000..56783622648d
--- /dev/null
+++ b/docs/api/python/nnvm/testing.rst
@@ -0,0 +1,14 @@
+nnvm.testing
+------------
+
+.. automodule:: nnvm.testing
+
+.. autofunction:: nnvm.testing.ctx_list
+
+nnvm.testing.check_computation
+------------------------------
+
+.. automodule:: nnvm.testing.check_computation
+    :members:
+
+.. include:: testing_new_ops.rst
diff --git a/docs/api/python/nnvm/testing_new_ops.rst b/docs/api/python/nnvm/testing_new_ops.rst
new file mode 100644
index 000000000000..dfe7df485b78
--- /dev/null
+++ b/docs/api/python/nnvm/testing_new_ops.rst
@@ -0,0 +1,135 @@
+Testing new operations
+----------------------
+
+When adding new operations, it is a good idea to test them. Testing
+should be done with the function ``nnvm.testing.check_function``. You
+should provide it with the symbol representing the result of a
+computation and a reference numpy implementation. By default, it will
+also check analytical gradients against numerical gradients if
+analytical gradients are implemented for your operation. You can also
+pass a reference implementation for the gradients, but numerical
+gradients will still be checked. Numerical gradient checking may be
+switched off explicitly, but doing this is not a good idea generally.
+Here is an example testing the logarithm operation:
+
+.. code:: python
+
+    import numpy as np
+    import nnvm
+    import nnvm.symbol as sym
+    from nnvm.testing.check_computation import check_function
+
+    x = sym.Variable("x")
+    y = sym.log(x)
+
+    def forward(x):
+        return np.log(x)
+
+    def backward(head_grads, x):
+        return [1. / x * head_grads]
+
+    dtype = "float32"
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above, you might get an ``AssertionError`` in rare
+cases. That’s why it is recommended to run new tests a lot of times.
+
+.. code:: python
+
+    for _ in range(10000):
+        check_function(y, forward, backward, in_range=(0.001, 2.0), dtype=dtype, shape=shape)
+
+If you run the code above then sooner or later you will get an exception
+which may look like this:
+
+.. code-block:: text
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [
+            ...
+        ]
+    numerical grad = [
+            ...
+        ]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 308.50885009765625 > 0.01*55.42562584220407 + 0.1*2167.70703125
+
+It means that either you have a mistake in the ``FGradient`` function or
+the numerical error is too high. Generally, if you look at the printed
+gradients and see that they differ only slightly or just in a single
+position, then it is a numerical error. But if the gradients look
+completely different, especially if many corresponding positions have
+different signs, then it must be something wrong with the analytical
+gradient implementation.
+
+Then try to make this error reproducible, and also try to reduce the
+shape of inputs, but not too much, a vector of 10 elements is a
+reasonable choice. Also you won’t need reference functions ``forward``
+and ``backward``, and restricting the number of targets might also be a
+good idea. Since the error may manifest itself only in rare cases, you
+might want to run it in a loop.
+
+.. code:: python
+
+    shape = {'x': (10,)}
+    np.random.seed(42)
+
+    for _ in range(1000):
+        check_function(y, in_range=(0.001, 2.0), dtype=dtype, shape=shape,
+                       numerical_grads=True, only_targets=['llvm'])
+
+Running this code will result in the following:
+
+.. code-block:: text
+
+    check_function failed while checking gradients numerically, here is the main graph
+    Graph(%x, %head_grads_0) {
+      %x, shape=[10], dtype=0
+      %head_grads_0, shape=[10], dtype=0
+      %1 = log(%x), shape=[10], dtype=0
+      %3 = elemwise_div(%head_grads_0, %x), shape=[10], dtype=0
+      ret %1, %3, %head_grads_0
+    }
+    graph_attr_keys = [layout_inputs, dtype_num_unknown_nodes, dtype, shape_num_unknown_nodes, shape]
+
+    Generated inputs:
+    {'x': array([2.5660574e-01, 1.5313280e+00, 1.0232578e-03, 8.3371508e-01,
+           1.0454979e+00, 1.1021420e-01, 1.9461832e+00, 4.5302454e-01,
+           6.0909325e-01, 6.0858107e-01], dtype=float32), 'head_grads_0': array([0.4616029 , 0.00394617, 1.4589603 , 1.9337242 , 0.44936267,
+           1.3264314 , 1.4840508 , 1.6970023 , 0.84583575, 0.60655886],
+          dtype=float32)}
+
+    ...
+
+    AssertionError: Analytical and numerical grads wrt x differ too much
+    analytical grad = [1.7988799e+00 2.5769596e-03 1.4257993e+03 2.3194065e+00 4.2980734e-01
+     1.2035031e+01 7.6254421e-01 3.7459390e+00 1.3886802e+00 9.9667716e-01]
+     numerical grad = [1.7948151e+00 1.9073486e-03 9.9268610e+02 2.3174286e+00 4.2915344e-01
+     1.1980057e+01 7.6198578e-01 3.7412643e+00 1.3866425e+00 9.9563599e-01]
+    distance > atol*sqrt(n) + rtol*grad_norm
+    distance 433.11322021484375 > 0.01*3.1622776601683795 + 0.1*992.7716674804688
+
+In this case the largest difference is in the 2nd position (starting
+from 0) which corresponds to input value ``1.0232578e-03``. This value
+is too close to the singularity, so the numerical derivative gets too
+imprecise. The solution is to shrink the range for ``x``, here, for
+example, ``(0.002, 2.0)`` turned out to be enough. Don’t forget to run
+lots of tests, so that other people don’t get false positives.
+
+.. code:: python
+
+    for _ in range(100):
+        check_function(y, in_range={x: (0.002, 2.0)}, dtype=dtype, shape=(1, 3, 32, 32),
+                       numerical_grads=True, only_targets=['llvm'])
+
+If you need more precise control over which values get passed to the
+checking function, you can use ``values={x: ...}``:
+
+.. code:: python
+
+    x_val = np.array([1.2594858e+00, 1.0960974e-01, 1.4975418e+00, 6.3585603e-01,
+           1.2692513e-03, 1.0227472e+00, 9.4656967e-02, 5.5306298e-01,
+           1.4142460e+00, 1.2631655e-01], dtype=np.float32)
+    check_function(y, values={x: x_val}, dtype=dtype, shape=shape,
+                   numerical_grads=True, only_targets=['llvm'])
diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py
index a04d2bc83587..acf37999cc15 100644
--- a/nnvm/python/nnvm/testing/__init__.py
+++ b/nnvm/python/nnvm/testing/__init__.py
@@ -13,3 +13,4 @@
 from . import dcgan
 from . import dqn
 from . import yolo2_detection
+from . import check_computation
diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py
new file mode 100644
index 000000000000..a207e8eb8ceb
--- /dev/null
+++ b/nnvm/python/nnvm/testing/check_computation.py
@@ -0,0 +1,641 @@
+# pylint: disable=cell-var-from-loop,no-else-return
+"""Helper utilities to check functions and their gradients."""
+from __future__ import absolute_import as _abs
+
+import logging
+import numpy as np
+
+import tvm
+from tvm.contrib import graph_runtime
+
+import nnvm
+from nnvm.compiler import graph_util
+from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE
+from .config import ctx_list
+
+def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None):
+    """Runs dtype and shape inference passes on a graph and returns the resulting graph
+    along with the inferred information.
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to run inference on.
+
+    shape : Dict[str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes, or a single shape for all inputs.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    fallback_dtype : str, optional
+        A dtype that will be used for variables whose dtype can't be inferred from other
+        variables' dtypes.
+
+    Returns
+    -------
+    graph : nnvm.graph.Graph
+        The resulting graph with dtype and shape information on its nodes.
+
+    input_shapes : Dict[str, Tuple[int]]
+        The inferred shapes of input variables merged with the `shape` dictionary.
+
+    input_dtypes : Dict[str, str]
+        The inferred dtypes of input variables merged with the `dtype` dictionary.
+
+    output_shapes : List[Tuple[int]]
+        The inferred shapes of outputs.
+
+    output_dtypes : List[str]
+        The inferred dtypes of outputs.
+    """
+    # Preprocess input parameters
+    if shape is None:
+        shape = {}
+
+    if dtype is None:
+        dtype = {}
+
+    if not isinstance(shape, dict):
+        shape = {x: shape for x in graph.symbol.list_input_variables()}
+
+    if not isinstance(dtype, dict):
+        dtype = {x: dtype for x in graph.symbol.list_input_variables()}
+
+    shape = _dict_var_to_dict_str(shape)
+    dtype = _dict_var_to_dict_str(dtype)
+
+    # The graph may already contain shape and dtype info, so extract it and merge with
+    # the user-specified shapes and dtypes (use the user-specified one on contradiction)
+    all_initial_shapes = graph.json_attr('shape')
+    all_initial_dtypes = graph.json_attr('dtype')
+
+    if all_initial_shapes:
+        for x in graph.index.input_names:
+            if x not in shape:
+                x_shape = tuple(all_initial_shapes[graph.index.entry_id(x)])
+                shape[x] = x_shape
+
+    if all_initial_dtypes:
+        for x in graph.index.input_names:
+            if x not in dtype:
+                x_dtype = TCODE_TO_DTYPE[all_initial_dtypes[graph.index.entry_id(x)]]
+                dtype[x] = x_dtype
+
+    # Perform inference
+    nnvm.compiler.graph_attr.set_shape_inputs(graph, shape)
+    nnvm.compiler.graph_attr.set_dtype_inputs(graph, dtype)
+
+    graph = graph.apply('InferShape').apply('InferType')
+
+    shapes = graph.json_attr('shape')
+    dtypes = graph.json_attr('dtype')
+
+    out_len = len(graph.symbol.list_output_names())
+
+    index = graph.index
+
+    output_shapes = \
+        [tuple(shapes[index.entry_id(index.output_entries[i])]) for i in range(out_len)]
+    output_dtypes = \
+        [TCODE_TO_DTYPE[dtypes[index.entry_id(index.output_entries[i])]] for i in range(out_len)]
+
+    # Postprocess the results (the attribute lists are indexed by entry id, as above)
+    input_shapes = shape.copy()
+    input_dtypes = dtype.copy()
+
+    for x in graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_entry_id = graph.index.entry_id(x_name)
+        input_shapes[x_name] = tuple(shapes[x_entry_id])
+        input_dtypes[x_name] = TCODE_TO_DTYPE[dtypes[x_entry_id]]
+
+    # Merge the original user-specified shapes in case some of them are specified for non-existing
+    # variables
+    for x_name, x_shape in shape.items():
+        x_shape = tuple(x_shape)
+        if input_shapes.get(x_name, x_shape) != x_shape:
+            raise RuntimeError("Inferred shape differs from the provided shape.\n"
+                               "Provided shapes: {}\nInferred shapes: {}"
+                               .format(shape, input_shapes))
+        else:
+            input_shapes[x_name] = x_shape
+
+    # Merge the original user-specified dtypes
+    for x_name, x_dtype in dtype.items():
+        if not isinstance(x_dtype, str):
+            x_dtype = TCODE_TO_DTYPE[x_dtype]
+        if input_dtypes.get(x_name, x_dtype) != x_dtype:
+            raise RuntimeError("Inferred dtype differs from the provided dtype.\n"
+                               "Provided dtypes: {}\nInferred dtypes: {}"
+                               .format(dtype, input_dtypes))
+        else:
+            input_dtypes[x_name] = x_dtype
+
+    # If some dtypes weren't inferred and there is a fallback dtype, assign it to those variables
+    # and repeat the inference
+    if fallback_dtype is not None and not all(input_dtypes.values()):
+        input_dtypes = {x: input_dtypes[x] if input_dtypes[x] else fallback_dtype
+                        for x in input_dtypes}
+        return infer_shapes_dtypes(graph, input_shapes, input_dtypes, fallback_dtype=None)
+
+    return graph, input_shapes, input_dtypes, output_shapes, output_dtypes
+
+def graph_to_function(graph, target, ctx, shape=None, dtype=None):
+    """Convert a graph to a function taking keyword args and returning a list of results
+    (both args and results are numpy arrays).
+
+    Example::
+
+        fun = graph_to_function(graph, llvm, cpu(0))
+        [res1, res2] = fun(x=np.zeros((1,2)), y=np.zeros((1,)))
+
+    Parameters
+    ----------
+    graph : nnvm.graph.Graph
+        A graph we want to convert to a function.
+
+    target : str or :any:`tvm.target.Target`
+        The build target
+
+    ctx : TVMContext
+        The context to deploy the module.
+
+    shape : Dict[str, Tuple[int]], optional
+        A dict mapping input variable names to shapes.
+        By default shapes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes.
+        Note that this parameter takes precedence over variables' attributes.
+
+    Returns
+    -------
+    function : Callable[..., List[numpy.ndarray]]
+    """
+    # Infer missing shapes and dtypes
+    graph, shape, dtype, output_shapes, output_dtypes = \
+        infer_shapes_dtypes(graph, shape=shape, dtype=dtype)
+
+    if None in dtype.values():
+        raise ValueError("Input variables with no type: {}".format(dtype))
+
+    if not all(shape.values()):
+        raise ValueError("Input variables with no shape: {}".format(shape))
+
+    compute_graph, lib, params = nnvm.compiler.build(graph, target, shape=shape, dtype=dtype)
+    module = graph_runtime.create(compute_graph, lib, ctx)
+
+    if params:
+        module.set_input(**params)  # the runtime module's setter is `set_input`
+
+    def run(**kwargs):
+        module.run(**kwargs)
+        res = []
+        for i, (o_shape, o_dtype) in enumerate(zip(output_shapes, output_dtypes)):
+            res.append(module.get_output(i, tvm.nd.empty(o_shape, o_dtype)).asnumpy())
+        return res
+
+    return run
+
+def _dict_var_to_dict_str(dictionary):
+    """Convert a Dict[nnvm.Symbol, T] to Dict[str, T]; non-dict inputs are returned as is."""
+    if not isinstance(dictionary, dict):
+        return dictionary
+    # Symbol keys are replaced by their 'name' attribute, all other keys are kept untouched
+    return {(key.attr('name') if isinstance(key, nnvm.symbol.Symbol) else key): value
+            for key, value in dictionary.items()}
+
+def check_function(symbol, forward=None, backward=None, grad_input_vars=None,
+                   shape=None, dtype=None, in_range=None, values=None,
+                   exclude_targets=None, only_targets=None,
+                   additional_params=None,
+                   numerical_grads=None, numerical_grads_params=None,
+                   atol=1e-5, rtol=1e-5, quiet=False):
+    """Compute the function and/or its gradients on a random input and raise
+    an exception if the result doesn't match the reference implementation.
+
+    Parameters
+    ----------
+    symbol : nnvm.Symbol
+        A symbol representing the output.
+
+    forward : Callable[..., List[numpy.ndarray]], optional
+        A reference implementation to compare with.
+
+    backward : Callable[..., List[numpy.ndarray] or Dict[str, numpy.ndarray]], optional
+        A reference implementation of gradients. Should also accept head_grads besides
+        normal inputs which is a list of gradients of some scalar wrt the outputs or just a
+        single gradient if there is only one output.
+        Should return either a dict mapping input variable names to the respective
+        gradients or a list of gradients wrt variables from grad_input_vars in
+        exactly the same order (in alphabetical order by default).
+
+    grad_input_vars : List[nnvm.Symbol or str], optional
+        A list of variables with respect to which the gradients will be computed.
+        None (default) means that all input variables will be used in an alphabetical order.
+
+    shape : Dict[nnvm.Symbol or str, Tuple[int]] or Tuple[int], optional
+        A dict mapping input variable names to shapes, or just a single shape.
+        By default shapes will be inferred from variables' attributes (see the Examples).
+        Note that this parameter takes precedence over variables' attributes.
+
+    dtype : Dict[nnvm.Symbol or str, str] or str, optional
+        A dict mapping input variable names to dtypes, or just a single dtype.
+        By default dtypes will be inferred from variables' attributes (see the Examples).
+        If dtypes cannot be inferred for some variables then float32 will be used as a fallback.
+        Note that this parameter takes precedence over variables' attributes.
+
+    in_range : Dict[nnvm.Symbol or str, (float, float)] or (float, float), optional
+        A dict mapping input variable names to ranges or just a single range
+        (the same for all variables). Input values will be generated from
+        uniform distributions on these ranges. `head_grads` can also be
+        assigned a range this way.
+
+    values : Dict[nnvm.Symbol or str, numpy.ndarray], optional
+        A dict explicitly providing values for some variables instead of random generation.
+
+    exclude_targets : Set[str], optional
+        Skip compiling and running anything for these targets.
+
+    only_targets : Set[str], optional
+        Test only for those targets from `ctx_list()` that are also in this set.
+
+    additional_params : dict, optional
+        A dict of additional parameters which will be passed to forward and backward.
+
+    numerical_grads : bool or 'if_possible', optional
+        Whether to additionally check against numerically computed gradients. If 'if_possible' or
+        None is passed (which is the default) then it will try to create a gradient computation
+        graph and then check gradients numerically only if this graph can be created (i.e. if there
+        are some operations with unimplemented gradients, it will just issue a warning).
+        Checking against numerical gradients is done via the `check_numerical_grads` function.
+
+    numerical_grads_params : dict, optional
+        Additional parameters for `check_numerical_grads`.
+
+    atol : float, optional
+        Absolute tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+
+    rtol : float, optional
+        Relative tolerance for `np.testing.assert_allclose`. NOT used for numerical gradients.
+
+    quiet : bool, optional
+        Don't dump additional information to stdout on failure.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        x = sym.Variable("x", shape=(1, 2))
+        y = sym.Variable("y", shape=(1, 2))
+
+        # check the function and its gradients both numerically and using a reference function
+        check_function(x + 2*y,
+                       lambda x, y: x + 2*y,
+                       lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads})
+
+        # just check gradients numerically
+        check_function(x + 2*y, numerical_grads=True)
+
+        # just check the forward computation
+        check_function(x + 2*y, lambda x, y: x + 2*y, numerical_grads=False)
+
+        # specifying dtype
+        check_function(x + 2*y, lambda x, y: x + 2*y, dtype='float64')
+
+        # dtypes can also be specified during variable creation with dtype codes
+        x = sym.Variable("x", dtype=0)
+        check_function(x + 1, shape=(2, 2), numerical_grads=True)
+    """
+    # validate and preprocess the input params
+    if numerical_grads is None and forward is None and backward is None:
+        raise ValueError("No reference function was passed to check_function. If you only want to "
+                         "check gradients numerically, pass numerical_grads=True explicitly.")
+
+    if numerical_grads is None:
+        numerical_grads = 'if_possible'
+
+    if numerical_grads not in [False, True, 'if_possible']:
+        raise ValueError("numerical_grads must be a bool or 'if_possible', not {}"
+                         .format(numerical_grads))
+
+    if additional_params is None:
+        additional_params = {}
+
+    input_vars = symbol.list_input_variables()
+    input_dict = {x.attr('name'): x for x in input_vars}
+
+    if grad_input_vars is None:
+        grad_input_vars = sorted(input_vars, key=lambda x: x.attr('name'))
+    else:
+        grad_input_vars = [input_dict[x] if isinstance(x, str) else x for x in grad_input_vars]
+
+    in_range = _dict_var_to_dict_str(in_range)
+    values = _dict_var_to_dict_str(values)
+
+    out_len = len(symbol.list_output_names())
+
+    # Infer the output shapes and dtypes, and preprocess the shape and dtype params
+    forward_graph, shape, dtype, out_shapes, out_dtypes = \
+        infer_shapes_dtypes(nnvm.graph.create(symbol), shape=shape, dtype=dtype,
+                            fallback_dtype='float32')
+
+    if not all(out_shapes) or not all(out_dtypes):
+        if not quiet:
+            print(forward_graph.ir(join_node_attrs=['shape', 'dtype']))
+        raise ValueError("Could not infer shapes or dtypes for outputs.\n"
+                         "out_shapes = {}\nout_dtypes = {}".format(out_shapes, out_dtypes))
+
+    backward_graph = None
+
+    # If we want gradients, we have to recreate the graph, but now with gradient computations
+    # Note that here we need out_shapes for defining the shape of head grads, so we have to
+    # create the graph twice
+    if backward is not None or numerical_grads:
+        try:
+            head_grads_symbols = [nnvm.symbol.Variable("head_grads_" + str(i),
+                                                       shape=out_shapes[i],
+                                                       dtype=DTYPE_TO_TCODE[out_dtypes[i]])
+                                  for i in range(out_len)]
+            grad_symbols = graph_util.gradients([symbol], grad_input_vars,
+                                                grad_ys=head_grads_symbols)
+            # Sometimes grads do not depend on head_grads, so head_grads does not appear
+            # in the variable list; adding it manually prevents this, making things a bit easier
+            backward_graph = \
+                nnvm.graph.create(nnvm.symbol.Group([symbol] + grad_symbols + head_grads_symbols))
+
+            backward_graph, shape, dtype, out_shapes, out_dtypes = \
+                infer_shapes_dtypes(backward_graph, shape=shape, dtype=dtype,
+                                    fallback_dtype='float32')
+        except nnvm._base.NNVMError as err:
+            if backward is None and numerical_grads == "if_possible":
+                logging.warning("Won't check gradients because: %s", str(err).split('\n', 1)[0])
+                numerical_grads = False
+                backward_graph = None
+            else:
+                raise
+
+    main_graph = backward_graph if backward_graph is not None else forward_graph
+
+    # Generate random data for inputs (including head_grads)
+
+    np_inputs = {}
+
+    for x in main_graph.symbol.list_input_variables():
+        x_name = x.attr('name')
+        x_shape = shape[x_name]
+        x_dtype = dtype[x_name]
+
+        if values is not None and x_name in values:
+            np_inputs[x_name] = values[x_name].astype(x_dtype)
+            continue
+
+        low = -1.0
+        high = 1.0
+        if in_range is not None:
+            if isinstance(in_range, dict):
+                if x_name in in_range:
+                    low = in_range[x_name][0]
+                    high = in_range[x_name][1]
+            else:
+                low = in_range[0]
+                high = in_range[1]
+
+        np_inputs[x_name] = np.random.uniform(size=x_shape, low=low, high=high).astype(x_dtype)
+
+    np_inputs_without_head_grads = {k: np_inputs[k] for k in np_inputs
+                                    if not k.startswith('head_grads_')}
+
+    nothing_was_done = True
+
+    # Compute and compare the results
+    for target, ctx in ctx_list():
+        if exclude_targets is not None:
+            if target in exclude_targets or str(target) in exclude_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+        if only_targets is not None:
+            if target not in only_targets and str(target) not in only_targets:
+                logging.info("Skipping target = %s, ctx = %s", target, ctx)
+                continue
+
+        logging.info("Checking computation on target = %s, ctx = %s", target, ctx)
+
+        debug_stage = None
+
+        try:
+            nnvm_res = None
+
+            debug_stage = "compiling"
+            main_function = graph_to_function(main_graph, target, ctx)
+
+            # nnvm_res contains the output and gradients (if they are needed)
+            debug_stage = "running"
+            nnvm_res = main_function(**np_inputs)
+
+            if backward_graph is not None:
+                grad_var_names = [x.attr('name') for x in grad_input_vars]
+                nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])}
+
+            if forward is not None:
+                nothing_was_done = False
+                debug_stage = "checking forward computation"
+                logging.debug(debug_stage)
+
+                params = {}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_res = forward(**params)
+
+                if isinstance(numpy_res, tuple):
+                    numpy_res = list(numpy_res)
+
+                if not isinstance(numpy_res, list):
+                    numpy_res = [numpy_res]
+
+                if len(numpy_res) != out_len:
+                    raise ValueError("Forward function returned {} values, but "
+                                     "the nnvm graph returns {} values"
+                                     .format(len(numpy_res), out_len))
+
+                for i in range(out_len):
+                    np.testing.assert_allclose(nnvm_res[i], numpy_res[i], atol=atol, rtol=rtol)
+
+            if backward is not None:
+                nothing_was_done = False
+                debug_stage = "checking gradients"
+                logging.debug(debug_stage)
+
+                np_head_grads = [np_inputs["head_grads_" + str(i)] for i in range(out_len)]
+
+                if out_len == 1:
+                    np_head_grads = np_head_grads[0]
+
+                params = {'head_grads': np_head_grads}
+                params.update(np_inputs_without_head_grads)
+                params.update(additional_params)
+                numpy_grads = backward(**params)
+
+                if not isinstance(numpy_grads, dict):
+                    if isinstance(numpy_grads, tuple):
+                        numpy_grads = list(numpy_grads)
+                    if not isinstance(numpy_grads, list):
+                        numpy_grads = [numpy_grads]
+                    numpy_grads = {x: v for x, v in zip(grad_var_names, numpy_grads)}
+                    if len(numpy_grads) != len(grad_var_names):
+                        raise ValueError("The backward function returns a list of gradients which "
+                                         "does not contain gradients for these variables: {}"
+                                         .format(set(grad_var_names) - set(numpy_grads)))
+
+                for x_name in numpy_grads:
+                    np.testing.assert_allclose(nnvm_grads[x_name], numpy_grads[x_name],
+                                               atol=atol, rtol=rtol)
+
+            if numerical_grads:
+                nothing_was_done = False
+                debug_stage = "checking gradients numerically"
+                logging.debug(debug_stage)
+
+                forward_function = graph_to_function(forward_graph, target, ctx)
+
+                # Since the result may be non-scalar, we have to put another operation on the top,
+                # so we just multiply by the randomly generated head_grads and then sum everything.
+                # This way we can reuse the gradient values which have already been computed.
+                def scalar_function(**kwargs):
+                    res = forward_function(**kwargs)
+                    return np.sum([np.dot(np_inputs['head_grads_' + str(i)].ravel(), res[i].ravel())
+                                   for i in range(out_len)])
+
+                if numerical_grads_params is None:
+                    numerical_grads_params = {}
+
+                check_numerical_grads(
+                    scalar_function,
+                    input_values=np_inputs_without_head_grads,
+                    grad_values=nnvm_grads,
+                    **numerical_grads_params)
+
+        except:
+            if not quiet:
+                print("\ncheck_function failed while {}, here is the main graph"
+                      .format(debug_stage))
+                print(main_graph.ir(join_node_attrs=['shape', 'dtype']))
+                if nnvm_res is not None:
+                    print("Generated inputs:")
+                    print(np_inputs)
+                    print()
+            raise
+
+    if nothing_was_done:
+        logging.warning("Nothing was done in check_function. Check ctx_list().")
+
+
+def check_numerical_grads(function, input_values, grad_values, function_value=None,
+                          delta=1e-3, atol=1e-2, rtol=0.1):
+    """A helper function that checks that numerical gradients of a function are equal to
+    gradients computed in some different way (analytical gradients).
+
+    Numerical gradients are computed using finite difference approximation. To reduce the number of
+    function evaluations, the number of points used is gradually increased if the error value is
+    too high (up to 5 points). An AssertionError is raised if the gradients differ too much.
+
+    Parameters
+    ----------
+    function
+        A function that takes inputs as keyword arguments (like `function(**input_values)`) and
+        returns a scalar result. Should accept numpy ndarrays.
+
+    input_values : Dict[str, numpy.ndarray]
+        A dict assigning values to variables. Represents the point at which gradients should be
+        computed.
+
+    grad_values : Dict[str, numpy.ndarray]
+        Gradients computed using a different method.
+
+    function_value : float, optional
+        Should be equal to `function(**input_values)`.
+
+    delta : float, optional
+        A small number used for numerical computation of partial derivatives. The default 1e-3 is a
+        good choice for float32.
+
+    atol : float, optional
+        Absolute tolerance.
+
+    rtol : float, optional
+        Relative tolerance.
+    """
+
+    if function_value is None:
+        function_value = function(**input_values)
+
+    # a helper to modify j-th element of val by a_delta
+    def modify(val, j, a_delta):
+        val = val.copy()
+        val.reshape(-1)[j] = val.reshape(-1)[j] + a_delta
+        return val
+
+    # numerically compute a partial derivative with respect to j-th element of the var `x_name`
+    def derivative(x_name, j, a_delta):
+        modified_values = {n: modify(val, j, a_delta) if n == x_name else val
+                           for n, val in input_values.items()}
+        return (function(**modified_values) - function_value)/a_delta
+
+    def compare_derivative(j, n_der, grad):
+        der = grad.reshape(-1)[j]
+        return np.abs(n_der - der) < atol + rtol*np.abs(n_der)
+
+    for x_name, grad in grad_values.items():
+        if grad.shape != input_values[x_name].shape:
+            raise AssertionError(
+                "Gradient wrt '{}' has unexpected shape {}, expected {} "
+                .format(x_name, grad.shape, input_values[x_name].shape))
+
+        ngrad = np.zeros_like(grad)
+
+        # compute partial derivatives for each position in this variable
+        for j in range(grad.size):  # grad.size is an int even for 0-d arrays
+            # forward difference approximation
+            nder = derivative(x_name, j, delta)
+
+            # if the derivative is not equal to the analytical one, try to use more
+            # precise and expensive methods
+            if not compare_derivative(j, nder, grad):
+                # central difference approximation
+                nder = (derivative(x_name, j, -delta) + nder)/2
+
+                if not compare_derivative(j, nder, grad):
+                    # central difference approximation using h = delta/2
+                    cnder2 = (derivative(x_name, j, delta/2) + derivative(x_name, j, -delta/2))/2
+                    # five-point derivative
+                    nder = (4*cnder2 - nder)/3
+
+            ngrad.flat[j] = nder  # .flat always writes through, reshape(-1) may return a copy
+
+        dist = np.sqrt(np.sum((ngrad - grad)**2))
+        grad_norm = np.sqrt(np.sum(ngrad**2))
+
+        # we multiply atol by this number to make it more universal for different sizes
+        sqrt_n = np.sqrt(float(grad.size))
+
+        if dist > atol*sqrt_n + rtol*grad_norm:
+            raise AssertionError(
+                "Analytical and numerical grads wrt {} differ too much\n"
+                "analytical grad = {}\n numerical grad = {}\n"
+                "distance > atol*sqrt(n) + rtol*grad_norm\n"
+                "distance {} > {}*{} + {}*{}"
+                .format(x_name, grad, ngrad,
+                        dist, atol, sqrt_n, rtol, grad_norm))
+
+        max_diff = np.max(np.abs(ngrad - grad))
+        avg_diff = np.mean(np.abs(ngrad - grad))
+        logging.info("Numerical grad test wrt %s of shape %s passes, "
+                     "dist = %f, max_diff = %f, avg_diff = %f",
+                     x_name, grad.shape, dist, max_diff, avg_diff)
diff --git a/nnvm/tests/python/compiler/test_top_level1.py b/nnvm/tests/python/compiler/test_top_level1.py
index d9c6655fea1d..ba6280dd9b14 100644
--- a/nnvm/tests/python/compiler/test_top_level1.py
+++ b/nnvm/tests/python/compiler/test_top_level1.py
@@ -5,49 +5,162 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
+from nnvm.testing.check_computation import check_function
 
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True,
-           rnd_min=-1, rnd_max=1):
-    ishapes = {}
-    itypes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        itypes.update({name: dtype})
-        np_inputs.update({name: np.random.uniform(rnd_min, rnd_max, size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, itypes)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
+def test_check_function():
+    # test the testing function
 
+    x = sym.Variable("x")
+    y = sym.Variable("y")
+
+    # different styles of returning gradients from the backward function
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [head_grads, 2*head_grads],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: (head_grads, 2*head_grads),
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'x': head_grads, 'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: {'y': 2*head_grads},
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: [2*head_grads],
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float32')
+    check_function(x + 2*y, lambda x, y: x + 2*y,
+                   lambda x, y, head_grads: 2*head_grads,
+                   grad_input_vars=[y],
+                   shape={'x': (1, 2), y: (1, 2)}, dtype='float64')
+
+    # test just numerical gradients
+    # different styles of shape and dtype passing
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype='float32',
+                   numerical_grads=True)
+    check_function(x + 2*y, shape={'x': (1, 2), y: (1, 2)}, dtype={x: 'float32', 'y': 'float32'},
+                   numerical_grads=True)
+    check_function(x + 2*y, shape=(1, 2), dtype='float32',
+                   numerical_grads=True)
+
+    # specifying variable attributes on variable creation
+    # (in this case type codes must be used)
+    x = sym.Variable("x", dtype=0, shape=(1, 2))
+    check_function(x + 2*y, shape={y: (1, 2)}, dtype={'y': 'float32'}, numerical_grads=True)
+    y = sym.Variable("y", dtype=0, shape=(1, 2))
+
+    # shape overriding
+    def _fwd1(x, y):
+        assert x.shape == (1, 1)
+        assert y.shape == (1, 2)
+        return x + 2*y
+    check_function(x + 2*y, _fwd1, shape={x: (1, 1)})
+
+    # in_range
+    def _fwd2(x, y):
+        assert x.shape == (100,)
+        assert (x <= 0.9).all()
+        assert (x >= 0.8).all()
+        return x + 2*y
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range=(0.8, 0.9), numerical_grads=False)
+    check_function(x + 2*y, _fwd2, shape=(100,), in_range={'x': (0.8, 0.9)}, numerical_grads=False)
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   in_range={'head_grads_0': (1.0, 1.0)})
+    # explicit passing of values
+    check_function(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0],
+                   values={'head_grads_0': np.full((1, 2), 1.0)})
+
+    # check that the function reports errors
+    def _check_function_must_fail(*args, **kwargs):
+        error = AssertionError
+        if 'error' in kwargs:
+            error = kwargs['error']
+            del kwargs['error']
+        try:
+            check_function(*args, quiet=True, **kwargs)
+        except error:
+            pass
+        else:
+            raise AssertionError("check_function didn't raise an exception")
+
+    _check_function_must_fail(x + 2*y, error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: x + y)
+    _check_function_must_fail(x + 2*y, backward=lambda x, y, head_grads: [1.0, 2.0])
+    _check_function_must_fail(sym.block_grad(x + 2*y), numerical_grads=True)
+    _check_function_must_fail(x*x, numerical_grads=True,
+                              numerical_grads_params={'atol': 0.0, 'rtol': 0.0})
+
+    # different styles of returning results from the forward function
+    check_function(x + 2*y, lambda x, y: [x + 2*y], numerical_grads=False)
+    _check_function_must_fail(x + 2*y, lambda x, y: [x + 2*y, x], numerical_grads=False,
+                              error=ValueError)
+    _check_function_must_fail(x + 2*y, lambda x, y: [], numerical_grads=False,
+                              error=ValueError)
+
+    # multiple outputs
+    z = sym.Group([2*x + y, x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y])
+    check_function(z, lambda x, y: (2*x + y, x + 2*y))
+    check_function(z, backward=lambda x, y, head_grads: [2*head_grads[0] + head_grads[1],
+                                                         head_grads[0] + 2*head_grads[1]])
+    _check_function_must_fail(z, backward=lambda x, y, head_grads: [2*head_grads[0],
+                                                                    2*head_grads[1]])
+    check_function(z, backward=lambda x, y, head_grads: [head_grads[1], 2*head_grads[1]],
+                   in_range={'head_grads_0': (0, 0)})
+    check_function(z, numerical_grads=True)
+
+    z = sym.Group([sym.block_grad(2*x + y), x + 2*y])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y], numerical_grads=False)
+    _check_function_must_fail(z, lambda x, y: [2*x + y, x + 2*y])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, sym.block_grad(x + 2*y)])
+    _check_function_must_fail(z, numerical_grads=True)
+
+    z = sym.Group([2*x + y, x + 2*y, x, y, sym.sum(x)])
+    check_function(z, lambda x, y: [2*x + y, x + 2*y, x, y, np.sum(x)])
+
+    # passing additional parameters to forward and backward
+    def _fwd3(x, p):
+        assert p == 'v'
+        return x + 1
+    def _bwd3(x, p, head_grads):
+        assert p == 'v'
+        return head_grads
+    check_function(x + 1, _fwd3, _bwd3, additional_params={'p': 'v'})
+
+    # implicitly created variables and shape/dtype inference for inputs
+    x = sym.Variable("x", shape=(2, 3), dtype=0)
+    b = sym.Variable("b")
+    y = sym.dense(data=x, bias=b, units=4)
+    # Don't check gradients on cuda because it doesn't yet support ewise after reduce
+    check_function(y, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, shape={'x': (3, 4)}, exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, dtype={'x': 'float64'}, exclude_targets={'cuda'}, numerical_grads=True)
+
+    x = sym.Variable("x")
+    b = sym.Variable("b")
+    w = sym.Variable("w")
+    y = sym.dense(data=x, bias=b, weight=w, units=4)
+    def _fwd_dense(x, w, b):
+        return np.dot(x, w.T) + b
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'x': 'float32'}, numerical_grads=False)
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, dtype={'w': 'float64'}, numerical_grads=False)
+    _check_function_must_fail(y, _fwd_dense, shape={'x': (1,2)},
+                              dtype={'w': 'float64', 'b': 'float32'},
+                              numerical_grads=False,
+                              error=nnvm._base.NNVMError)
+    # fails because no shape
+    _check_function_must_fail(y, _fwd_dense, numerical_grads=False, error=ValueError)
+    # ok because type is float32 by default
+    check_function(y, _fwd_dense, shape={'x': (1,2)}, numerical_grads=False)
 
 def test_relu():
     x = sym.Variable("x")
@@ -62,10 +175,8 @@ def backward(head_grads, x):
         return [(sub > 0).astype("float") * \
                 ((x > 0).astype("float") + 0.3 * (x < 0).astype("float")) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 def test_prelu_nchw():
     x = sym.Variable("x")
@@ -75,15 +186,8 @@ def test_prelu_nchw():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(3, 1, 1)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 3, 32, 32)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 32, 32), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_prelu_nhwc():
     x = sym.Variable("x")
@@ -93,17 +197,8 @@ def test_prelu_nhwc():
     def forward(x, a):
         return (x < 0) * (x * a.reshape(1, 1, 3)) + (x>=0) * x
 
-    dtype = "float32"
-    dshape_x = (1, 32, 32, 3)
-    dshape_w = (3,)
-
-    inputs = [
-        ('x', dshape_x, x),
-        ('a', dshape_w, a)
-    ]
-
-
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 32, 32, 3), 'a': (3,)}
+    check_function(y, forward, shape=shape)
 
 def test_sym_scalar_pow():
     scalar = 3
@@ -116,10 +211,8 @@ def forward(x):
     def backward(head_grads, x):
         return [scalar * x**(scalar -  1) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_scalar_sym_pow():
@@ -133,10 +226,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.log(scalar) * scalar**x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_exp():
@@ -149,10 +240,8 @@ def forward(x):
     def backward(head_grads, x):
         return [np.exp(x) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_log():
@@ -165,10 +254,8 @@ def forward(x):
     def backward(head_grads, x):
         return [1. / x * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward, rnd_min=0.001)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, in_range=(0.002, 2.0), shape=shape)
 
 
 def test_tanh():
@@ -182,10 +269,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [(1 - y_np**2) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_sigmoid():
@@ -199,10 +284,8 @@ def backward(head_grads, x):
         y_np = forward(x)
         return [y_np *(1 - y_np) * head_grads]
 
-    dtype = "float32"
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_softmax():
@@ -217,10 +300,10 @@ def backward(head_grads, x):
         grad = y * (head_grads - np.sum(y * head_grads, axis=1, keepdims=True))
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_log_softmax():
@@ -235,10 +318,10 @@ def backward(head_grads, x):
         grad = head_grads - np.exp(y) * np.sum(head_grads, axis=1, keepdims=True)
         return [grad]
 
-    dtype = "float32"
-    dshape = (10, 1000)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward,
+                   shape={'x': (10, 1000)}, numerical_grads=False)
+    check_function(y, forward, backward,
+                   shape={'x': (2, 10)})
 
 
 def test_dense():
@@ -250,13 +333,16 @@ def test_dense():
 
     def forward(x, dense_weight, dense_bias):
         return np.dot(x, dense_weight.T) + dense_bias
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 100), x),
-        ('dense_weight', (3, 100), w),
-        ('dense_bias', (3,), b)
-    ]
-    helper(y, inputs, dtype, forward)
+    shape = {
+        'x': (10, 100),
+        'w': (3, 100),
+        'b': (3,)
+    }
+    # Don't check gradients on cuda because it doesn't yet support ewise after reduce
+    check_function(y, forward, shape=shape,
+                   exclude_targets={'cuda'}, numerical_grads=True)
+    check_function(y, forward, shape=shape,
+                   only_targets={'cuda'}, numerical_grads=False)
 
 
 def test_batchnorm():
@@ -272,35 +358,25 @@ def test_batchnorm():
     def forward(x, gamma, beta, moving_mean, moving_var):
         return (x - moving_mean) / np.sqrt(moving_var + eps) * gamma + beta
 
-    dtype = "float32"
-    inputs = [
-        ('x', (10, 20), x),
-        ('gamma', (20,), gamma),
-        ('beta', (20,), beta),
-        ('moving_mean', (20,), moving_var),
-        ('moving_var', (20,), moving_mean)
-    ]
+    shape = {
+        'x': (10, 20),
+        'gamma': (20,),
+        'beta': (20,),
+        'moving_mean': (20,),
+        'moving_var': (20,)
+    }
 
-    helper(y, inputs,  dtype, forward, rnd_min=0.001)
+    check_function(y, forward, in_range=(0.001, 1.0), shape=shape)
 
 
 def verify_concatenate(ishape, axis):
-    x = [sym.Variable("x%d" % i) for i in range(len(ishape))]
+    x = [sym.Variable("x%d" % i, shape=ishape[i]) for i in range(len(ishape))]
     y = sym.concatenate(*x, axis=axis) + 1
-    dtype = "float32"
-    for target, ctx in ctx_list():
-        # set input
-        data = []
-        for i, shape in enumerate(ishape):
-            data.append(np.random.uniform(size=shape).astype(dtype))
-        pdict = {"x%d" % i :  v for i, v in enumerate(data)}
-        shape = {"x%d" % i :  v.shape for i, v in enumerate(data)}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**pdict)
-        out_np = np.concatenate(data, axis=axis) + 1
-        out = m.get_output(0, tvm.nd.empty(out_np.shape))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(**kwargs):
+        return np.concatenate(list(kwargs.values()), axis=axis) + 1
+
+    check_function(y, forward)
 
 
 def test_concatenate():
@@ -309,19 +385,13 @@ def test_concatenate():
 
 
 def verify_split(ishape, indices_or_sections, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.split(x, indices_or_sections=indices_or_sections, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
-    res = np.split(x_np, indices_or_sections, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        for i, arr  in enumerate(res):
-            out = m.get_output(i, tvm.nd.empty(arr.shape))
-            np.testing.assert_allclose(out.asnumpy(), arr, atol=1e-5, rtol=1e-5)
+
+    def forward(x):
+        return np.split(x, indices_or_sections, axis=axis)
+
+    check_function(y, forward)
 
 
 def test_split():
@@ -331,28 +401,22 @@ def test_split():
 
 def verify_strided_slice(ishape, begin, end, strideinp=None):
     stride = strideinp if strideinp else [1, 1, 1]
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     if strideinp:
         y = sym.strided_slice(x, begin = begin, end = end, stride = stride) + 1
     else:
         y = sym.strided_slice(x, begin = begin, end = end) + 1
-    x_np = np.random.uniform(size=ishape).astype("float32")
+
     for i in range(len(begin), 3):
         begin.append(0)
     for i in range(len(end), 3):
         end.append(ishape[i])
-    def test_forward(x, begin, end, stride):
+
+    def test_forward(x):
         return x[begin[0]:end[0]:stride[0],
                     begin[1]:end[1]:stride[1], begin[2]:end[2]:stride[2]] + 1
 
-    for target, ctx in ctx_list():
-        # set input
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        res = test_forward(x_np, begin, end, stride)
-        out = m.get_output(0, tvm.nd.empty(res.shape))
-        np.testing.assert_allclose(out.asnumpy(), res, atol=1e-5, rtol=1e-5)
+    check_function(y, test_forward)
 
 def test_strided_slice():
     verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2])
@@ -369,24 +433,18 @@ def verify_take(src_shape, indices_src, axis=None):
     src_dtype = "float32"
     indices_dtype = "int32"
     indices_src = np.array(indices_src, dtype=indices_dtype)
-    a = sym.Variable("a")
-    indices = sym.Variable("indices")
+    a = sym.Variable("a", shape=src_shape)
+    indices = sym.Variable("indices", shape=indices_src.shape)
     y = sym.take(a, indices, axis=axis)
-    for target, ctx in ctx_list():
-        # set input
-        shape_dict = {"a":src_shape, "indices":indices_src.shape}
-        type_dict = {"a":src_dtype, "indices":indices_dtype}
-        graph, lib, _ = nnvm.compiler.build(y, target, shape=shape_dict, dtype=type_dict)
-        m = graph_runtime.create(graph, lib, ctx)
-
-        shape_size = 1
-        for i in range(len(src_shape)):
-            shape_size = shape_size * src_shape[i]
-        a_src = np.arange(shape_size, dtype=src_dtype).reshape((src_shape))
-        out_np = np.take(a_src, indices_src, axis=axis)
-        m.run(a=a_src, indices=indices_src)
-        out = m.get_output(0, tvm.nd.empty(out_np.shape, dtype=src_dtype))
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+
+    def forward(a, indices):
+        return np.take(a, indices=indices, axis=axis)
+
+    a_src = np.arange(np.prod(src_shape), dtype=src_dtype).reshape(src_shape)
+
+    check_function(y, forward,
+                   dtype={'a': src_dtype, 'indices': indices_dtype},
+                   values={'a': a_src, 'indices': indices_src})
 
 def test_take():
     verify_take((4,), [1])
@@ -399,9 +457,9 @@ def test_take():
     verify_take((4,3,5,6), [[2,1,0,0]], -2)
 
 
-def verify_squeeze(dshape, axis):
+def verify_squeeze(shape, axis):
     x = sym.Variable("x")
-    if axis:
+    if axis is not None:
         y = sym.squeeze(x, axis=axis)
     else:
         y = sym.squeeze(x)
@@ -413,9 +471,7 @@ def forward(x):
     def backward(head_grads, x):
         return [np.reshape(head_grads, x.shape)]
 
-    dtype = "float32"
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, forward, backward)
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_squeeze():
@@ -433,61 +489,40 @@ def forward(x):
                       pad_width=((0, 0), (0, 0), (0, 1), (2, 3)),
                       mode='constant', constant_values=1.)
 
-    dtype = "float32"
-    inputs = [('x', (1, 3, 28, 28), x)]
-    helper(y, inputs, dtype, forward)
+    shape = {'x': (1, 3, 28, 28)}
+    check_function(y, forward, shape=shape)
 
 def verify_lrn(ishape, size, axis, bias, alpha, beta):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.lrn(x, size=size, axis=axis, bias=bias, alpha=alpha, beta=beta)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.lrn_python(x, size, axis, bias, alpha, beta)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking LRN op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.lrn_python(x_np, size, axis, bias, alpha, beta)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def verify_l2_normalize(ishape, eps, axis):
-    x = sym.Variable("x")
+    x = sym.Variable("x", shape=ishape)
     y = sym.l2_normalize(x, eps=eps, axis=axis)
-    dtype = "float32"
-    x_np = np.random.uniform(size=ishape).astype(dtype)
 
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(y, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    def forward1(x):
+        return topi.testing.l2_normalize_python(x, eps, axis)
+
+    check_function(y, forward1)
+
+    def forward2(x):
+        y = forward1(x)
+        return (y > 0)*y
 
     #Checking L2 normalization op followed by elementwise op relu
-    z = sym.relu(y)
-    x_np = np.random.uniform(low=-10.0, high=10.0, size=ishape).astype(dtype)
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(z, target, {"x": ishape})
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(x=x_np)
-        out = m.get_output(0, tvm.nd.empty(ishape))
-        out_np = topi.testing.l2_normalize_python(x_np, eps, axis)
-        out_np = (out_np > 0) * out_np
-        np.testing.assert_allclose(out.asnumpy(), out_np, atol=1e-5, rtol=1e-5)
+    check_function(sym.relu(y), forward2, in_range={'x': (-10.0, 10.0)})
 
 def test_lrn():
     verify_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5)
@@ -498,6 +533,7 @@ def test_l2_normalize():
     verify_l2_normalize((1, 3, 20, 20), 0.001, (1, 2))
 
 if __name__ == "__main__":
+    test_check_function()
     test_split()
     test_concatenate()
     test_log_softmax()
diff --git a/nnvm/tests/python/compiler/test_top_level3.py b/nnvm/tests/python/compiler/test_top_level3.py
index c8bd37c38e5b..11af2d0bc9c4 100644
--- a/nnvm/tests/python/compiler/test_top_level3.py
+++ b/nnvm/tests/python/compiler/test_top_level3.py
@@ -5,15 +5,14 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-from test_top_level1 import helper
+from nnvm.testing.check_computation import check_function
 
 def check_map(symfunc, np_func, np_backward=None, dtype="float32", rnd_min=-1, rnd_max=1):
     x = sym.Variable("x")
     y = symfunc(x)
-    dshape = (1, 3, 32, 32)
-    inputs = [('x', dshape, x)]
-    helper(y, inputs, dtype, lambda x: np_func(x), np_backward,
-           rnd_min=rnd_min, rnd_max=rnd_max)
+    shape = {'x': (1, 3, 32, 32)}
+    check_function(y, lambda x: np_func(x), np_backward,
+                   dtype=dtype, shape=shape, in_range=(rnd_min, rnd_max))
 
 
 def test_floor():
diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py
index 5bf134b49a7b..50ce1571e1a8 100644
--- a/nnvm/tests/python/compiler/test_top_level4.py
+++ b/nnvm/tests/python/compiler/test_top_level4.py
@@ -6,52 +6,7 @@
 import nnvm.symbol as sym
 import nnvm.compiler
 from nnvm.testing.config import ctx_list
-
-
-def helper(symbol, inputs, dtype,
-           np_forward, np_backward=None,
-           need_input=True, need_head_grads=True, in_range={}):
-    ishapes = {}
-    input_syms = []
-    np_inputs = {}
-    for (name, shape, s) in inputs:
-        ishapes.update({name: shape})
-        if name in in_range:
-            np_inputs.update({name: np.random.uniform(size=shape,
-                                                      low=in_range[name][0],
-                                                      high=in_range[name][1]).astype(dtype)})
-        else:
-            np_inputs.update({name: np.random.uniform(size=shape).astype(dtype)})
-        input_syms.append(s)
-
-    for target, ctx in ctx_list():
-        graph, lib, _ = nnvm.compiler.build(symbol, target, ishapes, dtype=dtype)
-        m = graph_runtime.create(graph, lib, ctx)
-        m.run(**np_inputs)
-        y_np = np_forward(**np_inputs)
-        out = m.get_output(0, tvm.nd.empty(y_np.shape, dtype))
-        np.testing.assert_allclose(out.asnumpy(), y_np, atol=1e-5, rtol=1e-5)
-        # backward
-        if np_backward:
-            graph._set_symbol_list_attr("grad_ys", symbol)
-            graph._set_symbol_list_attr("grad_xs", input_syms)
-            graph._set_symbol_list_attr("grad_ys_out_grad", sym.Variable("head_grads", shape=y_np.shape))
-            graph = graph.apply("Gradient")
-            ishapes.update({"head_grads": y_np.shape})
-            graph, lib, _ = nnvm.compiler.build(graph, target, ishapes)
-            m = graph_runtime.create(graph, lib, ctx)
-            head_grads = np.random.uniform(size=y_np.shape).astype(dtype)
-            y_np = np_backward(head_grads=head_grads, **np_inputs)
-            b_inputs = {}
-            if need_input:
-                b_inputs.update(np_inputs)
-            if need_head_grads:
-                b_inputs.update({"head_grads":head_grads})
-            m.run(**b_inputs)
-            for i in range(len(y_np)):
-                out = m.get_output(i, tvm.nd.empty(y_np[i].shape, dtype))
-                np.testing.assert_allclose(out.asnumpy(), y_np[i], atol=1e-5, rtol=1e-5)
-
+from nnvm.testing.check_computation import check_function
 
 def verify_transpose(dshape, axes):
     x = sym.Variable("x")
@@ -228,93 +183,92 @@ def backward(head_grads, x):
         mask2 = np.less_equal(x, a_max).astype("float")
         return [head_grads * mask1 * mask2]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_broadcast():
     a = sym.Variable("a")
     b = sym.Variable("b")
-    inputs = [('a', (3, 4, 5), a),
-              ('b', (1, 5), b)]
-    dtype = "float32"
+    shape = {'a': (3, 4, 5), 'b': (1, 5)}
 
     def _collapse(g):
-        return g.reshape(-1, inputs[-1][1][-1]).sum(0, keepdims=True)
+        return g.reshape(-1, shape['b'][-1]).sum(0, keepdims=True)
 
     y = sym.broadcast_add(a, b)
     def _backward_add(head_grads, a, b):
         da = head_grads
         db = _collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a + b, _backward_add)
+    check_function(y, lambda a, b: a + b, _backward_add, shape=shape)
 
     y = sym.broadcast_sub(a, b)
     def _backward_sub(head_grads, a, b):
         da = head_grads
         db = -_collapse(head_grads)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a - b, _backward_sub)
+    check_function(y, lambda a, b: a - b, _backward_sub, shape=shape)
 
     y = sym.broadcast_mul(a, b)
     def _backward_mul(head_grads, a, b):
         da = head_grads * b
         db = _collapse(head_grads * a)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a * b, _backward_mul)
+    check_function(y, lambda a, b: a * b, _backward_mul, shape=shape)
 
     y = sym.broadcast_div(a, b)
     def _backward_div(head_grads, a, b):
         da = head_grads / b
         db = _collapse(- head_grads * a / b**2)
         return da, db
-    helper(y, inputs, dtype, lambda a, b: a / b, _backward_div)
+    # We avoid computing numerical derivatives too close to zero here
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape, numerical_grads=False)
+    check_function(y, lambda a, b: a / b, _backward_div, shape=shape,
+                   in_range={'b': (0.1, 20)})
 
     y = sym.broadcast_mod(a, b)
-    helper(y, inputs, 'int32',
-           lambda a, b: np.mod(a, b),
-           in_range={'a': (0.001, 100), 'b': (1, 100)})
+    check_function(y,
+                   lambda a, b: np.mod(a, b),
+                   in_range={'a': (0.001, 100), 'b': (1, 100)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_max(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.maximum(a, b))
+    check_function(y, lambda a, b: np.maximum(a, b), shape=shape)
 
     y = sym.broadcast_min(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.minimum(a, b))
+    check_function(y, lambda a, b: np.minimum(a, b), shape=shape)
 
     y = sym.broadcast_pow(a, b)
-    helper(y, inputs, dtype,
-           lambda a, b: np.power(a, b),
-           in_range={'a': (0.001, 100), 'b': (0.001, 2)})
+    check_function(y,
+                   lambda a, b: np.power(a, b),
+                   in_range={'a': (0.001, 100), 'b': (0.001, 2)}, shape=shape)
 
     y = sym.broadcast_left_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a << b)
+    check_function(y, lambda a, b: a << b, dtype='int32', shape=shape)
 
     y = sym.broadcast_right_shift(a, b)
-    helper(y, inputs, 'int32', lambda a, b: a >> b)
+    check_function(y, lambda a, b: a >> b, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.greater(a, b))
+    check_function(y, lambda a, b: np.greater(a, b), shape=shape)
 
     y = sym.broadcast_less(a, b)
-    helper(y, inputs, dtype, lambda a, b: np.less(a, b))
+    check_function(y, lambda a, b: np.less(a, b), shape=shape)
 
     y = sym.broadcast_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_not_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.not_equal(a, b),
-           in_range={'a': (-2, 2), 'b': (-2, 2)})
+    check_function(y, lambda a, b: np.not_equal(a, b),
+                   in_range={'a': (-2, 2), 'b': (-2, 2)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_greater_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.greater_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.greater_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
     y = sym.broadcast_less_equal(a, b)
-    helper(y, inputs, 'int32', lambda a, b: np.less_equal(a, b),
-           in_range={'a': (-3, 3), 'b': (-3, 3)})
+    check_function(y, lambda a, b: np.less_equal(a, b),
+                   in_range={'a': (-3, 3), 'b': (-3, 3)}, dtype='int32', shape=shape)
 
 def test_greater():
     l = sym.Variable("l")
@@ -325,13 +279,10 @@ def forward(l, r):
         return np.greater(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
+        return {'l': np.zeros_like(l)}
 
-
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_less():
@@ -343,13 +294,10 @@ def forward(l, r):
         return np.less(l, r).astype("float32")
 
     def backward(head_grads, l, r):
-        return [np.zeros_like(l)]
+        return {'l': np.zeros_like(l)}
 
-
-    dtype = "float32"
-    inputs = [('l', (3, 4, 5), l),
-              ('r', (3, 4, 5), r)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'l': (3, 4, 5), 'r': (3, 4, 5)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_reshape_like():
@@ -364,11 +312,8 @@ def backward(head_grads, x, y):
         return [np.reshape(head_grads, x.shape),
                 np.zeros_like(y)]
 
-
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x),
-              ('y', (5, 4, 3), y)]
-    helper(z, inputs, dtype, forward, backward)
+    shape = {'x': (3, 4, 5), 'y': (5, 4, 3)}
+    check_function(z, forward, backward, shape=shape)
 
 
 def verify_expand_like(in_shape, out_shape, axis, exclude):
@@ -412,10 +357,8 @@ def backward(head_grads, x, y):
                 np.zeros_like(y)]
 
 
-    dtype = "float32"
-    inputs = [('x', in_shape, x),
-              ('y', out_shape, y)]
-    helper(z, inputs, dtype, forward, backward, need_input=False)
+    shape = {'x': in_shape, 'y': out_shape}
+    check_function(z, forward, backward, shape=shape)
 
 
 def test_expand_like():
@@ -440,10 +383,8 @@ def forward(**inputs):
     def backward(head_grads, **inputs):
         return [head_grads] * num_args
 
-    dtype = "float32"
-    inputs = [("input" + str(i), (3, 4, 5), s[i])
-              for i in range(num_args)]
-    helper(y, inputs, dtype, forward, backward, need_input=False)
+    shape = {s[i]: (3, 4, 5) for i in range(num_args)}
+    check_function(y, forward, backward, shape=shape)
 
 
 def test_elemwise_sum():
@@ -463,9 +404,9 @@ def backward(head_grads, x):
         return [np.zeros_like(head_grads)]
 
 
-    dtype = "float32"
-    inputs = [('x', (3, 4, 5), x)]
-    helper(y, inputs, dtype, forward, backward, need_head_grads=False)
+    shape = {'x': (3, 4, 5)}
+    # Numerical grad checking would fail for this function
+    check_function(y, forward, backward, shape=shape, numerical_grads=False)
 
 
 def test_full():

From 771d895d223ee36d5330242934f6a93a1d7a5b8f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Thu, 23 Aug 2018 19:37:03 -0700
Subject: [PATCH 63/77] [AUTOTVM] Fix local executor (#1651)

The old queue size was too small; it could stall the executor due to a race condition.
---
 python/tvm/autotvm/measure/local_executor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index 8a045ecfb4c0..55f1dc75fc5c 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -133,7 +133,7 @@ def submit(self, func, *args, **kwargs):
         if not self.do_fork:
             return LocalFutureNoFork(func(*args, **kwargs))
 
-        queue = Queue(1)
+        queue = Queue(2)
         process = Process(target=timeout_monitor,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()

From 531efd6f4287b7fd5ee1921686736b621cfd7969 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Thu, 23 Aug 2018 21:59:34 -0700
Subject: [PATCH 64/77] [NODE] Enable global singleton object, allow
 set_body_typed in function registry, default fallback of IRPrinter. (#1652)

---
 HalideIR                           |  2 +-
 include/tvm/base.h                 | 72 ++++++++++++++++++++++++------
 include/tvm/runtime/registry.h     | 18 ++++++++
 nnvm/src/compiler/graph_runtime.cc |  2 +-
 src/api/api_base.cc                | 12 +----
 src/lang/reflection.cc             | 30 +++++++++++--
 6 files changed, 108 insertions(+), 28 deletions(-)

diff --git a/HalideIR b/HalideIR
index a0b9563f4571..6f64f7866747 160000
--- a/HalideIR
+++ b/HalideIR
@@ -1 +1 @@
-Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40
+Subproject commit 6f64f7866747a2a81bec84aea9bde0479c5b72c1
diff --git a/include/tvm/base.h b/include/tvm/base.h
index 2c5c5ad54875..c2d796b6002c 100644
--- a/include/tvm/base.h
+++ b/include/tvm/base.h
@@ -68,26 +68,72 @@ inline NodeType LoadJSON(const std::string& json_str) {
   return NodeType(LoadJSON_(json_str));
 }
 
-/*! \brief typedef the factory function of data iterator */
-using NodeFactory = std::function<std::shared_ptr<Node> ()>;
 /*!
- * \brief Registry entry for NodeFactory
+ * \brief Registry entry for NodeFactory.
+ *
+ *  There are two types of Nodes that can be serialized.
+ *  A normal node requires the registration of a creator function that
+ *  constructs an empty Node of the corresponding type.
+ *
+ *  For a global singleton (e.g. a global operator), only the global_key needs to be
+ *  serialized; in this case, FGlobalKey needs to be defined.
  */
-struct NodeFactoryReg
-    : public dmlc::FunctionRegEntryBase<NodeFactoryReg,
-                                        NodeFactory> {
+struct NodeFactoryReg {
+  /*!
+   * \brief creator function.
+   * \param global_key Key that identifies a global singleton object.
+   *        If this is not empty then FGlobalKey must be defined for the node type.
+   * \return The created node.
+   */
+  using FCreate = std::function<std::shared_ptr<Node>(const std::string& global_key)>;
+  /*!
+   * \brief Global key function, only needed by global objects.
+   * \param node The node pointer.
+   * \return The global key of the node.
+   */
+  using FGlobalKey = std::function<std::string(const Node* node)>;
+  /*! \brief registered name */
+  std::string name;
+  /*!
+   * \brief The creator function
+   */
+  FCreate fcreator = nullptr;
+  /*!
+   * \brief The global key function.
+   */
+  FGlobalKey fglobal_key = nullptr;
+  // setter of creator
+  NodeFactoryReg& set_creator(FCreate f) {  // NOLINT(*)
+    this->fcreator = f;
+    return *this;
+  }
+  // setter of global key function
+  NodeFactoryReg& set_global_key(FGlobalKey f) {  // NOLINT(*)
+    this->fglobal_key = f;
+    return *this;
+  }
+  // global registry singleton
+  TVM_DLL static ::dmlc::Registry<::tvm::NodeFactoryReg> *Registry();
 };
 
+/*!
+ * \brief Register a Node type
+ * \note This is necessary to enable serialization of the Node.
+ */
 #define TVM_REGISTER_NODE_TYPE(TypeName)                                \
   static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::dmlc::Registry<::tvm::NodeFactoryReg>::Get()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
+      ::tvm::NodeFactoryReg::Registry()->__REGISTER__(TypeName::_type_key) \
+      .set_creator([](const std::string&) { return std::make_shared<TypeName>(); })
+
+
+#define TVM_STRINGIZE_DETAIL(x) #x
+#define TVM_STRINGIZE(x) TVM_STRINGIZE_DETAIL(x)
+#define TVM_DESCRIBE(...) describe(__VA_ARGS__ "\n\nFrom:" __FILE__ ":" TVM_STRINGIZE(__LINE__))
+/*!
+ * \brief Macro to include current line as string
+ */
+#define TVM_ADD_FILELINE "\n\nDefined in " __FILE__ ":L" TVM_STRINGIZE(__LINE__)
 
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry();
 
-#define TVM_EXTERNAL_REGISTER_NODE_TYPE(TypeName)                                \
-  static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \
-      ::tvm::GetTVMNodeFactoryRegistry()->__REGISTER__(TypeName::_type_key) \
-      .set_body([]() { return std::make_shared<TypeName>(); })
 }  // namespace tvm
 #endif  // TVM_BASE_H_
diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h
index c10a03e0604a..9466056a1282 100644
--- a/include/tvm/runtime/registry.h
+++ b/include/tvm/runtime/registry.h
@@ -47,6 +47,24 @@ class Registry {
   Registry& set_body(PackedFunc::FType f) {  // NOLINT(*)
     return set_body(PackedFunc(f));
   }
+  /*!
+   * \brief set the body of the function to be TypedPackedFunc.
+   *
+   * \code
+   *
+   * TVM_REGISTER_API("addone")
+   * .set_body_typed<int(int)>([](int x) { return x + 1; });
+   *
+   * \endcode
+   *
+   * \param f The body of the function.
+   * \tparam FType the signature of the function.
+   * \tparam FLambda The type of f.
+   */
+  template<typename FType, typename FLambda>
+  Registry& set_body_typed(FLambda f) {
+    return set_body(TypedPackedFunc<FType>(f).packed());
+  }
   /*!
    * \brief Register a function with given name
    * \param name The name of the function.
diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc
index bc4a803681e4..c680e82dd936 100644
--- a/nnvm/src/compiler/graph_runtime.cc
+++ b/nnvm/src/compiler/graph_runtime.cc
@@ -100,6 +100,6 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict")
     *rv = ret;
   });
 
-TVM_EXTERNAL_REGISTER_NODE_TYPE(NDArrayWrapperNode);
+TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode);
 }  // namespace compiler
 }  // namespace nnvm
diff --git a/src/api/api_base.cc b/src/api/api_base.cc
index 3583f42a00c9..a385d170070a 100644
--- a/src/api/api_base.cc
+++ b/src/api/api_base.cc
@@ -24,21 +24,13 @@ TVM_REGISTER_API("_raw_ptr")
   });
 
 TVM_REGISTER_API("_save_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = SaveJSON(args[0]);
-  });
+.set_body_typed<std::string(NodeRef)>(SaveJSON);
 
 TVM_REGISTER_API("_load_json")
-.set_body([](TVMArgs args,  TVMRetValue *ret) {
-    *ret = LoadJSON<NodeRef>(args[0]);
-  });
+.set_body_typed<NodeRef(std::string)>(LoadJSON<NodeRef>);
 
 TVM_REGISTER_API("_TVMSetStream")
 .set_body([](TVMArgs args,  TVMRetValue *ret) {
     TVMSetStream(args[0], args[1], args[2]);
   });
-
-TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry() {
-  return ::dmlc::Registry<::tvm::NodeFactoryReg>::Get();
-}
 }  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 9fb9143aa7f4..93e2defd5aef 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -20,6 +20,10 @@ DMLC_REGISTRY_ENABLE(::tvm::NodeFactoryReg);
 
 namespace tvm {
 
+::dmlc::Registry<NodeFactoryReg>* NodeFactoryReg::Registry() {
+  return ::dmlc::Registry<NodeFactoryReg>::Get();
+}
+
 inline std::string Type2String(const Type& t) {
   if (t.code()  ==Type::Handle) return "handle";
   std::ostringstream os;
@@ -115,6 +119,8 @@ using AttrMap = std::map<std::string, std::string>;
 struct JSONNode {
   // The type key of the data
   std::string type_key;
+  // The global key for global object
+  std::string global_key;
   // the attributes
   AttrMap attrs;
   // container keys
@@ -125,6 +131,9 @@ struct JSONNode {
   void Save(dmlc::JSONWriter *writer) const {
     writer->BeginObject();
     writer->WriteObjectKeyValue("type_key", type_key);
+    if (global_key.size() != 0) {
+      writer->WriteObjectKeyValue("global_key", global_key);
+    }
     if (attrs.size() != 0) {
       writer->WriteObjectKeyValue("attrs", attrs);
     }
@@ -140,9 +149,11 @@ struct JSONNode {
   void Load(dmlc::JSONReader *reader) {
     attrs.clear();
     data.clear();
+    global_key.clear();
     type_key.clear();
     dmlc::JSONObjectReadHelper helper;
     helper.DeclareOptionalField("type_key", &type_key);
+    helper.DeclareOptionalField("global_key", &global_key);
     helper.DeclareOptionalField("attrs", &attrs);
     helper.DeclareOptionalField("keys", &keys);
     helper.DeclareOptionalField("data", &data);
@@ -195,6 +206,12 @@ class JSONAttrGetter : public AttrVisitor {
       return;
     }
     node_->type_key = node->type_key();
+    // specially handle global object
+    auto* f = dmlc::Registry<NodeFactoryReg>::Find(node_->type_key);
+    if (f->fglobal_key != nullptr) {
+      node_->global_key = f->fglobal_key(node);
+      return;
+    }
     node_->attrs.clear();
     node_->data.clear();
     if (node->is_type<ArrayNode>()) {
@@ -403,7 +420,7 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
       auto* f = dmlc::Registry<NodeFactoryReg>::Find(jnode.type_key);
       CHECK(f != nullptr)
           << "Node type \'" << jnode.type_key << "\' is not registered in TVM";
-      nodes.emplace_back(f->body());
+      nodes.emplace_back(f->fcreator(jnode.global_key));
     } else {
       nodes.emplace_back(std::shared_ptr<Node>());
     }
@@ -415,7 +432,11 @@ std::shared_ptr<Node> LoadJSON_(std::string json_str) {
 
   for (size_t i = 0; i < nodes.size(); ++i) {
     setter.node_ = &jgraph.nodes[i];
-    setter.Set(nodes[i].get());
+    // do not need to recover content of global singleton object
+    // they are registered via the environment
+    if (setter.node_->global_key.length() == 0) {
+      setter.Set(nodes[i].get());
+    }
   }
   return nodes.at(jgraph.root);
 }
@@ -493,11 +514,14 @@ void InitNodeByPackedArgs(Node* n, const TVMArgs& args) {
 //   key1, value1, ..., key_n, value_n
 void MakeNode(const TVMArgs& args, TVMRetValue* rv) {
   std::string type_key = args[0];
+  std::string empty_str;
   auto* f = dmlc::Registry<NodeFactoryReg>::Find(type_key);
   CHECK(f != nullptr)
       << "Node type \'" << type_key << "\' is not registered in TVM";
   TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1);
-  std::shared_ptr<Node> n = f->body();
+  CHECK(f->fglobal_key == nullptr)
+      << "Cannot make node type \'" << type_key << "\' with global_key.";
+  std::shared_ptr<Node> n = f->fcreator(empty_str);
   if (n->derived_from<BaseAttrsNode>()) {
     static_cast<BaseAttrsNode*>(n.get())->InitByPackedArgs(kwargs);
   } else {

From a1688998f80df1fa44fd61bab2478042c3e9049a Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Fri, 24 Aug 2018 09:37:05 -0700
Subject: [PATCH 65/77] improve text summary (#1655)

---
 apps/android_rpc/README.md                | 10 +--
 apps/benchmark/README.md                  | 16 ++---
 python/tvm/rpc/client.py                  | 25 ++++---
 python/tvm/rpc/tornado_util.py            |  2 +
 python/tvm/rpc/tracker.py                 | 31 ++++++++-
 tests/python/unittest/test_runtime_rpc.py | 80 ++++++++++++++++++++++-
 tutorials/autotvm/tune_nnvm_arm.py        | 14 ++--
 7 files changed, 146 insertions(+), 32 deletions(-)

diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md
index eef22f3c7010..453263aa824e 100644
--- a/apps/android_rpc/README.md
+++ b/apps/android_rpc/README.md
@@ -104,11 +104,11 @@ You are supposed to find a free "android" in the queue status.
 ...
 
 Queue Status
-----------------------------
-key    	free	pending
-----------------------------
-android	1	0
-----------------------------
+-------------------------------
+key       total  free  pending
+-------------------------------
+android   1      1     0
+-------------------------------
 ```
 
 
diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md
index ee22f90dc435..95742e0decf4 100644
--- a/apps/benchmark/README.md
+++ b/apps/benchmark/README.md
@@ -40,14 +40,14 @@ python3 -m tvm.exec.rpc_tracker
   For our test environment, one sample output can be 
   ```bash
   Queue Status                
-  ------------------------------
-  key            free    pending    
-  ------------------------------
-  mate10pro      1       0   
-  p20pro         2       0  
-  pixel2         2       0 
-  rk3399         2       0
-  rasp3b         8       0
+  ----------------------------------
+  key          total  free  pending    
+  ----------------------------------
+  mate10pro    1      1     0
+  p20pro       2      2     0 
+  pixel2       2      2     0
+  rk3399       2      2     0
+  rasp3b       8      8     0
   ```
 
  4. Run benchmark  
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py
index ffbe6eeab6ee..ae44e5a79933 100644
--- a/python/tvm/rpc/client.py
+++ b/python/tvm/rpc/client.py
@@ -218,6 +218,9 @@ def summary(self):
     def text_summary(self):
         """Get a text summary of the tracker."""
         data = self.summary()
+
+        total_ct = {}
+
         res = ""
         res += "Server List\n"
         res += "----------------------------\n"
@@ -225,8 +228,12 @@ def text_summary(self):
         res += "----------------------------\n"
         for item in data["server_info"]:
             addr = item["addr"]
-            res += addr[0] + ":" + str(addr[1])+ "\t"
+            res += addr[0] + ":" + str(addr[1]) + "\t"
             res += item["key"] + "\n"
+            key = item['key'].split(':')[1]   # 'server:rasp3b' -> 'rasp3b'
+            if key not in total_ct:
+                total_ct[key] = 0
+            total_ct[key] += 1
         res += "----------------------------\n"
         res += "\n"
 
@@ -240,14 +247,16 @@ def text_summary(self):
             max_key_len = 0
 
         res += "Queue Status\n"
-        res += "----------------------------\n"
-        res += ("%%-%ds" % max_key_len + "\tfree\tpending\n") % 'key'
-        res += "----------------------------\n"
+        title = ("%%-%ds" % max_key_len + "   total  free  pending\n") % 'key'
+        separate_line = '-' * len(title) + '\n'
+        res += separate_line + title + separate_line
         for k in keys:
-            res += ("%%-%ds" % max_key_len + "\t%d\t%g\n") % \
-                   (k, queue_info[k]["free"], queue_info[k]["pending"])
-
-        res += "----------------------------\n"
+            total = total_ct.get(k, 0)
+            free, pending = queue_info[k]["free"], queue_info[k]["pending"]
+            if total or pending:
+                res += ("%%-%ds" % max_key_len + "   %-5d  %-4d  %-7d\n") % \
+                       (k, total, free, pending)
+        res += separate_line
         return res
 
     def request(self, key, priority=1, session_timeout=0, max_retry=5):
diff --git a/python/tvm/rpc/tornado_util.py b/python/tvm/rpc/tornado_util.py
index 00e1fd13865b..eafea2e85394 100644
--- a/python/tvm/rpc/tornado_util.py
+++ b/python/tvm/rpc/tornado_util.py
@@ -66,6 +66,8 @@ def _update_write(self):
         while self._pending_write:
             try:
                 msg = self._pending_write[0]
+                if self._sock is None:
+                    return
                 nsend = self._sock.send(msg)
                 if nsend != len(msg):
                     self._pending_write[0] = msg[nsend:]
diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py
index de39c97b5000..88868ad6e978 100644
--- a/python/tvm/rpc/tracker.py
+++ b/python/tvm/rpc/tracker.py
@@ -78,6 +78,16 @@ def request(self, user, priority, callback):
         """
         raise NotImplementedError()
 
+    def remove(self, value):
+        """Remove a resource in the scheduler
+
+        Parameters
+        ----------
+        value: object
+            The resource to remove
+        """
+        pass
+
     def summary(self):
         """Get summary information of the scheduler."""
         raise NotImplementedError()
@@ -108,6 +118,11 @@ def request(self, user, priority, callback):
         heapq.heappush(self._requests, (-priority, time.time(), callback))
         self._schedule()
 
+    def remove(self, value):
+        if value in self._values:
+            self._values.remove(value)
+            self._schedule()
+
     def summary(self):
         """Get summary information of the scheduler."""
         return {"free": len(self._values),
@@ -132,6 +147,7 @@ def __init__(self, tracker, sock, addr):
         # list of pending match keys that has not been used.
         self.pending_matchkeys = set()
         self._tracker._connections.add(self)
+        self.put_values = []
 
     def name(self):
         """name of connection"""
@@ -199,9 +215,11 @@ def call_handler(self, args):
             self.pending_matchkeys.add(matchkey)
             # got custom address (from rpc server)
             if args[3] is not None:
-                self._tracker.put(key, (self, args[3], port, matchkey))
+                value = (self, args[3], port, matchkey)
             else:
-                self._tracker.put(key, (self, self._addr[0], port, matchkey))
+                value = (self, self._addr[0], port, matchkey)
+            self._tracker.put(key, value)
+            self.put_values.append(value)
             self.ret_value(TrackerCode.SUCCESS)
         elif code == TrackerCode.REQUEST:
             key = args[1]
@@ -239,7 +257,7 @@ def _cb(value):
             self.close()
 
     def on_close(self):
-        self._tracker._connections.remove(self)
+        self._tracker.close(self)
 
     def on_error(self, err):
         logger.warning("%s: Error in RPC Tracker: %s", self.name(), err)
@@ -285,6 +303,13 @@ def request(self, key, user, priority, callback):
             self._scheduler_map[key] = self.create_scheduler(key)
         self._scheduler_map[key].request(user, priority, callback)
 
+    def close(self, conn):
+        self._connections.remove(conn)
+        if 'key' in conn._info:
+            key = conn._info['key'].split(':')[1]  # 'server:rasp3b' -> 'rasp3b'
+            for value in conn.put_values:
+                self._scheduler_map[key].remove(value)
+
     def stop(self):
         """Safely stop tracker."""
         for conn in list(self._connections):
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index 0de788068b6b..756b2867184d 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -1,10 +1,13 @@
 import tvm
 import os
 import logging
-import numpy as np
 import time
+import multiprocessing
+
+import numpy as np
 from tvm import rpc
 from tvm.contrib import util
+from tvm.rpc.tracker import Tracker
 
 
 def test_bigendian_rpc():
@@ -237,6 +240,79 @@ def addone(x):
     rev = client.download("dat.bin")
     assert rev == blob
 
+def test_rpc_tracker_register():
+    # test registration
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    time.sleep(1)
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    remote = client.request(device_key)
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    del remote
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 1
+
+    server.terminate()
+    time.sleep(1)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+
+    tracker.terminate()
+
+def test_rpc_tracker_request():
+    # test concurrent request
+    tracker = Tracker('localhost', port=9000, port_end=10000)
+    device_key = 'test_device'
+    server = rpc.Server('localhost', port=9000, port_end=10000,
+                        key=device_key,
+                        tracker_addr=(tracker.host, tracker.port))
+    client = rpc.connect_tracker(tracker.host, tracker.port)
+
+    def target(host, port, device_key, timeout):
+        client = rpc.connect_tracker(host, port)
+        remote = client.request(device_key, session_timeout=timeout)
+        while True:
+            pass
+        remote.cpu()
+
+    proc1 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 4))
+    proc2 = multiprocessing.Process(target=target,
+                                    args=(tracker.host, tracker.port, device_key, 200))
+    proc1.start()
+    time.sleep(0.5)
+    proc2.start()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 1
+
+    proc1.terminate()
+    proc1.join()
+    time.sleep(0.5)
+
+    summary = client.summary()
+    assert summary['queue_info'][device_key]['free'] == 0
+    assert summary['queue_info'][device_key]['pending'] == 0
+
+    proc2.terminate()
+    proc2.join()
+    server.terminate()
+    tracker.terminate()
+
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
@@ -248,3 +324,5 @@ def addone(x):
     test_rpc_array()
     test_rpc_simple()
     test_local_func()
+    test_rpc_tracker_register()
+    test_rpc_tracker_request()
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index e85786037477..a080681f4ca3 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -151,13 +151,13 @@ def get_network(name, batch_size):
 # .. code-block:: bash
 #
 #    Queue Status
-#    ----------------------------
-#    key          free    pending
-#    ----------------------------
-#    mate10pro    2       0
-#    rk3399       2       0
-#    rpi3b        11      0
-#    ----------------------------
+#    ----------------------------------
+#    key          total  free  pending
+#    ----------------------------------
+#    mate10pro    2      2     0
+#    rk3399       2      2     0
+#    rpi3b        11     11    0
+#    ----------------------------------
 
 ###########################################
 # Set Tuning Options

From cfc848a70d29d61e37026750692606bb9482a54a Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Fri, 24 Aug 2018 21:06:15 -0700
Subject: [PATCH 66/77] [LANG] Improve serializer (#1658)

---
 src/lang/attrs.cc                             | 2 ++
 src/lang/reflection.cc                        | 5 +++++
 tests/python/unittest/test_lang_reflection.py | 3 ++-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc
index 0d8d1f3c9ece..360c5b2e9833 100644
--- a/src/lang/attrs.cc
+++ b/src/lang/attrs.cc
@@ -42,4 +42,6 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable)
 
 TVM_REGISTER_NODE_TYPE(DictAttrsNode);
 
+TVM_REGISTER_NODE_TYPE(AttrFieldInfoNode);
+
 }  // namespace tvm
diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc
index 93e2defd5aef..a33594107a69 100644
--- a/src/lang/reflection.cc
+++ b/src/lang/reflection.cc
@@ -236,6 +236,11 @@ class JSONAttrGetter : public AttrVisitor {
             node_index_->at(kv.second.get()));
       }
     } else {
+      // do not need to recover content of global singleton object
+      // they are registered via the environment
+      auto* f = dmlc::Registry<NodeFactoryReg>::Find(node->type_key());
+      if (f != nullptr && f->fglobal_key != nullptr) return;
+      // recursively index normal object.
       node->VisitAttrs(this);
     }
   }
diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py
index 2ba67b8d9c86..9678fff8ef9b 100644
--- a/tests/python/unittest/test_lang_reflection.py
+++ b/tests/python/unittest/test_lang_reflection.py
@@ -58,7 +58,8 @@ def test_make_attrs():
 
     dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0))
     assert dattr.x.value == 1
-
+    datrr = tvm.load_json(tvm.save_json(dattr))
+    assert dattr.name.value == "xyz"
 
 
 def test_make_sum():

From 37988ad3a252a19ae490d7f58938c6ca9843dce1 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Sat, 25 Aug 2018 13:15:35 +0900
Subject: [PATCH 67/77] [RUNTIME][OPENCL] delay device check (#1657)

---
 docs/deploy/aws_fpga.md             | 4 ++--
 src/runtime/opencl/opencl_module.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/deploy/aws_fpga.md b/docs/deploy/aws_fpga.md
index 7554ce7f64cd..86918ca90a80 100644
--- a/docs/deploy/aws_fpga.md
+++ b/docs/deploy/aws_fpga.md
@@ -108,10 +108,10 @@ python run.py
 Synthesis
 ---------
 
-- Run synthesis with the following script. `XCL_EMULATION_MODE` must be set to 1 at this stage.
+- Run synthesis with the following script.
 
 ```bash
-export XCL_EMULATION_MODE=1
+unset XCL_EMULATION_MODE
 export XCL_TARGET=hw
 
 python build.py
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 6d392036cc56..ed5c3c235ac1 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -34,6 +34,7 @@ class OpenCLWrappedFunc {
   void operator()(TVMArgs args,
                   TVMRetValue* rv,
                   void** void_args) const {
+    CHECK(w_->context != nullptr) << "No OpenCL device";
     cl::OpenCLThreadEntry* t = w_->GetThreadEntry();
     // get the kernel from thread local kernel table.
     if (entry_.kernel_id >= t->kernel_table.size()) {
@@ -157,7 +158,6 @@ std::string OpenCLModuleNode::GetSource(const std::string& format) {
 void OpenCLModuleNode::Init() {
   workspace_ = GetGlobalWorkspace();
   workspace_->Init();
-  CHECK(workspace_->context != nullptr) << "No OpenCL device";
   device_built_flag_.resize(workspace_->devices.size(), false);
   // initialize the kernel id, need to lock global table.
   std::lock_guard<std::mutex> lock(workspace_->mu);

From a03c60ba1c83d39b857b827f144bf1b371d0b0c7 Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Sat, 25 Aug 2018 17:05:06 +0900
Subject: [PATCH 68/77] [CODEGEN][AOCL] Add math intrinsic rules (#1653)

* [CODEGEN][AOCL] Add math intrinsic rules

* introduce aocl_emu target for AOCL emulation

* rename aocl_emu with aocl_sw_emu

* update docs
---
 docs/deploy/aocl_fpga.md                    |  4 +-
 python/tvm/_ffi/runtime_ctypes.py           |  1 +
 src/codegen/build_module.cc                 |  2 +-
 src/codegen/codegen_aocl.cc                 | 23 +++---
 src/codegen/intrin_rule_aocl.cc             | 82 +++++++++++++++++++++
 tests/python/integration/test_ewise_fpga.py |  4 +-
 topi/tests/python/test_topi_math.py         |  3 +-
 7 files changed, 103 insertions(+), 16 deletions(-)
 create mode 100644 src/codegen/intrin_rule_aocl.cc

diff --git a/docs/deploy/aocl_fpga.md b/docs/deploy/aocl_fpga.md
index bd0dae97879d..f29fc9ef1ace 100644
--- a/docs/deploy/aocl_fpga.md
+++ b/docs/deploy/aocl_fpga.md
@@ -12,7 +12,7 @@ We use two python scripts for this tutorial.
 import tvm
 
 tgt_host="llvm"
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 n = tvm.var("n")
 A = tvm.placeholder((n,), name='A')
@@ -38,7 +38,7 @@ import tvm
 import numpy as np
 import os
 
-tgt="aocl -device=s5_ref -mattr=emulator"
+tgt="aocl_sw_emu"
 
 fadd = tvm.module.load("myadd.so")
 fadd_dev = tvm.module.load("myadd.aocx")
diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py
index 4c36e82a81ec..4f94e0e62d0a 100644
--- a/python/tvm/_ffi/runtime_ctypes.py
+++ b/python/tvm/_ffi/runtime_ctypes.py
@@ -115,6 +115,7 @@ class TVMContext(ctypes.Structure):
         'cl': 4,
         'opencl': 4,
         'aocl' : 5,
+        'aocl_sw_emu' : 5,
         'sdaccel': 6,
         'vulkan': 7,
         'metal': 8,
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 0cb0ec3cc4be..fef5a28b1d21 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -92,7 +92,7 @@ Target CreateTarget(const std::string& target_name,
     t->device_type = kDLOpenCL;
     t->keys_array.push_back(ir::StringImm::make("sdaccel"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
-  } else if (target_name == "aocl") {
+  } else if (target_name == "aocl" || target_name == "aocl_sw_emu") {
     t->device_type = kDLAOCL;
     t->keys_array.push_back(ir::StringImm::make("aocl"));
     t->keys_array.push_back(ir::StringImm::make("hls"));
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 506a4f7ed92c..6ae89fecf6bf 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -13,7 +13,8 @@
 namespace tvm {
 namespace codegen {
 
-runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
+runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str,
+                          bool emulation) {
   // Get code.
   using tvm::runtime::Registry;
   bool output_ssa = false;
@@ -31,17 +32,14 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
   runtime::SaveBinaryToFile("aocl.cl", code.c_str());
 
   // Compile the .cl file.
+  std::string cmd = "aoc aocl.cl";
   Target target = Target::create(target_str);
-  if (target->device_name == "") {
-    LOG(FATAL) << "AOCL device name not specified in build target.";
+  if (target->device_name != "") {
+    cmd += " -board=" + target->device_name;
   }
-  std::string cmd = "aoc aocl.cl";
-  for (std::string option : target->options()) {
-    if (option == "-mattr=emulator") {
-      cmd += " -march=emulator";
-    }
+  if (emulation) {
+    cmd += " -march=emulator";
   }
-  cmd += " -board=" + target->device_name;
   if (system(cmd.c_str()) != 0) {
     LOG(FATAL) << "OpenCL offline compilation error.";
   }
@@ -55,7 +53,12 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str) {
 
 TVM_REGISTER_API("codegen.build_aocl")
 .set_body([](TVMArgs args, TVMRetValue* rv) {
-    *rv = BuildAOCL(args[0], args[1]);
+    *rv = BuildAOCL(args[0], args[1], false);
+  });
+
+TVM_REGISTER_API("codegen.build_aocl_sw_emu")
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = BuildAOCL(args[0], args[1], true);
   });
 
 }  // namespace codegen
diff --git a/src/codegen/intrin_rule_aocl.cc b/src/codegen/intrin_rule_aocl.cc
new file mode 100644
index 000000000000..fc5dbe741d63
--- /dev/null
+++ b/src/codegen/intrin_rule_aocl.cc
@@ -0,0 +1,82 @@
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file intrin_rule_aocl.cc
+ * \brief AOCL intrinsic rules.
+ */
+#include "intrin_rule.h"
+
+namespace tvm {
+namespace codegen {
+namespace intrin {
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.floor")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.ceil")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.trunc")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.fabs")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.round")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.exp")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.log")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.tanh")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.sqrt")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.pow")
+.set_body(DispatchExtern<Direct>);
+
+TVM_REGISTER_GLOBAL("tvm.intrin.rule.aocl_sw_emu.popcount")
+.set_body(DispatchExtern<Direct>);
+
+
+}  // namespace intrin
+}  // namespace codegen
+}  // namespace tvm
diff --git a/tests/python/integration/test_ewise_fpga.py b/tests/python/integration/test_ewise_fpga.py
index 0abefff02778..2524e2d230b1 100644
--- a/tests/python/integration/test_ewise_fpga.py
+++ b/tests/python/integration/test_ewise_fpga.py
@@ -44,7 +44,7 @@ def check_device(device, host="llvm"):
     if "AWS_PLATFORM" in os.environ:
         check_device("sdaccel -device=" + os.environ.get("AWS_PLATFORM"))
 
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 def test_multi_kernel():
     # graph
@@ -82,7 +82,7 @@ def check_device(device, host="llvm"):
             d.asnumpy(), a.asnumpy() * 2 + b.asnumpy(), rtol=1e-5)
 
     check_device("sdaccel")
-    check_device("aocl -device=s5_ref -mattr=emulator")
+    check_device("aocl_sw_emu")
 
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py
index 8d82dbe5bf82..5d606d507387 100644
--- a/topi/tests/python/test_topi_math.py
+++ b/topi/tests/python/test_topi_math.py
@@ -39,7 +39,8 @@ def check_device(device):
             foo(a, b)
             np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5, atol=1e-5)
 
-        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel']:
+        for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel',
+                       'aocl_sw_emu']:
             check_device(device)
 
 

From a9e0567df03d36a8e363918d224f3102f8423f9a Mon Sep 17 00:00:00 2001
From: Siju <sijusamuel@gmail.com>
Date: Sun, 26 Aug 2018 12:20:55 +0530
Subject: [PATCH 69/77] [FRONTEND][ONNX] HardSigmoid, min, max, mean ops support
 (#1645)

---
 nnvm/python/nnvm/frontend/onnx.py             |  56 +++++++-
 .../python/frontend/onnx/test_forward.py      | 125 ++++++++++++++++++
 2 files changed, 176 insertions(+), 5 deletions(-)

diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py
index f62202a37dff..5127dfd299bd 100644
--- a/nnvm/python/nnvm/frontend/onnx.py
+++ b/nnvm/python/nnvm/frontend/onnx.py
@@ -529,6 +529,53 @@ def _impl_v1(cls, inputs, attr, params):
         return _sym.lrn(inputs[0], size=nsize, axis=axis,
                         alpha=alpha, beta=beta, bias=bias)
 
+class Maximum(OnnxOpConverter):
+    """ Operator converter for Maximum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _max = inputs[0]
+        for i in range(1, len(inputs)):
+            _max = AttrCvt(op_name='broadcast_max')([_max, inputs[i]], {})
+        return _max
+
+class Minimum(OnnxOpConverter):
+    """ Operator converter for Minimum.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        _min = inputs[0]
+        for i in range(1, len(inputs)):
+            _min = AttrCvt(op_name='broadcast_min')([_min, inputs[i]], {})
+        return _min
+
+class Mean(OnnxOpConverter):
+    """ Operator converter for Mean.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        if not isinstance(inputs, list) or len(inputs) < 2:
+            raise ValueError("Expect minimum 2 inputs")
+        count = len(inputs)
+        _sum = inputs[0]
+        for i in range(1, count):
+            _sum = AttrCvt(op_name='broadcast_add')([_sum, inputs[i]], {})
+        return _sum / count
+
+class HardSigmoid(OnnxOpConverter):
+    """ Operator converter for HardSigmoid.
+    """
+    @classmethod
+    def _impl_v1(cls, inputs, attr, params):
+        alpha = attr.get('alpha', 0.2)
+        beta = attr.get('beta', 0.5)
+        transformX = (inputs[0] * alpha) + beta
+        attr = {'a_min':0, 'a_max':1}
+        return AttrCvt(op_name='clip')([transformX], attr)
 
 # compatible operators that do NOT require any conversion.
 _identity_list = []
@@ -557,7 +604,6 @@ def _get_convert_map(opset):
         # 'MeanVarianceNormalization'
         # 'Crop'
         # 'Embedding'
-        # 'Upsample'
         'Upsample' : Upsample.get_converter(opset),
         'SpatialBN': BatchNorm.get_converter(opset),
 
@@ -591,11 +637,11 @@ def _get_convert_map(opset):
         'Pow': Renamer('broadcast_pow'),
         'PRelu': Prelu.get_converter(opset),
         'Sigmoid': Renamer('sigmoid'),
-        # 'HardSigmoid'
-        # 'Max' : this is the elemwise maximum
-        # 'Min' : this is the elemwise minimum
+        'HardSigmoid': HardSigmoid.get_converter(opset),
+        'Max': Maximum.get_converter(opset),
+        'Min': Minimum.get_converter(opset),
         'Sum': Sum.get_converter(opset),
-        # 'Mean'
+        'Mean': Mean.get_converter(opset),
         'Clip': AttrCvt('clip', transforms={'min': 'a_min', 'max': 'a_max'}),
         # softmax default axis is different in onnx
         'Softmax': AttrCvt('softmax', {'axis': ('axis', 1)}),
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py
index 3f2fbb144289..49cf58fa1aa5 100644
--- a/nnvm/tests/python/frontend/onnx/test_forward.py
+++ b/nnvm/tests/python/frontend/onnx/test_forward.py
@@ -426,6 +426,127 @@ def test_upsample():
     _test_upsample_nearest()
     _test_upsample_bilinear()
 
+def verify_min(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.min((a_np1, a_np2, a_np3), axis=0)
+
+    min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([min_node],
+                              "Min_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Min_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_min():
+    verify_min((1, 3, 20, 20))
+    verify_min((20, 20))
+
+def verify_max(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.max((a_np1, a_np2, a_np3), axis=0)
+
+    max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([max_node],
+                              "Max_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Max_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_max():
+    verify_max((1, 3, 20, 20))
+    verify_max((20, 20))
+
+def verify_mean(input_dim):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np2 = np.random.uniform(size=input_dim).astype(dtype)
+    a_np3 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.mean((a_np1, a_np2, a_np3), axis=0)
+
+    mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"])
+
+    graph = helper.make_graph([mean_node],
+                              "Mean_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np2",
+                                            TensorProto.FLOAT, list(input_dim)),
+                                        helper.make_tensor_value_info("a_np3",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='Mean_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_mean():
+    verify_mean((1, 3, 20, 20))
+    verify_mean((20, 20))
+
+def verify_hardsigmoid(input_dim, alpha, beta):
+    dtype = 'float32'
+
+    a_np1 = np.random.uniform(size=input_dim).astype(dtype)
+
+    b_np = np.clip(a_np1 * alpha + beta, 0, 1)
+
+    hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta)
+
+    graph = helper.make_graph([hardsigmoid_node],
+                              "HardSigmoid_test",
+                              inputs = [helper.make_tensor_value_info("a_np1",
+                                            TensorProto.FLOAT, list(input_dim))],
+                              outputs = [helper.make_tensor_value_info("out",
+                                            TensorProto.FLOAT, list(b_np.shape))])
+
+    model = helper.make_model(graph, producer_name='HardSigmoid_test')
+
+    for target, ctx in ctx_list():
+        tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape)
+        np.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5)
+
+def test_forward_hardsigmoid():
+    verify_hardsigmoid((1, 3, 20, 20), 0.5, 0.6)
+    verify_hardsigmoid((20, 20), 0.3, 0.4)
 
 if __name__ == '__main__':
     # verify_super_resolution_example()
@@ -445,3 +566,7 @@ def test_upsample():
     test_gather()
     test_lrn()
     test_upsample()
+    test_forward_min()
+    test_forward_max()
+    test_forward_mean()
+    test_forward_hardsigmoid()

From 463e5c3874ac1ccba6be75d9bfbcc984921caaba Mon Sep 17 00:00:00 2001
From: Tatsuya Nishiyama <nishiyama.tatsuya0@gmail.com>
Date: Tue, 28 Aug 2018 05:05:50 +0900
Subject: [PATCH 70/77] [FRONTEND][TENSORFLOW] fix the conversion of sum and
 add testcase for it (#1654)

* [TENSORFLOW] fix the conversion of sum and add testcase for it

* delete checking type of axis and divide reduce test
---
 nnvm/python/nnvm/frontend/tensorflow.py       |  2 ++
 .../frontend/tensorflow/test_forward.py       | 21 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py
index 65dd3619b5b2..6be5333ccee6 100644
--- a/nnvm/python/nnvm/frontend/tensorflow.py
+++ b/nnvm/python/nnvm/frontend/tensorflow.py
@@ -444,6 +444,8 @@ def _impl(inputs, attr, params):
 def _sum():
     def _impl(inputs, attr, params):
         axis = params.pop(inputs[1].list_output_names()[0]).asnumpy()
+        # convert to tuple for preventing invalid parameter format error
+        axis = tuple(axis)
         return AttrCvt(
             op_name='sum',
             extras={'axis': axis},
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py
index 6fa020a03444..e0e18d1bdb06 100644
--- a/nnvm/tests/python/frontend/tensorflow/test_forward.py
+++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py
@@ -348,6 +348,26 @@ def test_forward_argminmax():
         _test_argx(tf.argmax, data=data, axis=axis)
         _test_argx(tf.argmin, data=data, axis=axis)
 
+#######################################################################
+# Reduce
+# ------
+
+def _test_reduce(func, data, **kwargs):
+    """ One iteration of a reduce operation"""
+
+    with tf.Graph().as_default():
+        inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0")
+        func(inp, name="reducex0", **kwargs)
+
+        compare_tf_with_tvm(data, 'c0:0', 'reducex0:0')
+
+def test_forward_reduce():
+    data = np.random.uniform(size=(8,4,9)).astype('float32')
+    _test_reduce(tf.reduce_sum, data=data)
+    _test_reduce(tf.reduce_sum, data=data, axis=0)
+    _test_reduce(tf.reduce_sum, data=data, axis=(0,1))    
+
+
 #######################################################################
 # Variable
 # --------
@@ -844,6 +864,7 @@ def test_forward_l2_normalize():
     test_forward_squeeze()
     test_forward_sigmoid()
     test_forward_argminmax()
+    test_forward_reduce()
     if tf.__version__ == '1.4.1':
         _test_forward_concat_v2()
     test_forward_multi_input()

From 1d7c52f2b23a53c14dfc3c69f4ef3e57d1e557a7 Mon Sep 17 00:00:00 2001
From: Jian Weng <werefluke@gmail.com>
Date: Mon, 27 Aug 2018 13:33:27 -0700
Subject: [PATCH 71/77] add docstring skip in hybrid script (#1668)

* add docstring skip in hybrid script

* fix lint
---
 python/tvm/hybrid/parser.py                 | 4 ++--
 python/tvm/hybrid/util.py                   | 6 ++++++
 tests/python/unittest/test_hybrid_script.py | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py
index 1e532367a321..cf21ea950549 100644
--- a/python/tvm/hybrid/parser.py
+++ b/python/tvm/hybrid/parser.py
@@ -3,7 +3,7 @@
 import ast
 import operator
 import sys
-from .util import make_nop, halide_imm_types
+from .util import make_nop, halide_imm_types, is_docstring
 from .intrin import LOOP_INTRIN, MATH_INTRIN
 from .var_decl import determine_variable_usage
 from ..api import thread_axis
@@ -15,7 +15,7 @@
 
 def list_to_block(visit, lst):
     """Convert a list of Python IR nodes to HalideIR Block"""
-    lst = list(map(visit, lst))
+    lst = [visit(stmt) for stmt in lst if not is_docstring(stmt)]
     lst = [stmt for stmt in lst if not _ir_pass.Equal(stmt, make_nop())]
     if not lst:
         return make_nop()
diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py
index 43d26e859560..2a43957e9706 100644
--- a/python/tvm/hybrid/util.py
+++ b/python/tvm/hybrid/util.py
@@ -1,5 +1,6 @@
 """Internal utilities for parsing Python subset to HalideIR"""
 
+import ast
 import inspect
 import numpy
 from .intrin import HYBRID_GLOBALS
@@ -22,6 +23,11 @@ def make_nop():
     return _make.Evaluate(_api.const(0, dtype='int32'))
 
 
+def is_docstring(node):
+    """Checks if a Python AST node is a docstring"""
+    return isinstance(node, ast.Expr) and isinstance(node.value, ast.Str)
+
+
 def _pruned_source(func):
     """Prune source code's extra leading spaces"""
     lines = inspect.getsource(func).split('\n')
diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py
index 0f500d7c704f..ef0bcf8f72e5 100644
--- a/tests/python/unittest/test_hybrid_script.py
+++ b/tests/python/unittest/test_hybrid_script.py
@@ -43,6 +43,7 @@ def tvm_val_2_py_val(val):
 
 @script
 def outer_product(n, m, a, b, c):
+    """This is a simple outer product"""
     for i in range(n):
         for j in range(m):
             c[i, j] = a[i] * b[j]

From ea01417671084988e8fe378ab114406be069ab9e Mon Sep 17 00:00:00 2001
From: MORINAGA <34588258+imorinaga@users.noreply.github.com>
Date: Tue, 28 Aug 2018 14:02:24 +0900
Subject: [PATCH 72/77] [DOCS][NNVM] Delete duplicated tensor operators from
 list (#1669)

---
 docs/nnvm_top.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst
index 927990647a69..663c85ac789e 100644
--- a/docs/nnvm_top.rst
+++ b/docs/nnvm_top.rst
@@ -136,11 +136,9 @@ This level enables typical convnet models.
    nnvm.symbol.collapse_sum
    nnvm.symbol.broadcast_equal
    nnvm.symbol.broadcast_greater_equal
-   nnvm.symbol.broadcast_greater_equal
    nnvm.symbol.broadcast_greater
    nnvm.symbol.broadcast_left_shift
    nnvm.symbol.broadcast_less_equal
-   nnvm.symbol.broadcast_less_equal
    nnvm.symbol.broadcast_less
    nnvm.symbol.broadcast_max
    nnvm.symbol.broadcast_min
@@ -252,11 +250,9 @@ Detailed Definitions
 .. autofunction:: nnvm.symbol.collapse_sum
 .. autofunction:: nnvm.symbol.broadcast_equal
 .. autofunction:: nnvm.symbol.broadcast_greater_equal
-.. autofunction:: nnvm.symbol.broadcast_greater_equal
 .. autofunction:: nnvm.symbol.broadcast_greater
 .. autofunction:: nnvm.symbol.broadcast_left_shift
 .. autofunction:: nnvm.symbol.broadcast_less_equal
-.. autofunction:: nnvm.symbol.broadcast_less_equal
 .. autofunction:: nnvm.symbol.broadcast_less
 .. autofunction:: nnvm.symbol.broadcast_max
 .. autofunction:: nnvm.symbol.broadcast_min

From 38203a860b45616270bdd219ac181d51d18ad060 Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Tue, 28 Aug 2018 17:01:57 -0500
Subject: [PATCH 73/77] Fix incorrect stride in conv2d_nhwc_python (#1670)

---
 topi/python/topi/testing/conv2d_nhwc_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index 880088a6f89f..461d7a6a0e06 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -63,5 +63,5 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
                     apad = at[n, c]
                 out = scipy.signal.convolve2d(
                     apad, np.rot90(np.rot90(wt[f, c])), mode='valid')
-                bt[n, f] += out[::stride, ::stride]
+                bt[n, f] += out[::stride_h, ::stride_w]
     return bt.transpose((0, 2, 3, 1))

From 12839e6d2bbe6640c72019c604f3c59656970f77 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Tue, 28 Aug 2018 22:56:49 -0700
Subject: [PATCH 74/77] [AUTOTVM] Decouple build and run in measurement (#1661)

---
 docs/api/python/autotvm.rst                   |   5 +
 python/tvm/autotvm/__init__.py                |   3 +-
 python/tvm/autotvm/measure/__init__.py        |   8 +-
 python/tvm/autotvm/measure/local_executor.py  |  14 +-
 python/tvm/autotvm/measure/measure.py         | 251 +++--
 python/tvm/autotvm/measure/measure_methods.py | 896 ++++++++++--------
 python/tvm/autotvm/tuner/ga_tuner.py          |   2 +-
 .../tvm/autotvm/tuner/sa_model_optimizer.py   |   2 +-
 tests/python/integration/test_tuning.py       |  41 +-
 tests/python/unittest/test_autotvm_common.py  |  19 +
 .../python/unittest/test_autotvm_database.py  | 151 +--
 tests/python/unittest/test_autotvm_measure.py |  97 ++
 topi/recipe/gemm/gemm_int8.py                 |   7 +-
 tutorials/autotvm/tune_conv2d_cuda.py         |  12 +-
 tutorials/autotvm/tune_nnvm_arm.py            | 123 +--
 tutorials/autotvm/tune_simple_template.py     |   9 +-
 16 files changed, 880 insertions(+), 760 deletions(-)
 create mode 100644 tests/python/unittest/test_autotvm_measure.py

diff --git a/docs/api/python/autotvm.rst b/docs/api/python/autotvm.rst
index f03406dbc720..93d6905077fb 100644
--- a/docs/api/python/autotvm.rst
+++ b/docs/api/python/autotvm.rst
@@ -16,6 +16,11 @@ tvm.autotvm.measure
 
 .. autofunction:: tvm.autotvm.measure.create_measure_batch
 
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalBuilder
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.RPCRunner
+
+.. autoclass:: tvm.autotvm.measure.measure_methods.LocalRunner
 
 tvm.autotvm.tuner
 ~~~~~~~~~~~~~~~~~
diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py
index 625b50c10853..7170dbdd8565 100644
--- a/python/tvm/autotvm/__init__.py
+++ b/python/tvm/autotvm/__init__.py
@@ -22,7 +22,8 @@
 from . import tophub
 
 # some shortcuts
-from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo
+from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, \
+    LocalBuilder, LocalRunner, RPCRunner
 from .tuner import callback
 from .task import template, get_config, create, ConfigSpace, ConfigEntity, \
     register_topi_compute, register_topi_schedule, \
diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py
index 880dfd1ffe29..8a6126641a99 100644
--- a/python/tvm/autotvm/measure/__init__.py
+++ b/python/tvm/autotvm/measure/__init__.py
@@ -1,7 +1,7 @@
 """Distributed executor infrastructure to scale up the tuning"""
 
-from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option
-from .measure_methods import request_remote, check_remote, create_measure_batch, rpc
-
+from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option, \
+    create_measure_batch
+from .measure_methods import LocalBuilder, LocalRunner, RPCRunner, request_remote
+from .executor import Executor
 from .local_executor import LocalExecutor
-from .executor import Future, Executor
diff --git a/python/tvm/autotvm/measure/local_executor.py b/python/tvm/autotvm/measure/local_executor.py
index 55f1dc75fc5c..63d995c3580c 100644
--- a/python/tvm/autotvm/measure/local_executor.py
+++ b/python/tvm/autotvm/measure/local_executor.py
@@ -37,7 +37,8 @@ def _execute_func(func, queue, args, kwargs):
         res = exc
     queue.put(res)
 
-def timeout_monitor(queue, timeout, func, args, kwargs):
+
+def call_with_timeout(queue, timeout, func, args, kwargs):
     """A wrapper to support timeout of a function call"""
 
     # start a new process for timeout (cannot use thread because we have c function)
@@ -45,17 +46,12 @@ def timeout_monitor(queue, timeout, func, args, kwargs):
     p.start()
     p.join(timeout=timeout)
 
-    alive = p.is_alive()
+    queue.put(executor.TimeoutError())
+
     kill_child_processes(p.pid)
     p.terminate()
     p.join()
 
-    if alive:
-        queue.put(executor.TimeoutError())
-    else:
-        if queue.empty():
-            queue.put(executor.ExecutionError("Fatal error in local executor"))
-
 
 class LocalFuture(executor.Future):
     """Local wrapper for the future
@@ -134,7 +130,7 @@ def submit(self, func, *args, **kwargs):
             return LocalFutureNoFork(func(*args, **kwargs))
 
         queue = Queue(2)
-        process = Process(target=timeout_monitor,
+        process = Process(target=call_with_timeout,
                           args=(queue, self.timeout, func, args, kwargs))
         process.start()
         return LocalFuture(process, queue)
diff --git a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py
index 2d780eeaf004..38b5f99eacb9 100644
--- a/python/tvm/autotvm/measure/measure.py
+++ b/python/tvm/autotvm/measure/measure.py
@@ -1,5 +1,6 @@
 # pylint: disable=pointless-string-statement,consider-using-enumerate,invalid-name
 """User facing API for specifying how to measure the generated code"""
+import multiprocessing
 from collections import namedtuple
 
 class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
@@ -16,6 +17,7 @@ class MeasureInput(namedtuple("MeasureInput", ["target", "task", "config"])):
         Specific configuration.
     """
 
+
 class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost", "timestamp"])):
     """
     Stores all the results of a measurement
@@ -23,8 +25,8 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
     Parameters
     ----------
     costs: Array of float or Array of Exception
-        If no error occurs for this measurement, it is an array of measured running times.
-        If some error occurs during the measurement, it is an array of the exception objections.
+        If no error occurs during measurement, it is an array of measured running times.
+        If an error occurs during measurement, it is an array of the exception objections.
     error_no: int
         Denote error type, defined by MeasureErrorNo
     all_cost: float
@@ -37,92 +39,185 @@ class MeasureResult(namedtuple("MeasureResult", ["costs", "error_no", "all_cost"
 class MeasureErrorNo(object):
     """Error type for MeasureResult"""
     NO_ERROR = 0              # no error
-    INSTANTIATION_ERROR = 1   # error when calling template function
+    INSTANTIATION_ERROR = 1   # actively detected error in instantiating a template with a config
     COMPILE_HOST = 2          # error when compiling code on host (e.g. tvm.build)
-    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. opencl JIT on device)
+    COMPILE_DEVICE = 3        # error when compiling code on device (e.g. OpenCL JIT on the device)
     RUNTIME_DEVICE = 4        # error when run program on device
     WRONG_ANSWER = 5          # answer is wrong when compared to a golden output
-    FLEET_ERROR = 6           # error of measure infrastructure
+    BUILD_TIMEOUT = 6         # timeout during compilation
+    RUN_TIMEOUT = 7           # timeout during run
+    UNKNOWN_ERROR = 8         # unknown error
+
 
+class Builder(object):
+    """Builder that builds programs in tuning
 
-def measure_option(measure_func,
-                   number=1,
-                   repeat=1,
-                   timeout=60,
-                   n_parallel=1,
-                   do_fork=True,
-                   build_func='default',
-                   check_correctness=False,
-                   replay_db=None):
-    """Configure how to do measurement
+    Parameters
+    ----------
+    timeout: float, optional
+        The timeout of a build task
+    n_parallel: int, optional
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=10, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.build_kwargs = {}
+        self.task = None
+
+    def set_task(self, task, build_kwargs=None):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        build_kwargs: dict, optional
+            The additional kwargs for build function
+        """
+        self.task = task
+        self.build_kwargs = build_kwargs
+
+    def build(self, measure_inputs):
+        """Build programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The measure input
+
+        Returns
+        -------
+        build_results: List of BuildResult
+            The build result.
+        """
+        raise NotImplementedError()
+
+
+class Runner(object):
+    """Runner that runs and measures the time cost of a generated program in tuning
 
     Parameters
     ----------
-    measure_func: str or callable
-        'local': use the local device for measurement. The tuner will start a tracker
-        and a RPC server silently for the user.
-
-        callable: It is a callable function for measurement.
-                  See the return value of measure/measure_methods.py::rpc for example.
-    number : int, optional
-        Number of times to do the measurement for average
-    repeat : int, optional
-        Number of times to repeat the measurement.
-        In total, the generated code will be run (1 + number x repeat) times,
-        where the first one is warm up. The returned result contains `repeat` costs,
-        each of which is the average of `number` test run.
-    timeout: int, optional
-        Timeout for a whole batch. TimeoutError will be returned as the result if a
-        task timeouts.
+    timeout: float, optional
+        The timeout of a run task
     n_parallel: int, optional
-        The number of measurement task that can run in parallel.
-        Set this according to the number of cpu cores (for compilation) and
-        the number of devices you have (for measuring generate code).
-    do_fork: bool, optional
-        Whether use multiprocessing (based on fork) for running measure jobs in parallel.
-        Set this to False if you want to debug (see trackback) or using fork is not suitable.
-        NOTE: If this is False, parallel and timeout do not work.
-    build_func: str or callable, optional
-        'default': call default builder. This works for normal target (llvm, cuda)
-
-        'ndk': use Android NDK to create shared library. Use this for android target.
-
-        callable: customized build function for other backends (e.g. VTA).
-                  See measure/measure_methods.py::default_build_func for example.
-    check_correctness: bool, optional
-        Whether check correctness after measurement. This will use llvm cpu target to generate
-        reference output.
-    replay_db : Database, optional
-        The database that we retrieve saved MeasureResult from.
+        The number of tasks submitted in parallel
+        By default it will use all cpu cores
+    """
+    def __init__(self, timeout=5, n_parallel=None):
+        self.timeout = timeout
+        self.n_parallel = n_parallel or multiprocessing.cpu_count()
+        self.task = None
+
+    def set_task(self, task):
+        """
+        Initialize for a new tuning task
+
+        Parameters
+        ----------
+        task: Task
+            The tuning task
+        """
+        self.task = task
+
+    def get_build_kwargs(self):
+        """
+        Get device specific build arguments (e.g. maximum shared memory size)
+
+        Returns
+        ----------
+        kwargs: dict
+            The additional keyword arguments
+        """
+        raise NotImplementedError()
+
+    def run(self, measure_inputs, build_results):
+        """Run and measure built programs
+
+        Parameters
+        ----------
+        measure_inputs: List of MeasureInput
+            The raw measure input
+        build_results: List of BuildResults
+            The build results
+
+        Returns
+        -------
+        measure_results: List of MeasureResult
+            The final results of measurement
+        """
+        raise NotImplementedError()
+
+
+def measure_option(builder, runner):
+    """
+    Set options for measure. To measure a config, we will build it and run it.
+    So we have to set options for these two steps.
+    They have their own options on timeout, parallel, etc.
+
+    Parameters
+    ----------
+    builder: Builder
+        Specify how to build programs
+    runner: Runner
+        Specify how to run programs
+    """
+    from .measure_methods import LocalBuilder, LocalRunner
+
+    if isinstance(builder, str):
+        if builder == 'local':
+            builder = LocalBuilder()
+        else:
+            raise ValueError("Invalid builder: " + builder)
+
+    if isinstance(runner, str):
+        if runner == 'local':
+            runner = LocalRunner()
+        else:
+            raise ValueError("Invalid runner: " + runner)
+
+    opt = {
+        'builder': builder,
+        'runner': runner,
+    }
+
+    return opt
+
+
+def create_measure_batch(task, option):
+    """Get a standard measure_batch function.
+
+    Parameters
+    ----------
+    task: tvm.autotvm.task.Task
+        The tuning task
+    option: dict
+        The option for measuring generated code.
+        You should use the return value of function :any:`measure_option` for this argument.
 
     Returns
     -------
-    options: dict
-        A dict to store all options
-
-    Note
-    ----
-    To support customized measure, you can pass callable `measure_func` or
-    `build_func` in. The `measure_func` will call `build_func` to build binary library
-    and handle the logic of measurement.
-
-    Signature:
-    * measure_func (see the return value of measure/measure_methods.py::rpc for example)
-    def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        return measure_results
-
-    * build_func (see measure/measure_methods.py::default_build_func for example)
-    def build_func(inp, tmp_dir, **kwargs):
-        return func, args, filename
+    measure_batch: callable
+        a callback function to measure a batch of configs
     """
-    return {
-        'measure_func': measure_func,
-        'number': number,
-        'repeat': repeat,
-        'timeout': timeout,
-        'n_parallel': n_parallel,
-        'do_fork': do_fork,
-        'build_func': build_func,
-        'check_correctness': check_correctness,
-        'replay_db': replay_db,
-    }
+    builder = option['builder']
+    runner = option['runner']
+
+    attach_objects = runner.set_task(task)
+
+    # feed device related information from runner to builder
+    # (e.g. max shared memory for validity checking)
+    build_kwargs = runner.get_build_kwargs()
+    builder.set_task(task, build_kwargs)
+
+    def measure_batch(measure_inputs):
+        build_results = builder.build(measure_inputs)
+        results = runner.run(measure_inputs, build_results)
+        return results
+
+    measure_batch.n_parallel = builder.n_parallel
+    measure_batch.attach_objects = attach_objects
+    return measure_batch
diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py
index 2d740b9493b2..6a3cd028393c 100644
--- a/python/tvm/autotvm/measure/measure_methods.py
+++ b/python/tvm/autotvm/measure/measure_methods.py
@@ -1,129 +1,339 @@
-# pylint: disable=consider-using-enumerate,invalid-name,too-many-function-args
+# pylint: disable=invalid-name,too-many-function-args,too-many-nested-blocks
 """
 Functions that run on executor for measurement.
-These functions are responsible for building tvm module, uploading it to
-remote devices, recording the running time costs and checking the correctness of output
+
+These functions are responsible for building the tvm module, uploading it to
+remote devices, recording the running time costs, and checking the correctness of the output.
 """
 
 import logging
+import shutil
 import os
+import threading
 import time
 from random import getrandbits
-import threading
+from collections import namedtuple
+import tempfile
 
 import numpy as np
 
-from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \
-    target as _target, rpc as _rpc
-from ...contrib import nvcc, util, ndk
+from ... import ir_pass, build, build_config, nd, TVMError, register_func, \
+    rpc as _rpc, target as _target
+from ...contrib import nvcc, ndk
 
 from ..util import get_const_tuple
 from ..env import AutotvmGlobalScope
 from ..task.space import InstantiationError
 
-from .measure import MeasureResult, MeasureErrorNo
+from .measure import MeasureResult, MeasureErrorNo, Builder, Runner
 from .local_executor import LocalExecutor
 
 logger = logging.getLogger('autotvm')
 
-class HashMismatchError(ValueError):
-    """Raised when the code hash of a submitted config doesn't match that on the
-       measure side """
-    pass
+class BuildResult(namedtuple("BuildResult", ('filename', 'arg_info', 'error', 'time_cost'))):
+    """
+    Stores all the necessary inputs for a measurement.
 
+    Parameters
+    ----------
+    filename : str
+        The filename of generated library
+    arg_info : Tuple
+        The shape and dtype information of tvm tensor arguments
+    error : Exception
+        The error that happened during compilation, or None if the build succeeded.
+    time_cost : float
+        The time cost of building
+    """
 
-def request_remote(device_key, tracker_addr=None, priority=1, timeout=60):
-    """request a remote session
+class LocalBuilder(Builder):
+    """Run compilation on local machine
 
     Parameters
     ----------
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this session (units: seconds)
-
-    Returns
-    ------
-    session: RPCSession
+    timeout: float
+        The timeout of a compilation
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    build_func: callable or str
+        If is 'default', use default build function
+        If is 'ndk', use function for android ndk
+        If is callable, use it as custom build function
     """
-    # connect to the tracker
-    if tracker_addr:
-        host = tracker_addr[0] or os.environ['TVM_TRACKER_HOST']
-        port = tracker_addr[1] or int(os.environ['TVM_TRACKER_PORT'])
-    else:
-        host = os.environ['TVM_TRACKER_HOST']
-        port = int(os.environ['TVM_TRACKER_PORT'])
+    def __init__(self, timeout=10, n_parallel=None, build_func='default'):
+        super(LocalBuilder, self).__init__(timeout, n_parallel)
+
+        if isinstance(build_func, str):
+            if build_func == 'default':
+                build_func = default_build_func
+            elif build_func == 'ndk':
+                build_func = android_ndk_build_func
+            else:
+                raise ValueError("Invalid build_func" + build_func)
 
-    tracker = _rpc.connect_tracker(host, port)
-    remote = tracker.request(device_key, priority=priority,
-                             session_timeout=timeout)
-    return remote
+        self.build_func = build_func
+        self.tmp_dir = tempfile.mkdtemp()
+        self.executor = LocalExecutor(timeout=timeout)
 
-def check_remote(target, device_key, tracker_addr=None, priority=2, timeout=10):
-    """
-    Check the availability of a remote device
+    def build(self, measure_inputs):
+        results = []
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for inp in measure_inputs[i:i + self.n_parallel]:
+                ret = self.executor.submit(self.build_func,
+                                           inp,
+                                           self.tmp_dir,
+                                           **self.build_kwargs)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+
+                if isinstance(res, Exception):
+                    # timeout or fleet error, return MeasureResult directly
+                    results.append(MeasureResult((res,), MeasureErrorNo.BUILD_TIMEOUT,
+                                                 self.timeout, time.time()))
+                elif res.error is not None:
+                    # build error: classify as instantiation error or host compile error
+                    if isinstance(res.error, InstantiationError):
+                        results.append(MeasureResult((res.error,),
+                                                     MeasureErrorNo.INSTANTIATION_ERROR,
+                                                     res.time_cost, time.time()))
+                    else:
+                        if "InstantiationError" in str(res.error):
+                            msg = str(res.error)
+                            try:
+                                msg = msg.split('\n')[-2].split(": ")[1]
+                            except Exception:  # pylint: disable=broad-except
+                                pass
+                            results.append(MeasureResult((InstantiationError(msg),),
+                                                         MeasureErrorNo.INSTANTIATION_ERROR,
+                                                         res.time_cost, time.time()))
+                        else:  # tvm error
+                            results.append(MeasureResult((res.error,),
+                                                         MeasureErrorNo.COMPILE_HOST,
+                                                         res.time_cost, time.time()))
+                else:
+                    # return BuildResult
+                    results.append(res)
+
+        return results
+
+    def __del__(self):
+        shutil.rmtree(self.tmp_dir)
+
+
+class RPCRunner(Runner):
+    """Run generated code on remote devices.
+    This runner will ask an RPC Tracker for a device to run the measurement on.
 
     Parameters
     ----------
-    target: Target
-        The wanted compilation target
-    device_key: string
-        device key of registered device in tracker
-    tracker_addr: Tuple(string, int), optional
-        The address of rpc tracker in (host, port) format.
-        If is none, will use environment variable "TVM_TRACKER_HOST"
-        and "TVM_TRACKER_PORT"
-    priority: int, optional
-        The priority of this request, larger is more prior
-    timeout: float, optional
-        The timeout of this check (units: seconds).
-        If time is out, a RuntimeError will be raised.
+    timeout: float
+        The timeout of a measurement run
+    n_parallel: int
+        The number of tasks run in parallel. "None" will use all cpu cores
+    key: str
+        The key of the device registered in the tracker
+    host: str
+        The host address of RPC Tracker
+    port: int
+        The port of RPC Tracker
+    number : int, optional
+        Number of times to do measurement for taking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs.
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether to check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
     """
-    def _check():
-        remote = request_remote(device_key, tracker_addr, priority)
-        remote.context(str(target))
-    t = threading.Thread(target=_check,)
-    t.start()
-    t.join(timeout)
-    return not t.is_alive()
+    def __init__(self,
+                 key, host, port, priority=1,
+                 timeout=10, n_parallel=None,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(RPCRunner, self).__init__(timeout, n_parallel)
+
+        self.key = key
+        self.host = host
+        self.port = port
+        self.priority = priority
+        self.timeout = timeout
+
+        self.number = number
+        self.repeat = repeat
+        self.min_repeat_ms = min_repeat_ms
+        self.cur_number = number
+
+        self.ref_input = None
+        self.ref_output = None
+        self.check_correctness = check_correctness
+        self.cooldown_interval = cooldown_interval
+
+        self.executor = LocalExecutor()
+
+    def set_task(self, task):
+        self.task = task
+        self.cur_number = self.number
+
+        if check_remote(task.target, self.key, self.host, self.port):
+            logger.info("Get devices for measurement successfully!")
+        else:
+            raise RuntimeError("Cannot get remote devices from the tracker. "
+                               "Please check the status of tracker by "
+                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
+                               "and make sure you have free devices on the queue status.")
 
-def create_measure_batch(task, option):
-    """Get a standard measure_batch function.
+        if self.check_correctness:
+            # use llvm cpu to generate a reference input/output
+            # this option works for tuning topi, but might not work for your custom op
+            with _target.create("llvm"):
+                s, arg_bufs = task.instantiate(task.config_space.get(0))
+            self.ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
+                              for x in arg_bufs]
+            func = build(s, arg_bufs, "llvm")
+            tvm_buf = [nd.array(x) for x in self.ref_input]
+            func(*tvm_buf)
+            self.ref_output = [x.asnumpy() for x in tvm_buf]
+
+    def get_build_kwargs(self):
+        kwargs = {}
+        if 'cuda' in self.task.target.keys or 'opencl' in self.task.target.keys:
+            remote = request_remote(self.key, self.host, self.port)
+            ctx = remote.context(str(self.task.target), 0)
+            max_dims = ctx.max_thread_dimensions
+            kwargs['check_gpu'] = {
+                'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
+                'max_threads_per_block': ctx.max_threads_per_block,
+                'max_thread_x': max_dims[0],
+                'max_thread_y': max_dims[1],
+                'max_thread_z': max_dims[2],
+            }
+
+            if 'cuda' in self.task.target.keys:
+                kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+
+        return kwargs
+
+    def run(self, measure_inputs, build_results):
+        results = []
+        remote_args = (self.key, self.host, self.port, self.priority, self.timeout)
+
+        for i in range(0, len(measure_inputs), self.n_parallel):
+            futures = []
+            for measure_inp, build_res in zip(measure_inputs[i:i+self.n_parallel],
+                                              build_results[i:i+self.n_parallel]):
+                ret = self.executor.submit(run_through_rpc,
+                                           measure_inp,
+                                           build_res,
+                                           self.cur_number,
+                                           self.repeat,
+                                           self.cooldown_interval,
+                                           remote_args,
+                                           self.ref_input,
+                                           self.ref_output)
+                futures.append(ret)
+
+            for future in futures:
+                res = future.get()
+                if isinstance(res, Exception):   # executor error or timeout
+                    results.append(MeasureResult((str(res),), MeasureErrorNo.RUN_TIMEOUT,
+                                                 self.timeout, time.time()))
+                else:
+                    results.append(res)
+
+        # If some runs were too fast, do remeasure for them
+        # to meet the requirement of `min_repeat_ms`
+        remeasure = np.zeros((len(measure_inputs),), dtype=np.bool)
+        pre_number = next_number = self.cur_number
+        min_repeat_duration = self.min_repeat_ms / 1000.0
+        for i, res in enumerate(results):
+            if res.error_no == MeasureErrorNo.NO_ERROR:
+                if np.mean(res.costs) * pre_number <= min_repeat_duration:
+                    next_number = max(next_number,
+                                      int(np.ceil(min_repeat_duration / np.mean(res.costs))))
+                    remeasure[i] = True
+
+        if pre_number != next_number:
+            self.cur_number = next_number
+            msg = "increasing number to %d" % self.cur_number
+            logger.info(msg)
+
+            re_measure_inputs = [x for i, x in enumerate(measure_inputs) if remeasure[i]]
+            re_build_results = [x for i, x in enumerate(build_results) if remeasure[i]]
+            re_res = self.run(re_measure_inputs, re_build_results)
+            ct = 0
+            for i, rerun in enumerate(remeasure):
+                if rerun:
+                    results[i] = re_res[ct]
+                    ct += 1
+
+        return results
+
+class LocalRunner(RPCRunner):
+    """Run generated code on local devices.
 
     Parameters
     ----------
-    task: tvm.autotvm.task.Task
-        The tuning task
-    option: dict
-        The option for measuring generated code.
-        You should use the return value of function :any:`measure_option` for this argument.
-
-    Returns
-    -------
-    measure_batch: callable
-        a callback function to measure a batch of configs
+    timeout: float
+        The timeout of a measurement run
+    number : int, optional
+        Number of times to do measurement for taking average
+    repeat : int, optional
+        Number of times to repeat the measurement.
+        In total, the generated code will be run (1 + number x repeat) times,
+        where the first one is warm up. The returned result contains `repeat` costs,
+        each of which is the average of `number` test run.
+    min_repeat_ms : float, optional
+        Minimum duration of a timer measurement in milliseconds.
+        When the run time of a measurement trial falls below this time, the
+        `number` parameter will be automatically increased.
+        Set this to improve the accuracy of perf measurement, e.g., when timers
+        are not precise enough to capture short-running tasks. This parameter is
+        also critical when devices need a certain minimum running time to "warm
+        up," such as GPUs that need time to reach a performance power state.
+    cooldown_interval: float, optional
+        The cool down interval between two measurements.
+    check_correctness: bool, optional
+        Whether to check correctness after measurement. This will use llvm cpu target to
+        call your template and get the reference output.
+        This can work for TOPI templates, but may not work for your custom template.
+
+    Note
+    ----
+    This is a "fake" local mode. We start a silent rpc tracker and rpc server
+    for the user. In this way we reuse timeout/isolation mechanism in RPC infrastructure.
     """
-    from ..database import filter_inputs
-
-    measure_func = option['measure_func']
-    number, repeat = option['number'], option['repeat']
-    timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork']
-    build_func = option['build_func']
-    check_correctness = option['check_correctness']
-    replay_db = option['replay_db']
+    def __init__(self,
+                 timeout=10,
+                 number=4, repeat=3, min_repeat_ms=0, cooldown_interval=0.1,
+                 check_correctness=False):
+        super(LocalRunner, self).__init__('', None, None, 0,
+                                          timeout=timeout, n_parallel=1,
+                                          number=number, repeat=repeat,
+                                          min_repeat_ms=min_repeat_ms,
+                                          cooldown_interval=cooldown_interval,
+                                          check_correctness=check_correctness)
+        self.tracker = None
+        self.server = None
+
+    def set_task(self, task):
+        self.task = task
 
-    executor = LocalExecutor(timeout=timeout, do_fork=do_fork)
-
-    # convert convenient string to function object
-    attach_objects = None
-    if measure_func == 'local':
-        # start temporary rpc tracker and rpc server for the user
         from ...rpc.tracker import Tracker
         from ...rpc.server import Server
 
@@ -133,360 +343,215 @@ def create_measure_batch(task, option):
                         key=device_key,
                         use_popen=True, silent=True,
                         tracker_addr=(tracker.host, tracker.port))
+        self.key = device_key
+        self.host = tracker.host
+        self.port = tracker.port
 
-        measure_func = rpc(device_key, tracker.host, tracker.port)
-        attach_objects = (server, tracker)
+        super(LocalRunner, self).set_task(task)
+        return server, tracker
 
-    build_kwargs = {}
-    if build_func == 'default':
-        build_func = default_build_func
-    if build_func == 'ndk':
-        build_func = default_build_func
-        build_kwargs['use_ndk'] = True
 
-    # check the availability of remote devices
-    if hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        if check_remote(task.target, rpc_info['key'], (rpc_info['host'], rpc_info['port'])):
-            logger.info("Get devices for measurement successfully!")
-        else:
-            raise RuntimeError("Cannot get remote devices from the tracker. "
-                               "Please check the status of tracker by "
-                               "'python -m tvm.exec.query_rpc_tracker --port [THE PORT YOU USE]' "
-                               "and make sure you have free devices on the queue status.")
+def _build_func_common(measure_input, check_gpu=None, cuda_arch=None, build_option=None):
+    """Common part for building a configuration"""
+    target, task, config = measure_input
 
-    # add device info of cuda and opencl target
-    if ('cuda' in task.target.keys or 'opencl' in task.target.keys) \
-            and hasattr(measure_func, 'rpc_info'):
-        rpc_info = measure_func.rpc_info
-        add_gpu_target_info(task.target, rpc_info["key"], (rpc_info["host"], rpc_info["port"]),
-                            build_kwargs)
-
-    if check_correctness:
-        # use llvm cpu to generate a reference input/output
-        # this option works for tuning topi, but might not work for you custom op
-        with _target.create("llvm"):
-            s, arg_bufs = task.instantiate(task.config_space.get(0))
-        ref_input = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype)
-                     for x in arg_bufs]
-        func = build(s, arg_bufs, "llvm")
-        tvm_buf = [nd.array(x) for x in ref_input]
-        func(*tvm_buf)
-        ref_output = [x.asnumpy() for x in tvm_buf]
-    else:
-        ref_input = ref_output = None
-
-    def measure_batch(measure_inputs):
-        """measure the time cost for a batch of configs in real machines"""
-        if replay_db is not None:
-            partial_results, measure_inputs = \
-                filter_inputs(replay_db, measure_inputs, retry=False)
-
-        # launch measure jobs in parallel
-        pack_size = getattr(measure_func, "pack_size", 1)  # measure `pack_size` inputs in one job
-        futures = []
-        for i in range(0, len(measure_inputs), pack_size):
-            input_pack = measure_inputs[i:i + pack_size]
-            ret = executor.submit(
-                measure_func,
-                input_pack,
-                build_func,
-                build_kwargs,
-                number,
-                repeat,
-                ref_input,
-                ref_output)
-            futures.append(ret)
-
-        # transform results
-        results = []
-        for future in futures:
-            result = future.get()
-            if isinstance(result, Exception):
-                tstamp = time.time()
-                results.extend([MeasureResult((result,), MeasureErrorNo.FLEET_ERROR,
-                                              timeout, tstamp)] * pack_size)
-            else:
-                results.extend(result)
-
-        if replay_db is not None:
-            result_idx = 0
-            for i in range(len(partial_results)):
-                if partial_results[i] is None:
-                    partial_results[i] = results[result_idx]
-                    result_idx += 1
-            return partial_results
-        return results
+    with target:
+        s, args = task.instantiate(config)
+
+        # check invalidity of template
+        if not config.valid():
+            raise InstantiationError(config.errors)
+
+        opts = build_option or {}
+        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
+            opts["add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]
+        if cuda_arch:
+            set_cuda_target_arch(cuda_arch)
 
-    measure_batch.n_parallel = n_parallel
-    # attach server and tracker object to avoid them of being garbage-collected
-    measure_batch.attach_objects = attach_objects
-    return measure_batch
+        with build_config(**opts):
+            func = build(s, args, target_host=task.target_host)
+    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
 
 
-def rpc(key,
-        host=None,
-        port=None,
-        priority=1,
-        session_timeout=60,
-        pack_size=1):
+def default_build_func(measure_input, tmp_dir, **kwargs):
     """
-    Create a standard measure_func which uses RPC Tracker for measurement.
-    This measure_func will request a device from the RPC Tracker and
-    upload the built binary library to that device for measurement.
+    Default build func. This can work for cuda, opencl, llvm backend
 
     Parameters
     ----------
-    key: str
-        The registered key of the device in tracker. The tuner will request devices for
-        measurement by this key.
-    host: str, optional
-        The hostname of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_HOST"
-    port: int, optional
-        The port of RPC Tracker. If not set, will use environment variable "TVM_TRACKER_PORT"
-    priority: int, optional
-        Priority of this task, used by scheduler in tracker
-    session_timeout: int, optional
-        Timeout of rpc session
-    pack_size: int, optional
-        The number of configs measure in one RPC session.
-        Usually this can be set to 1. If your device has high overhead to establish a
-        rpc connection, set this higher.
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
+    """
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.tar" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def android_ndk_build_func(measure_input, tmp_dir, **kwargs):
+    """
+    Build function for android device using ndk.
+
+    Parameters
+    ----------
+    measure_input: MeasureInput
+        The input of measurement
+    tmp_dir: str
+        The path of temporary directory to export generated library
     """
-    def fmeasure(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output):
-        """Do measurement for a list of inputs inside a same RPC session.
-
-        Parameters
-        ----------
-        input_pack: List of MeasureInput
-            The inputs of measurement
-        build_func: callable
-            Function for building the code. see :any:`default_build_func` for example
-        build_kwargs: dict
-            Extra arguments for build_func
-        number : int, optional
-            Number of times to do the measurement for average
-        repeat : int, optional
-            Number of times to repeat the measurement.
-            In total, the generated code will be run (1 + number x repeat) times,
-            where the first one is warm up. The returned result contains `repeat` costs,
-            each of which is the average of `number` test run.
-        ref_input: List of numpy array
-            Reference input for correctness check
-        ref_output: List of numpy array
-            Reference output for correctness check
-
-        Returns
-        -------
-        results: List of MeasureResult
-            The results for input_pack
-        """
-        remote_args = (key, (host, port), priority, session_timeout)
-
-        res = _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                              ref_input, ref_output,
-                              remote_args)
-        return res
-
-    fmeasure.pack_size = pack_size
-    fmeasure.rpc_info = {"key": key, "host": host, "port": port}
-    return fmeasure
-
-
-def _measure_common(input_pack, build_func, build_kwargs, number, repeat,
-                    ref_input=None, ref_output=None, remote_args=None):
-    """Measure the time cost for a pack of inputs.
-
-    (Note: A pack is a list of inputs which will be measured inside a same RPC session)
+    tic = time.time()
+    try:
+        filename = os.path.join(tmp_dir, "tmp_func_%0x.so" % getrandbits(64))
+        func, arg_info = _build_func_common(measure_input, **kwargs)
+        func.export_library(filename, ndk.create_shared)
+    except Exception as e:  # pylint: disable=broad-except
+        return BuildResult(None, None, e, time.time() - tic)
+    return BuildResult(filename, arg_info, None, time.time() - tic)
+
+
+def run_through_rpc(measure_input, build_result,
+                    number, repeat, cooldown_interval,
+                    remote_args, ref_input=None, ref_output=None):
+    """Run a generated library through rpc
 
     Parameters
     ----------
-    input_pack : list of MeasureInput
-        The inputs we need to evaluate
-    build_func : function takes MeasureInput returns tuple of (time_func, ctx, args)
-        The build function used to build each input.
-    build_kwargs: Dict
-        The extra keyword arguments to build_func
+    measure_input: MeasureInput
+        The raw measure input
+    build_result: BuildResult
+        The result returned from Builder. This contains the path to the generated library.
     number : int, optional
-        Number of times to do the measurement for average
+        Number of times to do measurement for taking average
     repeat : int, optional
         Number of times to repeat the measurement.
         In total, the generated code will be run (1 + number x repeat) times,
         where the first one is warm up. The returned result contains `repeat` costs,
         each of which is the average of `number` test run.
-    ref_input: Array of np.ndarray, optional
-        Reference input for checking correctness
-    ref_output: Array of np.ndarray, optional
-        Reference output for checking correctness
-    remote_args: Tuple, optional
-        The arguments to request_remote. If is not None, will use remote rpc devices.
-
-    Returns
-    -------
-    res_pack : Array of MeasureResult
-        The list of results of measurement.
+    cooldown_interval: float
+        The cool down interval between two measurements
+    remote_args: Tuple
+        The argument for request_remote
+    ref_input: List of np.ndarray
+        The reference input used for checking correctness
+    ref_output: List of np.ndarray
+        The reference output used for checking correctness
     """
-    res_pack = []
-    tmp_dir = util.tempdir() if remote_args else None
-    assert len(input_pack) == 1, "Only supports input_pack == 1 for now"
-
-    for inp in input_pack:
-        tic = time.time()
-
-        # build function
-        try:
-            func, arg_bufs, filename = build_func(inp, tmp_dir, **build_kwargs)
-        except TVMError as exc:
-            tstamp = time.time()
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            if "InstantiationError" in msg:
-                try:
-                    msg = msg.split('\n')[-2].split(": ")[1]
-                except Exception:  # pylint: disable=broad-except
-                    pass
-                res_pack.append(MeasureResult((InstantiationError(msg),),
-                                              MeasureErrorNo.INSTANTIATION_ERROR,
-                                              tstamp - tic, tstamp))
-            else:
-                res_pack.append(MeasureResult((RuntimeError(msg),),
-                                              MeasureErrorNo.COMPILE_HOST,
-                                              tstamp - tic, tstamp))
-            continue
-        except InstantiationError as e:
-            tstamp = time.time()
-            res_pack.append(MeasureResult((InstantiationError(str(e)),),
-                                          MeasureErrorNo.INSTANTIATION_ERROR,
-                                          tstamp - tic, tstamp))
-            continue
-
-        # measure time
-        errno = MeasureErrorNo.NO_ERROR
-        try:
-            # upload built module
-            if remote_args:
-                remote = request_remote(*remote_args)
-                remote.upload(tmp_dir.relpath(filename))
-                func = remote.load_module(filename)
-                ctx = remote.context(str(inp.target), 0)
-                time_f = func.time_evaluator(
-                    func.entry_name, ctx, number=number, repeat=repeat)
-            else:
-                ctx = context(str(inp.target), 0)
-                time_f = func.time_evaluator(
-                    func.entry_name, ctx, number=number, repeat=repeat)
-
-            # set input
-            if ref_input:
-                args = [nd.array(x, ctx=ctx) for x in ref_input]
-            else:
-                args = [nd.empty(get_const_tuple(x.shape), dtype=x.dtype, ctx=ctx)
-                        for x in arg_bufs]
-
-            costs = time_f(*args).results
-            if len(costs) > 2:  # remove largest and smallest value to reduce variance
-                costs = list(costs)
-                costs.sort()
-                costs = tuple(costs[1:-1])
-
-            # check correctness of output
-            if ref_output:
-                for expected, real in zip(ref_output, args):
-                    if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
-                        logger.warning("Wrong Answer!")
-                        errno = MeasureErrorNo.WRONG_ANSWER
-        except TVMError as exc:
-            msg = str(exc)
-            if "Stack trace returned" in msg:
-                msg = msg[:msg.index("Stack trace returned")]
-            if "CUDA Source" in msg:
-                msg = msg[:msg.index("CUDA Source")]
-            costs = (RuntimeError(msg),)
-            errno = MeasureErrorNo.RUNTIME_DEVICE
-        tstamp = time.time()
-        res_pack.append(MeasureResult(costs, errno, tstamp - tic, tstamp))
-    return res_pack
-
-
-def default_build_func(inp, tmp_dir=None, **kwargs):
-    """Build function module. Exception will be raised when any error occurs
+    if isinstance(build_result, MeasureResult):
+        return build_result
+
+    tic = time.time()
+    errno = MeasureErrorNo.NO_ERROR
+    try:
+        # upload built module
+        remote = request_remote(*remote_args)
+        remote.upload(build_result.filename)
+        func = remote.load_module(os.path.split(build_result.filename)[1])
+        ctx = remote.context(str(measure_input.target), 0)
+        time_f = func.time_evaluator(
+            func.entry_name, ctx, number=number, repeat=repeat)
+
+        # set input
+        if ref_input:
+            args = [nd.array(x, ctx=ctx) for x in ref_input]
+        else:
+            args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info]
+
+        costs = time_f(*args).results
+        if len(costs) > 2:  # remove largest and smallest value to reduce variance
+            costs = list(costs)
+            costs.sort()
+            costs = tuple(costs[1:-1])
+
+        # check correctness of output
+        if ref_output:
+            for expected, real in zip(ref_output, args):
+                if not np.allclose(expected, real.asnumpy(), rtol=1e-4):
+                    logger.warning("Wrong Answer!")
+                    errno = MeasureErrorNo.WRONG_ANSWER
+    except TVMError as exc:
+        msg = str(exc)
+        if "Stack trace returned" in msg:
+            msg = msg[:msg.index("Stack trace returned")]
+        if "CUDA Source" in msg:
+            msg = msg[:msg.index("CUDA Source")]
+        costs = (RuntimeError(msg[:1024]),)
+        errno = MeasureErrorNo.RUNTIME_DEVICE
+    tstamp = time.time()
+    time.sleep(cooldown_interval)
+    return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp)
+
+
+def request_remote(device_key, host=None, port=None, priority=1, timeout=60):
+    """Request a remote session
 
     Parameters
     ----------
-    inp: MeasureInput
-       The input of this measurement
-    tmp_dir: tvm.contrib.util.TempDirectory, optional
-       The temporary directory for exporting built binary library.
-       If is not None (in RPC mode), the library in this directory will be uploaded to
-       remote devices.
-    kwargs: Dict, optional
-        Other extra arguments
+    device_key: string
+        The device key of registered device in tracker
+    host: str, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger values mean higher priority
+    timeout: float, optional
+        The timeout of this session (units: second)
 
     Returns
-    -------
-    func: Function
-        TVM built function. Typically this is the return value of tvm.build.
-    args: Array of Buffer or Tensor
-        The argument list for the function. Typically this is the second argument of tvm.build.
-    filename: str
-        The filename of the output build library
+    -------
+    session: RPCSession
     """
-    # build function
-    with inp.target:
-        s, args = inp.task.instantiate(inp.config)
+    # connect to the tracker
+    host = host or os.environ['TVM_TRACKER_HOST']
+    port = port or int(os.environ['TVM_TRACKER_PORT'])
 
-        # check invalidity of template and code hash consistency
-        if not inp.config.valid():
-            raise InstantiationError(inp.config.errors)
-        code_hash = getattr(s, 'code_hash', None)
-        if inp.config.code_hash != code_hash:
-            raise HashMismatchError('got {0:s}, expected {1:s}'
-                                    .format(str(inp.config.code_hash), str(code_hash)))
-
-        opts = {}
-        if "check_gpu" in kwargs:  # Add verify pass to filter out invalid configs in advance.
-            opts["add_lower_pass"] = [(2, gpu_verify_pass(**kwargs['check_gpu']))]
-        if 'cuda_arch' in kwargs:
-            set_cuda_target_arch(kwargs['cuda_arch'])
+    tracker = _rpc.connect_tracker(host, port)
+    remote = tracker.request(device_key, priority=priority,
+                             session_timeout=timeout)
+    return remote
 
-        with build_config(**opts):
-            func = build(s, args, target_host=inp.task.target_host)
 
-    # export library to temp directory
-    if tmp_dir:
-        if kwargs.get('use_ndk', False):  # for Android NDK
-            filename = "tmp_func_%0x.so" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename), ndk.create_shared)
-        else:
-            filename = "tmp_func_%0x.tar" % getrandbits(64)
-            func.export_library(tmp_dir.relpath(filename))
-    else:
-        filename = None
-
-    return func, args, filename
-
-
-def add_gpu_target_info(target, device_key, rpc_tracker_addr, kwargs):
-    """Add device info for gpu target.
-    The info will be used to check the validity of generated code."""
-    remote = request_remote(device_key, rpc_tracker_addr)
-    ctx = remote.context(str(target), 0)
-    max_dims = ctx.max_thread_dimensions
-    kwargs['check_gpu'] = {
-        'max_shared_memory_per_block': ctx.max_shared_memory_per_block,
-        'max_threads_per_block': ctx.max_threads_per_block,
-        'max_thread_x': max_dims[0],
-        'max_thread_y': max_dims[1],
-        'max_thread_z': max_dims[2],
-    }
-
-    if 'cuda' in target.keys:
-        kwargs["cuda_arch"] = "sm_" + "".join(ctx.compute_version.split('.'))
+def check_remote(target, device_key, host=None, port=None, priority=2, timeout=10):
+    """
+    Check the availability of a remote device
 
-def set_cuda_target_arch(arch):
-    """set target architecture of nvcc compiler"""
-    AutotvmGlobalScope.current.cuda_target_arch = arch
+    Parameters
+    ----------
+    target: Target
+        The wanted compilation target
+    device_key: string
+        device key of registered device in tracker
+    host: str, optional
+        The host address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_HOST"
+    port: int, optional
+        The port address of rpc tracker.
+        If is none, will use environment variable "TVM_TRACKER_PORT"
+    priority: int, optional
+        The priority of this request, larger values mean higher priority
+    timeout: float, optional
+        The timeout of this check (units: seconds).
+
+    Returns
+    -------
+    available: bool
+        True if an available device can be found
+    """
+    def _check():
+        remote = request_remote(device_key, host, port, priority)
+        remote.context(str(target))
+    t = threading.Thread(target=_check,)
+    t.start()
+    t.join(timeout)
+    return not t.is_alive()
 
 
 @register_func
@@ -496,6 +561,17 @@ def tvm_callback_cuda_compile(code):
     return ptx
 
 
+def set_cuda_target_arch(arch):
+    """set target architecture of nvcc compiler
+
+    Parameters
+    ----------
+    arch: str
+        The argument of nvcc -arch. (e.g. "sm_52", "sm_62")
+    """
+    AutotvmGlobalScope.current.cuda_target_arch = arch
+
+
 def gpu_verify_pass(**kwargs):
     """Verify the validity of a gpu kernel.
     This pass will check memory usage and number of threads per block.
diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py
index b9d900e49577..1afaca73ebb6 100644
--- a/python/tvm/autotvm/tuner/ga_tuner.py
+++ b/python/tvm/autotvm/tuner/ga_tuner.py
@@ -22,7 +22,7 @@ class GATuner(Tuner):
     mutation_prob: float
         probability of mutation of a knob in a gene
     """
-    def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1):
+    def __init__(self, task, pop_size=100, elite_num=3, mutation_prob=0.1):
         super(GATuner, self).__init__(task)
 
         # algorithm configurations
diff --git a/python/tvm/autotvm/tuner/sa_model_optimizer.py b/python/tvm/autotvm/tuner/sa_model_optimizer.py
index 1947c6dde4e0..77c7e919593b 100644
--- a/python/tvm/autotvm/tuner/sa_model_optimizer.py
+++ b/python/tvm/autotvm/tuner/sa_model_optimizer.py
@@ -87,7 +87,7 @@ def find_maximums(self, model, num, exclusive):
 
             new_scores = model.predict(new_points)
 
-            ac_prob = np.exp((new_scores - scores) / (t + 1e-2))
+            ac_prob = np.exp(np.minimum((new_scores - scores) / (t + 1e-5), 1))
             ac_index = np.random.random(len(ac_prob)) < ac_prob
 
             points[ac_index] = new_points[ac_index]
diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py
index 87da86a4654f..8e1b458a6d2f 100644
--- a/tests/python/integration/test_tuning.py
+++ b/tests/python/integration/test_tuning.py
@@ -103,34 +103,7 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None):
                                target=target, target_host=target_host)
     return task, target
 
-
-def test_task_tuner_without_measurement():
-    """test task and tuner without measurement"""
-    task, target = get_sample_task()
-
-    def custom_measure(input_pack, build_func, build_args, number, repeat,
-                       ref_input, ref_output):
-        from tvm.autotvm import MeasureResult
-
-        results = []
-        for inp in input_pack:
-            tic = time.time()
-            # do nothing
-            time.sleep(0.001)
-            results.append(MeasureResult([time.time() - tic], 0,
-                                         time.time() - tic, time.time()))
-        return results
-    measure_option = autotvm.measure_option(custom_measure)
-
-    logging.info("%s", task.config_space)
-
-    # new tuner and recorder
-    for tuner_class in [autotvm.tuner.RandomTuner, autotvm.tuner.GridSearchTuner]:
-        tuner = tuner_class(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
-        assert tuner.best_flops > 1
-
-def test_tuning_with_measure():
+def test_tuning():
     def check(target, target_host):
         ctx = tvm.context(target, 0)
         if not ctx.exist:
@@ -141,12 +114,12 @@ def check(target, target_host):
         task, target = get_sample_task(target, target_host)
         logging.info("%s", task.config_space)
 
-        measure_option = autotvm.measure_option('local',
-                                                timeout=4,
-                                                number=2)
+        measure_option = autotvm.measure_option(
+            autotvm.LocalBuilder(),
+            autotvm.LocalRunner())
 
         tuner = RandomTuner(task)
-        tuner.tune(n_trial=10, measure_option=measure_option)
+        tuner.tune(n_trial=20, measure_option=measure_option)
 
     check("cuda", None)
     check("opencl", None)
@@ -155,6 +128,4 @@ def check(target, target_host):
     # only print log when invoked from main
     logging.basicConfig(level=logging.DEBUG)
 
-    test_task_tuner_without_measurement()
-    test_tuning_with_measure()
-
+    test_tuning()
diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py
index 3a6883f69489..ed39c3846c8c 100644
--- a/tests/python/unittest/test_autotvm_common.py
+++ b/tests/python/unittest/test_autotvm_common.py
@@ -32,6 +32,25 @@ def matmul(N, L, M, dtype):
 
     return s, [A, B, C]
 
+@autotvm.template
+def bad_matmul(N, L, M, dtype):
+    if 'bad_device' in tvm.target.current_target().keys:
+        A = tvm.placeholder((N, L), name='A', dtype=dtype)
+        B = tvm.placeholder((L, M), name='B', dtype=dtype)
+
+        k = tvm.reduce_axis((0, L-1), name='k')
+        C = tvm.compute((N, M), lambda i, j: tvm.sum(A[i, k] * B[k, j], axis=k), name='C')
+        s = tvm.create_schedule(C.op)
+
+        # schedule
+        y, x = s[C].op.axis
+        cfg = autotvm.get_config()
+        cfg.define_split("tile_y", y, num_outputs=2)
+        cfg.define_split("tile_x", x, num_outputs=2)
+        return s, [A, B, C]
+
+    return matmul(N, L, M, dtype)
+
 def get_sample_task(n=128):
     """return a sample task for testing"""
     target = tvm.target.create("llvm")
diff --git a/tests/python/unittest/test_autotvm_database.py b/tests/python/unittest/test_autotvm_database.py
index af4704d95e51..aa956f61bbcf 100644
--- a/tests/python/unittest/test_autotvm_database.py
+++ b/tests/python/unittest/test_autotvm_database.py
@@ -1,17 +1,11 @@
 """Test database"""
 import copy
 import logging
-import time
 
-import numpy as np
-import tvm
-
-from tvm import autotvm
 from tvm.autotvm import database
-from tvm.autotvm.measure.measure_methods import HashMismatchError
-from tvm.autotvm.record import encode, MeasureInput, MeasureResult
+from tvm.autotvm.record import encode, MeasureResult
 
-from test_autotvm_common import get_sample_task, get_sample_records
+from test_autotvm_common import get_sample_records
 
 def test_save_load():
     logging.info("test basic db load/save ...")
@@ -35,66 +29,6 @@ def test_save_load():
 
 TRIAL_LIMIT = 2
 
-def test_db_filter():
-    logging.info("test db filter ...")
-
-    # Pick a GPU target because there are more likely to be failures/invalid configs
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    batch_size = 2
-
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-
-    del measure_batch
-
-    db = database.DummyDatabase()
-    db.flush()
-
-    # First setting, memoize one input at a time, check that each is saved and replayed
-    measure_option = autotvm.measure_option('local', do_fork=False, timeout=2, replay_db=db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    for i in range(len(all_inputs)+1):
-        db.flush()
-        for j in range(i):
-            db.save(all_inputs[j], all_results[j])
-
-        for k in range(len(batches)):
-            batch = batches[k]
-            batch_result = measure_batch(batch)
-            for l in range(batch_size):
-                all_idx = k*batch_size + l
-                assert batch_result[l] is not None
-                if all_idx < i:
-                    assert encode(batch[l], batch_result[l]) == encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MATCH, GOT MISMATCH"
-                else:
-                    assert encode(batch[l], batch_result[l]) != encode(batch[l], all_results[all_idx]), \
-                        "(no retry) EXPECTED MISMATCH, GOT MATCH"
-
-    del measure_batch
-
 def test_db_hash():
     logging.info("test db hash check ...")
     inp1, res1 = get_sample_records(1)[0]
@@ -149,89 +83,8 @@ def test_db_latest_all():
     assert encode(inp1, load4[1]) == encode(inp1, res2)
     assert encode(inp1, load4[2]) == encode(inp1, res3)
 
-def test_db_save_replay():
-    logging.info("test db save (from measure_batch) and replay ...")
-    _db = database.DummyDatabase()
-    _db.flush()
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local',
-                                            do_fork=False,
-                                            timeout=2,
-                                            replay_db=_db)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    batch_size = 2
-
-    ct = 0
-    all_inputs = list()
-    all_results = list()
-    batches = list()
-    tuner = autotvm.tuner.RandomTuner(task)
-    while ct < TRIAL_LIMIT:
-        inputs = list()
-        for i in range(batch_size):
-            cfg = tuner.next_batch(1)[0]
-            inputs.append((MeasureInput(target, task, cfg)))
-            all_inputs.append(inputs[-1])
-        batches.append(inputs)
-        results = measure_batch(inputs)
-        all_results += results
-        ct += 1
-    callback = autotvm.callback.log_to_database(_db)
-    callback(None, all_inputs, all_results)
-
-    assert len(_db.db.keys()) == batch_size * TRIAL_LIMIT, \
-        "%d vs %d" % (len(_db.db.keys()), batch_size * TRIAL_LIMIT)
-
-    all_results_2 = measure_batch(all_inputs)
-    all_results_3 = measure_batch(all_inputs)
-
-    for i in range(len(all_results)):
-        encr1 = encode(all_inputs[i], all_results[i])
-        encr2 = encode(all_inputs[i], all_results_2[i])
-        encr3 = encode(all_inputs[i], all_results_3[i])
-        assert encr1 == encr2, "EXPECTED MATCH WITH SAVE REPLAY (first replay), got MISMATCH"
-        assert encr2 == encr3, "EXPECTED MATCH WITH SAVE REPLAY (second replay), got MISMATCH"
-
-    del measure_batch
-
-def test_check_hashmismatch():
-    logging.info("test hash mismatch check")
-
-    task, target = get_sample_task()
-
-    ctx = tvm.context(str(target))
-    if not ctx.exist:
-        logging.warning("Skip this test because there is no supported device for test")
-
-    measure_option = autotvm.measure_option('local', do_fork=False)
-    measure_batch = autotvm.measure.create_measure_batch(task, measure_option)
-
-    inputs = list()
-    cfg = task.config_space.get(np.random.randint(len(task.config_space)))
-    # notvalidh is not a valid CRC32 hash (not hex)
-    cfg.code_hash = 'notvalidh'
-    inputs.append((MeasureInput(target, task, cfg)))
-
-    try:
-        results = measure_batch(inputs)
-        assert False, "HashMismatchError should be raised"
-    except HashMismatchError:
-        pass
-
-    del measure_batch
-
 if __name__ == '__main__':
     logging.basicConfig(level=logging.INFO)
     test_save_load()
-    test_db_filter()
     test_db_hash()
     test_db_latest_all()
-    test_db_save_replay()
-    test_check_hashmismatch()
diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py
new file mode 100644
index 000000000000..e29cc2c51658
--- /dev/null
+++ b/tests/python/unittest/test_autotvm_measure.py
@@ -0,0 +1,97 @@
+"""Test builder and runner"""
+import logging
+import time
+
+import numpy as np
+
+import tvm
+from tvm import autotvm
+from test_autotvm_common import get_sample_task, bad_matmul
+from tvm.autotvm.measure.measure import Runner, MeasureResult, MeasureErrorNo
+
+def test_task_tuner_without_measurement():
+    """test task and tuner without measurement"""
+    task, target = get_sample_task()
+
+    class DummyRunner(Runner):
+        def __init__(self):
+            super(DummyRunner, self).__init__(1, 1)
+
+        def run(self, measure_inputs, build_results):
+            return [MeasureResult((np.random.random(),), 0, 0.2, time.time())
+                    for _ in range(len(measure_inputs))]
+
+        def get_build_kwargs(self):
+            return {}
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=DummyRunner()
+    )
+
+    logging.info("%s", task.config_space)
+
+    for tuner_class in [autotvm.tuner.RandomTuner,
+                        autotvm.tuner.GridSearchTuner,
+                        autotvm.tuner.GATuner,
+                        autotvm.tuner.XGBTuner]:
+        tuner = tuner_class(task)
+        tuner.tune(n_trial=10, measure_option=measure_option)
+        assert tuner.best_flops > 1
+
+def test_check_correctness():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(check_correctness=True)
+    )
+
+    def _callback_correct(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == 0
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_correct])
+
+    # a bad template
+    n = 128
+    target = tvm.target.create("llvm -device=bad_device")
+    task = autotvm.task.create(bad_matmul, args=(n, n, n, 'float32'), target=target)
+
+    def _callback_wrong(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            assert res.error_no == MeasureErrorNo.WRONG_ANSWER
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=2, measure_option=measure_option,
+               callbacks=[_callback_wrong])
+
+
+def test_min_repeat_ms():
+    task, target = get_sample_task()
+
+    measure_option = autotvm.measure_option(
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(number=1, min_repeat_ms=100)
+    )
+
+    def _callback(tuner, measure_inputs, measure_results):
+        for inp, res in zip(measure_inputs, measure_results):
+            if res.error_no != 0:
+                continue
+
+            assert 1000 * np.mean(res.costs) * \
+                   measure_option['runner'].cur_number >= 100
+
+    tuner = autotvm.tuner.RandomTuner(task)
+    tuner.tune(n_trial=5, measure_option=measure_option,
+               callbacks=[_callback])
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+
+    test_task_tuner_without_measurement()
+    test_check_correctness()
+    test_min_repeat_ms()
diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py
index 61ef97d0a2bf..4cce2735c4a2 100644
--- a/topi/recipe/gemm/gemm_int8.py
+++ b/topi/recipe/gemm/gemm_int8.py
@@ -137,12 +137,15 @@ def block_size_filter(entity):
     print(task.config_space)
 
     measure_option = autotvm.measure_option(
-        measure_func='local', number=10, n_parallel=8, timeout=20)
+        builder=autotvm.LocalBuilder(),
+        runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+    )
+
     log_name = 'gemm_int8.log'
     if DO_TUNING:
         tuner = autotvm.tuner.XGBTuner(task)
         tuner.tune(n_trial=1000, measure_option=measure_option,
-               callbacks=[autotvm.callback.log_to_file(log_name)])
+                   callbacks=[autotvm.callback.log_to_file(log_name)])
 
         dispatch_context = autotvm.apply_history_best(log_name)
         best_config = dispatch_context.query(task.target, task.workload)
diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py
index 3ff26a05064d..3cd63d03dfd9 100644
--- a/tutorials/autotvm/tune_conv2d_cuda.py
+++ b/tutorials/autotvm/tune_conv2d_cuda.py
@@ -164,12 +164,12 @@ def conv2d_no_batching(N, H, W, CO, CI, KH, KW, stride, padding):
                            target='cuda')
 print(task.config_space)
 
-# use local gpu, measure 5 times for every config to reduce variance
-# run 8 parallel threads for compilation
-measure_option = autotvm.measure_option('local',
-                                        number=5,
-                                        n_parallel=8,
-                                        timeout=20)
+# use local gpu, measure 10 times for every config to reduce variance
+# The timeout of compiling a program is 10 seconds, the timeout for running is 4 seconds
+measure_option = autotvm.measure_option(
+    builder=autotvm.LocalBuilder(),
+    runner=autotvm.LocalRunner(repeat=3, min_repeat_ms=100, timeout=4)
+)
 
 # begin tuning, log records to file `conv2d.log`
 tuner = autotvm.tuner.XGBTuner(task)
diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py
index a080681f4ca3..8ab7bb2f176c 100644
--- a/tutorials/autotvm/tune_nnvm_arm.py
+++ b/tutorials/autotvm/tune_nnvm_arm.py
@@ -65,15 +65,20 @@ def get_network(name, batch_size):
     input_shape = (batch_size, 3, 224, 224)
     output_shape = (batch_size, 1000)
 
-    if name =='resnet-18':
-        net, params = nnvm.testing.resnet.get_workload(num_layers=18, batch_size=batch_size)
-    elif name =='mobilenet':
+    if "resnet" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif "vgg" in name:
+        n_layer = int(name.split('-')[1])
+        net, params = nnvm.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size)
+    elif name == 'mobilenet':
         net, params = nnvm.testing.mobilenet.get_workload(batch_size=batch_size)
-    elif name =='squeezenet v1.1':
+    elif name == 'squeezenet_v1.1':
         net, params = nnvm.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1')
-    elif name =='vgg-16':
-        net, params = nnvm.testing.vgg.get_workload(num_layers=16, batch_size=batch_size)
-    elif name =='custom':
+    elif name == 'inception_v3':
+        input_shape = (1, 3, 299, 299)
+        net, params = nnvm.testing.inception_v3.get_workload(batch_size=batch_size)
+    elif name == 'custom':
         # an example for custom network
         from nnvm.testing import utils
         net = nnvm.sym.Variable('data')
@@ -92,6 +97,7 @@ def get_network(name, batch_size):
 
     return net, params, input_shape, output_shape
 
+
 #################################################################
 # Start RPC Tracker
 # -----------------
@@ -158,6 +164,8 @@ def get_network(name, batch_size):
 #    rk3399       2      2     0
 #    rpi3b        11     11    0
 #    ----------------------------------
+#
+# You can register multiple devices to the tracker to accelerate the measurement in tuning.
 
 ###########################################
 # Set Tuning Options
@@ -184,34 +192,30 @@ def get_network(name, batch_size):
 dtype = 'float32'
 
 tuning_option = {
-   'log_filename': log_file,
-
-   'tuner': 'xgb',
-   'n_trial': 1000,
-   'early_stopping': 250,
-
-   'measure_option': autotvm.measure_option(
-       autotvm.measure.rpc(device_key, host='localhost', port=9190),
-       number=4,
-       n_parallel=1,
-       timeout=10,
-       build_func='ndk' if use_android else 'default',
-   ),
+    'log_filename': log_file,
+
+    'tuner': 'xgb',
+    'n_trial': 1000,
+    'early_stopping': 400,
+
+    'measure_option': autotvm.measure_option(
+        builder=autotvm.LocalBuilder(
+            build_func='ndk' if use_android else 'default'),
+        runner=autotvm.RPCRunner(
+            device_key, host='localhost', port=9190,
+            number=5,
+            timeout=4,
+        ),
+    ),
 }
 
 ####################################################################
 #
 # .. note:: How to set tuning options
 #
-#   In general, the default value provided here works well. It is the same
-#   value that we used to generate pre-tuned parameters.
-#   If you have multiple devices, you can set :code:`n_parallel` to
-#   the number of devices you have. (e.g. set it to 3 if you register 3 rk3399
-#   boards to the tracker).
+#   In general, the default value provided here works well.
 #   If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger,
 #   which makes the tuning run longer.
-#   If your device is very slow or a single conv2d operator in your network has large FLOPs,
-#   consider setting timeout larger.
 #
 
 ###################################################################
@@ -219,7 +223,7 @@ def get_network(name, batch_size):
 # ------------
 # Now we can extract tuning tasks from the network and begin tuning.
 # Here we provide a simple utility function to tune a list of tasks.
-# This function is just an initial implementation which tune them in sequential order.
+# This function is just an initial implementation which tunes them in sequential order.
 # Later we will bring more sophisticated tuner scheduler.
 
 # You can skip the implementation of this function for this tutorial.
@@ -236,7 +240,9 @@ def tune_tasks(tasks,
             try:  # try winograd template
                 tsk = autotvm.task.create(tasks[i].name, tasks[i].args,
                                           tasks[i].target, tasks[i].target_host, 'winograd')
-                tasks.append(tsk)
+                input_channel = tsk.workload[1][1]
+                if input_channel >= 64:
+                    tasks[i] = tsk
             except Exception:
                 pass
 
@@ -245,8 +251,8 @@ def tune_tasks(tasks,
     if os.path.exists(tmp_log_file):
         os.remove(tmp_log_file)
 
-    for i, tsk in enumerate(tasks):
-        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))
+    for i, tsk in enumerate(reversed(tasks)):
+        prefix = "[Task %2d/%2d] " % (i+1, len(tasks))
 
         # create tuner
         if tuner == 'xgb' or tuner == 'xgb-rank':
@@ -280,7 +286,7 @@ def tune_tasks(tasks,
 ########################################################################
 # Finally we launch tuning jobs and evaluate the end-to-end performance.
 
-def tune_and_evaluate():
+def tune_and_evaluate(tuning_opt):
     # extract workloads from nnvm graph
     print("Extract tasks...")
     net, params, input_shape, out_shape = get_network(network, batch_size=1)
@@ -290,19 +296,18 @@ def tune_and_evaluate():
 
     # run tuning tasks
     print("Tuning...")
-    tune_tasks(tasks, **tuning_option)
+    tune_tasks(tasks, **tuning_opt)
 
     # compile kernels with history best records
     with autotvm.apply_history_best(log_file):
         print("Compile...")
         with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']):
             graph, lib, params = nnvm.compiler.build(
-                net, target=target,
-                shape={'data': input_shape}, params=params, dtype=dtype)
+                net, target=target, shape={'data': input_shape}, params=params, dtype=dtype)
 
         # export library
         tmp = tempdir()
-        if tuning_option['measure_option']['build_func'] == 'ndk': # for android
+        if use_android:
             from tvm.contrib import ndk
             filename = "net.so"
             lib.export_library(tmp.relpath(filename), ndk.create_shared)
@@ -312,8 +317,7 @@ def tune_and_evaluate():
 
         # upload module to device
         print("Upload...")
-        remote = autotvm.measure.request_remote(device_key,
-                                                tracker_addr=('localhost', 9190),
+        remote = autotvm.measure.request_remote(device_key, 'localhost', 9190,
                                                 timeout=10000)
         remote.upload(tmp.relpath(filename))
         rlib = remote.load_module(filename)
@@ -328,47 +332,44 @@ def tune_and_evaluate():
 
         # evaluate
         print("Evaluate inference time cost...")
-        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=10)
-        prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
+        ftimer = module.module.time_evaluator("run", ctx, number=8, repeat=3)
+        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
         print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
               (np.mean(prof_res), np.std(prof_res)))
 
 # We do not run the tuning in our webpage server since it takes too long.
 # Uncomment the following line to run by yourself.
-# tune_and_evaluate()
+
+# tune_and_evaluate(tuning_option)
 
 ######################################################################
 # Sample Output
 # -------------
-# The tuning needs to train xgboost models and use them for prediction.
+# The tuning needs to compile many programs and extract features from them.
 # So a high performance CPU is recommended.
-# It takes about 2 hours on a 32T AMD Ryzen CPU.
-# One sample output is
+# One sample output is listed below.
+# It takes about 2 hours on a 32T AMD Ryzen Threadripper.
 #
 # .. code-block:: bash
 #
 #    Extract tasks...
 #    Tuning...
-#    [Task  1/16]  Current/Best:   18.85/  19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done.
-#    [Task  2/16]  Current/Best:   16.10/  23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done.
-#    [Task  3/16]  Current/Best:    5.49/  13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done.
-#    [Task  4/16]  Current/Best:   10.07/  20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done.
-#    [Task  5/16]  Current/Best:   11.50/  15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done.
-#    [Task  6/16]  Current/Best:   10.76/  23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done.
-#    [Task  7/16]  Current/Best:   12.71/  22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done.
-#    [Task  8/16]  Current/Best:    8.60/  17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done.
-#    [Task  9/16]  Current/Best:   15.37/  23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done.
-#    [Task 10/16]  Current/Best:    6.62/  23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done.
-#    [Task 11/16]  Current/Best:    1.85/  21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done.
-#    [Task 12/16]  Current/Best:   15.41/  24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done.
-#    [Task 13/16]  Current/Best:   17.96/  25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done.
-#    [Task 14/16]  Current/Best:   14.81/  31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done.
-#    [Task 15/16]  Current/Best:   24.39/  40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done.
-#    [Task 16/16]  Current/Best:    9.42/  49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done.
+#    [Task  1/12]  Current/Best:   22.37/  52.19 GFLOPS | Progress: (544/1000) | 406.59 s Done.
+#    [Task  2/12]  Current/Best:    6.51/  18.77 GFLOPS | Progress: (608/1000) | 325.05 s Done.
+#    [Task  3/12]  Current/Best:    4.67/  24.87 GFLOPS | Progress: (480/1000) | 372.31 s Done.
+#    [Task  4/12]  Current/Best:   11.35/  46.83 GFLOPS | Progress: (736/1000) | 602.39 s Done.
+#    [Task  5/12]  Current/Best:    1.01/  19.80 GFLOPS | Progress: (448/1000) | 262.16 s Done.
+#    [Task  6/12]  Current/Best:    2.47/  23.76 GFLOPS | Progress: (672/1000) | 563.85 s Done.
+#    [Task  7/12]  Current/Best:   14.57/  33.97 GFLOPS | Progress: (544/1000) | 465.15 s Done.
+#    [Task  8/12]  Current/Best:    1.13/  17.65 GFLOPS | Progress: (576/1000) | 365.08 s Done.
+#    [Task  9/12]  Current/Best:   14.45/  22.66 GFLOPS | Progress: (928/1000) | 724.25 s Done.
+#    [Task 10/12]  Current/Best:    3.22/  15.36 GFLOPS | Progress: (864/1000) | 564.27 s Done.
+#    [Task 11/12]  Current/Best:   11.03/  32.23 GFLOPS | Progress: (736/1000) | 635.15 s Done.
+#    [Task 12/12]  Current/Best:    8.00/  21.65 GFLOPS | Progress: (1000/1000) | 1111.81 s Done.
 #    Compile...
 #    Upload...
 #    Evaluate inference time cost...
-#    Mean inference time (std dev): 157.29 ms (1.74 ms)
+#    Mean inference time (std dev): 162.59 ms (0.06 ms)
 
 ######################################################################
 #
diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py
index 8d4aab0b2c27..5b3ddaaf644a 100644
--- a/tutorials/autotvm/tune_simple_template.py
+++ b/tutorials/autotvm/tune_simple_template.py
@@ -271,9 +271,12 @@ def matmul(N, L, M, dtype):
 logging.getLogger('autotvm').setLevel(logging.DEBUG)
 logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))
 
-# use local cpu, measure 5 times for every config to reduce variance
-measure_option = autotvm.measure_option('local',
-                                        number=5)
+# There are two steps for measuring a config: build and run.
+# By default, we use all CPU cores to compile programs, then measure them sequentially.
+# We measure 5 times and take average to reduce variance.
+measure_option = autotvm.measure_option(
+    builder='local',
+    runner=autotvm.LocalRunner(number=5))
 
 # begin tuning, log records to file `matmul.log`
 tuner = autotvm.tuner.RandomTuner(task)

From 7c4dd0bda75a14053f91ae4fa234d3c33f8b7fff Mon Sep 17 00:00:00 2001
From: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Date: Thu, 30 Aug 2018 02:29:53 +0900
Subject: [PATCH 75/77] [TOPI] add nn schedulers for HLS backends (#1663)

* [TOPI] add nn schedulers for HLS backends

* fix pylint

* fix topi transform test
---
 nnvm/python/nnvm/testing/config.py       |   4 +-
 src/codegen/codegen_aocl.cc              |   2 +
 topi/python/topi/hls/__init__.py         |   1 +
 topi/python/topi/hls/nn.py               | 407 +++++++++++++++++++++++
 topi/tests/python/common.py              |   2 +-
 topi/tests/python/test_topi_pooling.py   |   6 +-
 topi/tests/python/test_topi_reduce.py    |   4 +-
 topi/tests/python/test_topi_relu.py      |   4 +-
 topi/tests/python/test_topi_softmax.py   |   4 +-
 topi/tests/python/test_topi_transform.py |  20 +-
 10 files changed, 437 insertions(+), 17 deletions(-)
 create mode 100644 topi/python/topi/hls/nn.py

diff --git a/nnvm/python/nnvm/testing/config.py b/nnvm/python/nnvm/testing/config.py
index 0eab3e6b3389..bf22ea7e3887 100644
--- a/nnvm/python/nnvm/testing/config.py
+++ b/nnvm/python/nnvm/testing/config.py
@@ -10,5 +10,5 @@ def ctx_list():
     device_list = (device_list.split(",") if device_list
                    else ["llvm", "cuda"])
     device_list = set(device_list)
-    res = [("llvm", tvm.cpu(0)), ("cuda", tvm.gpu(0))]
-    return [x for x in res if x[1].exist and x[0] in device_list]
+    res = [(device, tvm.context(device, 0)) for device in device_list]
+    return [x for x in res if x[1].exist]
diff --git a/src/codegen/codegen_aocl.cc b/src/codegen/codegen_aocl.cc
index 6ae89fecf6bf..d9167a7aadcd 100644
--- a/src/codegen/codegen_aocl.cc
+++ b/src/codegen/codegen_aocl.cc
@@ -33,6 +33,8 @@ runtime::Module BuildAOCL(Array<LoweredFunc> funcs, std::string target_str,
 
   // Compile the .cl file.
   std::string cmd = "aoc aocl.cl";
+  // AOCL supports fp64.
+  cmd += " -Dcl_khr_fp64";
   Target target = Target::create(target_str);
   if (target->device_name != "") {
     cmd += " -board=" + target->device_name;
diff --git a/topi/python/topi/hls/__init__.py b/topi/python/topi/hls/__init__.py
index 69b80514ff56..65f091fc9916 100644
--- a/topi/python/topi/hls/__init__.py
+++ b/topi/python/topi/hls/__init__.py
@@ -3,3 +3,4 @@
 from __future__ import absolute_import as _abs
 
 from .injective import schedule_injective, schedule_elemwise, schedule_broadcast
+from .nn import *
diff --git a/topi/python/topi/hls/nn.py b/topi/python/topi/hls/nn.py
new file mode 100644
index 000000000000..8c986d7a5663
--- /dev/null
+++ b/topi/python/topi/hls/nn.py
@@ -0,0 +1,407 @@
+# pylint: disable=invalid-name,unused-variable,unused-argument
+"""HLS nn operators"""
+from __future__ import absolute_import as _abs
+import tvm
+from .. import tag
+from .. import generic
+
+
+def _schedule_conv2d(outs):
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_injective(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule conv2d
+        elif OP.tag.find("conv2d") >= 0:
+            Conv2d = OP.output(0)
+            if not Conv2d.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Conv2d].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_conv2d_nchw.register(["hls"])
+def schedule_conv2d_nchw(outs):
+    """Schedule for conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_nhwc.register(["hls"])
+def schedule_conv2d_nhwc(outs):
+    """Schedule for conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_NCHWc.register(["hls"])
+def schedule_conv2d_NCHWc(num_filter, kernel_size, strides,
+                          padding, layout, out_layout, outs):
+    """Schedule for conv2d_NCHW[x]c
+
+    Parameters
+    ----------
+    num_filter : int
+        The number of filters, i.e., the number of output channels.
+
+    kernel_size : tuple of int
+        (kernel_height, kernel_width)
+
+    strides : tuple of int
+        (stride_of_height, stride_of_width)
+
+    padding : tuple of int
+        (pad_of_height, pad_of_width)
+
+    layout : str
+        Input data layout
+
+    out_layout : str
+        Output data layout
+
+    outs : Array of Tensor
+        The computation graph description of conv2d_NCHWc
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch : Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_conv2d_transpose_nchw.register(["hls"])
+def schedule_conv2d_transpose_nchw(outs):
+    """Schedule for conv2d_transpose_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of conv2d_transpose_nchw
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nchw.register(["hls"])
+def schedule_depthwise_conv2d_nchw(outs):
+    """Schedule for depthwise_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_depthwise_conv2d_nhwc.register(["hls"])
+def schedule_depthwise_conv2d_nhwc(outs):
+    """Schedule for depthwise_conv2d_nhwc
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of depthwise_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+@generic.schedule_bitserial_conv2d_nchw.register(["hls"])
+def schedule_bitserial_conv2d_nchw(outs):
+    """Schedule for bitserial_conv2d_nchw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nchw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_bitserial_conv2d_nhwc.register(["hls"])
+def schedule_bitserial_conv2d_nhwc(outs):
+    """Schedule for bitserial_conv2d_nhwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of bitserial_conv2d_nhwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _schedule_conv2d(outs)
+
+
+@generic.schedule_reduce.register(["hls"])
+def schedule_reduce(outs):
+    """Schedule for reduction
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of reduce
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        elif OP.tag in ["comm_reduce", "comm_reduce_idx"]:
+            if OP.tag == "comm_reduce":
+                Reduce = OP.output(0)
+            else:
+                Reduce = OP.input_tensors[0]
+            if not Reduce.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Reduce].compute_at(s[Out], s[Out].op.axis[0])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    fused = s[outs[0]].fuse()
+    px, x = s[outs[0]].split(fused, nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_softmax.register(["hls"])
+def schedule_softmax(outs):
+    """Schedule for softmax
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of softmax
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    softmax = outs[0]
+    max_elem = softmax.op.input_tensors[1]
+    expsum = softmax.op.input_tensors[2]
+
+    s[expsum].compute_at(s[softmax], s[softmax].op.axis[1])
+    s[max_elem].compute_at(s[softmax], s[softmax].op.axis[1])
+
+    px, x = s[softmax].split(softmax.op.axis[0], nparts=1)
+    s[softmax].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_dense.register(["hls"])
+def schedule_dense(outs):
+    """Schedule for dense
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of dense
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule dense
+        elif OP.tag == 'dense':
+            Dense = OP.output(0)
+            if not Dense.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Dense].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_pool.register(["hls"])
+def schedule_pool(outs, layout):
+    """Schedule for pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule pool
+        elif OP.tag.startswith('pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
+
+
+@generic.schedule_global_pool.register(["hls"])
+def schedule_global_pool(outs):
+    """Schedule for global pool
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of global pool
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
+    s = tvm.create_schedule([x.op for x in outs])
+    tvm.schedule.AutoInlineInjective(s)
+
+    def traverse(OP):
+        """Internal traverse function"""
+        # inline all one-to-one-mapping operators except the last stage (output)
+        if tag.is_broadcast(OP.tag):
+            if OP not in s.outputs:
+                s[OP].compute_inline()
+            for tensor in OP.input_tensors:
+                if tensor.op.input_tensors:
+                    traverse(tensor.op)
+        # schedule global_pool
+        elif OP.tag.startswith('global_pool'):
+            Pool = OP.output(0)
+            if not Pool.op in s.outputs:
+                Out = outs[0].op.output(0)
+                s[Pool].compute_at(s[Out], s[Out].op.axis[1])
+        else:
+            raise RuntimeError("Unsupported operator: %s" % OP.tag)
+
+    traverse(outs[0].op)
+
+    px, x = s[outs[0]].split(outs[0].op.axis[0], nparts=1)
+    s[outs[0]].bind(px, tvm.thread_axis("pipeline"))
+    return s
diff --git a/topi/tests/python/common.py b/topi/tests/python/common.py
index d992be9292fc..820cb561c0c3 100644
--- a/topi/tests/python/common.py
+++ b/topi/tests/python/common.py
@@ -9,4 +9,4 @@ def get_all_backend():
         A list of all supported targets
     """
     return ['llvm', 'cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx',
-            'llvm -device=arm_cpu']
+            'llvm -device=arm_cpu', 'aocl_sw_emu']
diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py
index b87795743c4c..578adf60094a 100644
--- a/topi/tests/python/test_topi_pooling.py
+++ b/topi/tests/python/test_topi_pooling.py
@@ -5,6 +5,8 @@
 import math
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_pad=True):
     iw = ih
     kw = kh
@@ -64,7 +66,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_pool():
@@ -109,7 +111,7 @@ def check_device(device):
         f(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']:
+    for device in get_all_backend():
         check_device(device)
 
 def test_global_pool():
diff --git a/topi/tests/python/test_topi_reduce.py b/topi/tests/python/test_topi_reduce.py
index 331498deb10c..0be652948060 100644
--- a/topi/tests/python/test_topi_reduce.py
+++ b/topi/tests/python/test_topi_reduce.py
@@ -4,6 +4,8 @@
 import tvm
 import topi
 
+from common import get_all_backend
+
 def _my_npy_argmax(arr, axis, keepdims):
     if not keepdims:
         return arr.argmax(axis=axis)
@@ -90,7 +92,7 @@ def check_device(device):
                 np.testing.assert_allclose(out_tvm_val, in_npy_map.min(axis=axis), 1E-3, 1E-3)
         else:
             np.testing.assert_allclose(out_tvm.asnumpy(), out_npy, 1E-3, 1E-3)
-    for device in ["cuda", "opencl", "metal", "llvm", "rocm", "vulkan", "nvptx"]:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py
index 7c75a9b08975..3e38e707a6da 100644
--- a/topi/tests/python/test_topi_relu.py
+++ b/topi/tests/python/test_topi_relu.py
@@ -5,6 +5,8 @@
 import topi
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_relu(m, n):
     A = tvm.placeholder((m, n), name='A')
     B = topi.nn.relu(A)
@@ -27,7 +29,7 @@ def check_device(device):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'sdaccel']:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_softmax.py b/topi/tests/python/test_topi_softmax.py
index f12070695220..cad30fa00e5b 100644
--- a/topi/tests/python/test_topi_softmax.py
+++ b/topi/tests/python/test_topi_softmax.py
@@ -7,6 +7,8 @@
 import logging
 from topi.util import get_const_tuple
 
+from common import get_all_backend
+
 def verify_softmax(m, n, dtype="float32"):
     A = tvm.placeholder((m, n), dtype=dtype, name='A')
     B = topi.nn.softmax(A)
@@ -63,7 +65,7 @@ def check_device(device):
         foo(a, b)
         np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
 
-    for device in ["cuda", "opencl", "metal", "rocm", "vulkan", "nvptx"]:
+    for device in get_all_backend():
         check_device(device)
 
 
diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py
index 4788d758cf45..123df331e174 100644
--- a/topi/tests/python/test_topi_transform.py
+++ b/topi/tests/python/test_topi_transform.py
@@ -3,6 +3,8 @@
 import tvm
 import topi
 
+from common import get_all_backend
+
 def verify_expand_dims(in_shape, out_shape, axis, num_newaxis):
     A = tvm.placeholder(shape=in_shape, name="A")
     B = topi.expand_dims(A, axis, num_newaxis)
@@ -22,7 +24,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -45,7 +47,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -68,7 +70,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -96,7 +98,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 def verify_concatenate(shapes, axis):
@@ -121,7 +123,7 @@ def check_device(device):
         foo(*(data_nds + [out_nd]))
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -146,7 +148,7 @@ def check_device(device):
         for out_nd, out_npy in zip(out_nds, out_npys):
             np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]:
+    for device in get_all_backend():
         check_device(device)
 
 
@@ -204,7 +206,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "cuda", "opencl", "sdaccel"]:
+    for device in ["llvm", "cuda", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def verify_take(src_shape, indices_src, axis=None):
@@ -243,7 +245,7 @@ def check_device(device):
         foo(data_nd, indices_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npys)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def verify_strided_slice(in_shape, begin, end, stride=None):
@@ -270,7 +272,7 @@ def check_device(device):
         foo(data_nd, out_nd)
         np.testing.assert_allclose(out_nd.asnumpy(), out_npy)
 
-    for device in ["llvm", "opencl", "sdaccel"]:
+    for device in ["llvm", "opencl", "sdaccel", "aocl_sw_emu"]:
         check_device(device)
 
 def test_strided_slice():

From 9f99a4fac1951cdbcc63ae2b1e79184aa9c1e10b Mon Sep 17 00:00:00 2001
From: Wuwei Lin <vincentl13x@gmail.com>
Date: Thu, 30 Aug 2018 00:47:44 -0500
Subject: [PATCH 76/77] Fix incorrect doc in conv2d_nhwc_python (#1677)

---
 topi/python/topi/testing/conv2d_nhwc_python.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/topi/python/topi/testing/conv2d_nhwc_python.py b/topi/python/topi/testing/conv2d_nhwc_python.py
index 461d7a6a0e06..a872bddab09b 100644
--- a/topi/python/topi/testing/conv2d_nhwc_python.py
+++ b/topi/python/topi/testing/conv2d_nhwc_python.py
@@ -13,7 +13,7 @@ def conv2d_nhwc_python(a_np, w_np, stride, padding):
         4-D with shape [batch, in_height, in_width, in_channel]
 
     w_np : numpy.ndarray
-        4-D with shape [num_filter, filter_height, filter_width, in_channel]
+        4-D with shape [filter_height, filter_width, in_channel, num_filter]
 
     stride : int or a list/tuple of two ints
         Stride size, or [stride_height, stride_width]

From 0c523787297039ce00b320c1d32e022e61e97ac2 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <mercy_zheng@sjtu.edu.cn>
Date: Wed, 29 Aug 2018 23:09:33 -0700
Subject: [PATCH 77/77] [PASS] Enhance gpu verify pass (#1660)

---
 src/pass/verify_gpu_code.cc                   | 16 ++++++++++++-
 .../unittest/test_pass_verify_gpu_code.py     | 24 +++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/pass/verify_gpu_code.cc b/src/pass/verify_gpu_code.cc
index 363b7c4cf7cc..70908eb43d6b 100644
--- a/src/pass/verify_gpu_code.cc
+++ b/src/pass/verify_gpu_code.cc
@@ -86,17 +86,29 @@ class GPUCodeVerifier : public IRVisitor {
       // record the number of threads in a block
       std::string name = var.get()->name_hint;
       if (name == "threadIdx.x" || name == "threadIdx.y" || name == "threadIdx.z") {
+        size_t length = static_cast<size_t>(extent->value);
         if (!visited_threads_.count(name)) {
           visited_threads_.insert(name);
-          size_t length = static_cast<size_t>(extent->value);
           thread_per_block_ *= length;
 
           if (name == "threadIdx.x") {
             valid_ &= length <= max_thread_x_;
+            thread_x_extent_ = length;
           } else if (name == "threadIdx.y") {
             valid_ &= length <= max_thread_y_;
+            thread_y_extent_ = length;
           } else if (name == "threadIdx.z") {
             valid_ &= length <= max_thread_z_;
+            thread_z_extent_ = length;
+          }
+        } else {
+          // the thread should be bound to axes with the same length
+          if (name == "threadIdx.x") {
+            valid_ &= length == thread_x_extent_;
+          } else if (name == "threadIdx.y") {
+            valid_ &= length == thread_y_extent_;
+          } else if (name == "threadIdx.z") {
+            valid_ &= length == thread_z_extent_;
           }
         }
       }
@@ -111,6 +123,8 @@ class GPUCodeVerifier : public IRVisitor {
   std::unordered_set<const tvm::Variable *> visited_shared_buffers_;
   std::unordered_set<std::string> visited_threads_;
 
+  size_t thread_x_extent_, thread_y_extent_, thread_z_extent_;
+
   size_t local_memory_per_block_;
   size_t shared_memory_per_block_;
   size_t thread_per_block_;
diff --git a/tests/python/unittest/test_pass_verify_gpu_code.py b/tests/python/unittest/test_pass_verify_gpu_code.py
index 6fc0387cf144..e3884a727852 100644
--- a/tests/python/unittest/test_pass_verify_gpu_code.py
+++ b/tests/python/unittest/test_pass_verify_gpu_code.py
@@ -162,8 +162,32 @@ def test_multiple_kernels():
             tvm.build(s, [A, C], target)
         assert valid[0]
 
+def test_wrong_bind():
+    N = 1024
+
+    A = tvm.placeholder((N, N-1), name='A')
+    B = tvm.compute((N, N-1), lambda i, j: A[i, j])
+
+    s = tvm.create_schedule([B.op])
+
+    # bind one thread axis name to two loop axes with different lengths (N and N-1)
+    s[B].bind(s[B].op.axis[0], tvm.thread_axis("threadIdx.x"))
+    s[B].bind(s[B].op.axis[1], tvm.thread_axis("threadIdx.x"))
+
+    for target in ['opencl', 'cuda']:
+        if not tvm.context(target).exist:
+            continue  # skip targets whose context does not exist here
+
+        valid = [None]  # one-element list handed to get_verify_pass, read back below
+        with tvm.build_config(**{"add_lower_pass": [
+                (2, get_verify_pass(valid, max_threads_per_block=N*N))]}):
+            tvm.build(s, [A, B], target)
+        assert not valid[0]  # mismatched threadIdx.x extents must be reported invalid
+
+
 if __name__ == "__main__":
     test_local_memory()
     test_shared_memory()
     test_num_thread()
     test_multiple_kernels()
+    test_wrong_bind()