From 1ed28aeb5967f8705d28f9d4a5ddcb834843e4a2 Mon Sep 17 00:00:00 2001 From: masahi Date: Thu, 9 Aug 2018 02:07:44 +0900 Subject: [PATCH 01/77] [NNVM] Enhance operator fusion for more element wise patterns (#1548) --- nnvm/src/compiler/graph_fuse.cc | 97 ++++++++++++++++++++ nnvm/tests/python/compiler/test_op_fusion.py | 44 ++++++++- topi/python/topi/arm_cpu/conv2d.py | 5 +- topi/python/topi/util.py | 26 ++++-- 4 files changed, 158 insertions(+), 14 deletions(-) diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index d4e668972593..247224aa9bfe 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -161,6 +161,103 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { } } } + + /* + Above algorithm will not fuse a node whose output is fed to more than one + child node. This is because in general, it does not make sense to fuse multiple + children branches with their parent, as in the following example. + + conv2d + / | \ + / | \ + op op op + | | | + | | | + + However, when all children branches meet at a certain node, there is a possibility for + further operator fusion. For example, all nodes in the following subgraph can be fused + into a single node, if three 'in-between' nodes and the bottom node are all element wise + operation. + + conv2d + / | \ + / | \ + op op op + \ | / + \ | / + elemwise add + | + + This pattern is not uncommon. For example, it arises when conv2d op is followed by exponential + linear unit. If bias add and batch normalization are also present, they can be fused as well. + + In fact, above fusion algorithm already fuses three in-between nodes and the element wise + add node in the figure above. The following code fuses the conv2d node with the already + fused children nodes. The following patterns are supported. + + * Any number of child nodes from the top node + * The path from the top node to bottom node can contain any number of element wise ops. 
+ + The only restriction is that in-between nodes cannot have more than one child. + + The overview of the algorithm below is as follows: + + 1. Check if all children nodes are fused into a single op by the existing fusion algorithm + 2. Fuse the parent node to children nodes, and update its group id to be the children's group id + 3. If the parent node originally belongs to another group (for example, conv + batch norm), + propagate the new group id to a grand parent and upward + */ + if (opt_level >= 1) { + std::vector > children_group_ids(idx.num_nodes()); + std::vector > node_ids_per_group(idx.num_nodes()); + for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + CHECK_NE(group_vec[nid], -1); + node_ids_per_group[group_vec[nid]].push_back(nid); + if (inode.inputs.size() != 1) continue; + const uint32_t parent_nid = inode.inputs[0].node_id; + // if parent node has more than one child, record each child's group id. 
+ if (ref_count[parent_nid] > 1) children_group_ids[parent_nid].push_back(group_vec[nid]); + } + + std::vector new_group_id(idx.num_nodes(), -1); + for (uint32_t nid = idx.num_nodes() - 1; nid != 0; --nid) { + if (new_group_id[group_vec[nid]] != -1) { + // propagate new group id from child + group_vec[nid] = new_group_id[group_vec[nid]]; + } + TOpPattern pt = op_pattern.get(idx[nid].source->op(), kOpaque); + if (pt == kOpaque) continue; + const auto& group_ids = children_group_ids[nid]; + if (group_ids.size() <= 1) continue; + const uint32_t child_group_id = group_ids[0]; + const auto& children_node_ids = node_ids_per_group[child_group_id]; + + auto is_same_group_id = [child_group_id](uint32_t id) { + return id == child_group_id; + }; + auto is_fusible_pattern = [&idx](uint32_t child_nid) { + TOpPattern child_pt = op_pattern.get(idx[child_nid].source->op(), kOpaque); + return child_pt <= kBroadcast; + }; + // fuse this node with children if + // all children belong to the same group and + // all nodes in the group are element wise or broadcast op. 
+ const bool can_be_fused = std::all_of(group_ids.begin(), group_ids.end(), is_same_group_id) && + std::all_of(children_node_ids.begin(), children_node_ids.end(), is_fusible_pattern); + + if (can_be_fused) { + new_group_id[group_vec[nid]] = child_group_id; + group_vec[nid] = child_group_id; + for (uint32_t nid2 : node_ids_per_group[child_group_id]) { + pattern_vec[nid2] = pattern_vec[nid]; + master_vec[nid2] = master_vec[nid]; + } + } + } + } + g.attrs["group_root"] = std::make_shared(std::move(group_vec)); g.attrs["group_master"] = std::make_shared(std::move(master_vec)); g.attrs["pattern"] = std::make_shared(std::move(pattern_vec)); diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py index f33e18197840..8d05ae02c579 100644 --- a/nnvm/tests/python/compiler/test_op_fusion.py +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -5,7 +5,7 @@ from tvm.contrib import graph_runtime from nnvm import symbol as sym from nnvm.compiler import graph_util, graph_attr -from nnvm.testing import ctx_list +from nnvm.testing import ctx_list, utils def test_ewise_injective(): x = sym.Variable("x") @@ -77,7 +77,49 @@ def test_injective_reduce_injective(): np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) +def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): + with nnvm.compiler.build_config(opt_level=opt_level): + graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) + module = graph_runtime.create(graph, lib, ctx) + module.set_input(**params) + module.set_input("data", data) + module.run() + out = module.get_output(0, tvm.nd.empty(out_shape)) + return out.asnumpy(), graph + + +def test_fuse_conv2d_elu(): + def elu(data): + return -0.5 * sym.relu(1 - sym.exp(data)) + sym.relu(data) + + def get_sym(out_channel): + data = sym.Variable(name="data") + data = sym.conv2d(data=data, kernel_size=(3,3), channels=out_channel, padding=(1, 1), + layout="NCHW", 
kernel_layout="OIHW", use_bias=True) + data = sym.batch_norm(data) + data = elu(data) + return data + + in_channel = 8 + out_channel = 16 + size = 64 + dshape = (1, in_channel, size, size) + oshape = (1, out_channel, size, size) + data = np.random.uniform(-1, 1, dshape).astype(np.float32) + + for target, ctx in ctx_list(): + sym1 = get_sym(out_channel) + sym2 = get_sym(out_channel) + _, params1 = utils.create_workload(sym1, 1, dshape[1:], seed=0) + _, params2 = utils.create_workload(sym2, 1, dshape[1:], seed=0) + output1, g1 = build_and_run(sym1, params1, data, oshape, target, ctx, opt_level=2) + output2, g2 = build_and_run(sym2, params2, data, oshape, target, ctx, opt_level=0) + np.testing.assert_allclose(output1, output2, rtol=1e-5, atol=1e-5) + # data, conv weight, bias, batch norm gamma, batch norm beta, conv op + assert g1.index.num_nodes == 6 + if __name__ == "__main__": test_injective_reduce_injective() test_ewise_injective() test_conv_ewise_injective() + test_fuse_conv2d_elu() diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index e28c08cb93ee..f5dbec8e552b 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -39,11 +39,10 @@ def decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype): def schedule_conv2d_nchw_arm_cpu(cfg, outs): """TOPI schedule callback""" s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): # schedule conv2d - if 'spatial_conv_output' in op.tag and op not in scheduled_ops: + if 'spatial_conv_output' in op.tag: output = op.output(0) conv = op.input_tensors[0] @@ -65,8 +64,6 @@ def _callback(op): output = op.output(0) _schedule_winograd(cfg, s, output, outs[0]) - scheduled_ops.append(op) - traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index b5d5dd2b99ad..71e123e83475 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -5,25 
+5,33 @@ from . import tag -def traverse_inline(s, op, callback): +def traverse_inline(s, final_op, callback): """Traverse computation graph and do auto inline Parameters ---------- s: schedule The schedule - op: Operation + final_op: Operation The final output operator. callback: callable The callback function on each op """ - if tag.is_injective(op.tag): - if op not in s.outputs: - s[op].compute_inline() - for tensor in op.input_tensors: - if tensor.op.input_tensors: - traverse_inline(s, tensor.op, callback) - callback(op) + visited = set() + + def _traverse(op): + if op in visited: + return + visited.add(op) + if tag.is_injective(op.tag): + if op not in s.outputs: + s[op].compute_inline() + for tensor in op.input_tensors: + if tensor.op.input_tensors: + _traverse(tensor.op) + callback(op) + + _traverse(final_op) def prod(x): From 60da47055a1e9f2ef3ecfa6efc70a0c8d21ddf5e Mon Sep 17 00:00:00 2001 From: "Tang, Cheng" Date: Wed, 8 Aug 2018 10:11:24 -0700 Subject: [PATCH 02/77] add tvm external registry entrypoint (#1562) --- include/tvm/base.h | 6 ++++++ nnvm/src/compiler/graph_runtime.cc | 2 +- src/api/api_base.cc | 4 ++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/include/tvm/base.h b/include/tvm/base.h index d113f45352bb..1d7cf8add3ca 100644 --- a/include/tvm/base.h +++ b/include/tvm/base.h @@ -83,5 +83,11 @@ struct NodeFactoryReg ::dmlc::Registry<::tvm::NodeFactoryReg>::Get()->__REGISTER__(TypeName::_type_key) \ .set_body([]() { return std::make_shared(); }) +TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry(); + +#define TVM_EXTERNAL_REGISTER_NODE_TYPE(TypeName) \ + static DMLC_ATTRIBUTE_UNUSED ::tvm::NodeFactoryReg & __make_Node ## _ ## TypeName ## __ = \ + ::tvm::GetTVMNodeFactoryRegistry()->__REGISTER__(TypeName::_type_key) \ + .set_body([]() { return std::make_shared(); }) } // namespace tvm #endif // TVM_BASE_H_ diff --git a/nnvm/src/compiler/graph_runtime.cc b/nnvm/src/compiler/graph_runtime.cc index 
e623192258de..7301fd74117e 100644 --- a/nnvm/src/compiler/graph_runtime.cc +++ b/nnvm/src/compiler/graph_runtime.cc @@ -100,6 +100,6 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._load_param_dict") *rv = ret; }); -TVM_REGISTER_NODE_TYPE(NDArrayWrapperNode); +TVM_EXTERNAL_REGISTER_NODE_TYPE(NDArrayWrapperNode); } // namespace compiler } // namespace nnvm diff --git a/src/api/api_base.cc b/src/api/api_base.cc index cc76f6a8f50b..37970e69e24f 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -41,4 +41,8 @@ TVM_REGISTER_API("_TVMSetStream") .set_body([](TVMArgs args, TVMRetValue *ret) { TVMSetStream(args[0], args[1], args[2]); }); + +TVM_DLL::dmlc::Registry<::tvm::NodeFactoryReg > * GetTVMNodeFactoryRegistry() { + return ::dmlc::Registry<::tvm::NodeFactoryReg>::Get(); +} } // namespace tvm From 7ea06e6e4cb0d3045255c4c98928d69db32562d5 Mon Sep 17 00:00:00 2001 From: Siju Date: Wed, 8 Aug 2018 22:53:51 +0530 Subject: [PATCH 03/77] [ONNX]onnx gather bug fix (#1543) --- nnvm/python/nnvm/frontend/onnx.py | 10 +- .../python/frontend/onnx/test_forward.py | 186 ++++++++++-------- 2 files changed, 104 insertions(+), 92 deletions(-) diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index f4062c103fcf..fa26648b293a 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -489,15 +489,11 @@ def _impl_v1(cls, inputs, attr, params): class Gather(OnnxOpConverter): """ Operator converter for Gather. """ - @classmethod def _impl_v1(cls, inputs, attr, params): - axis = attr['axis'] - indices = np.array(attr['indices'], dtype='int32') - name = 'gather_indices' - gather_indices = _sym.Variable(name=name, init=indices) - params[name] = indices - return _sym.take(inputs[0], gather_indices, axis=axis) + axis = attr.get('axis', 0) + return AttrCvt(op_name='take', + extras={'axis':axis})(inputs, attr) class LRN(OnnxOpConverter): """ Operator converter for Local Response Normalization. 
diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py index f4dc3559ee16..9fb3aed2da10 100644 --- a/nnvm/tests/python/frontend/onnx/test_forward.py +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -8,21 +8,50 @@ from model_zoo import super_resolution, squeezenet1_1, lenet, resnet18_1_0 from onnx import helper, TensorProto -def get_tvm_output(model, x, target, ctx, out_shape, dtype='float32'): - new_sym, params = nnvm.frontend.from_onnx(model) - input_name = model.graph.input[0].name - shape_dict = {input_name: x.shape} - dtype_dict = {input_name: dtype} - graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params) +def get_tvm_output(graph_def, input_data, target, ctx, output_shape, output_dtype='float32'): + """ Generic function to execute and get tvm output""" + + sym, params = nnvm.frontend.from_onnx(graph_def) + target = 'llvm' + if isinstance(input_data, list): + input_names = {} + shape_dict = {} + dtype_dict = {} + for i, _ in enumerate(input_data): + input_names[i] = graph_def.graph.input[i].name + shape_dict[input_names[i]] = input_data[i].shape + dtype_dict[input_names[i]] = input_data[i].dtype + else: + input_names = graph_def.graph.input[0].name + shape_dict = {input_names: input_data.shape} + dtype_dict = {input_names: input_data.dtype} + + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, + dtype=dtype_dict, params=params) + + ctx = tvm.cpu(0) + from tvm.contrib import graph_runtime m = graph_runtime.create(graph, lib, ctx) # set inputs - m.set_input(input_name, tvm.nd.array(x.astype(dtype))) + if isinstance(input_data, list): + for i, e in enumerate(input_names): + m.set_input(input_names[i], tvm.nd.array(input_data[i].astype(input_data[i].dtype))) + else: + m.set_input(input_names, tvm.nd.array(input_data.astype(input_data.dtype))) + m.set_input(**params) + # execute m.run() # get outputs - out = m.get_output(0, 
tvm.nd.empty(out_shape, dtype)) - return out.asnumpy() - + if isinstance(output_shape, list) and isinstance(output_dtype, list): + tvm_output_list = [] + for i, s in enumerate(output_shape): + tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i])) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list + else: + tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype)) + return tvm_output.asnumpy() def get_caffe2_output(model, x, dtype='float32'): import caffe2.python.onnx.backend @@ -70,13 +99,15 @@ def test_reshape(): graph = helper.make_graph([ref_node, reshape_node], "reshape_test", - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(ref_shape))]) + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(ref_shape))]) model = helper.make_model(graph, producer_name='reshape_test') for target, ctx in ctx_list(): - x = np.random.uniform(size=in_shape) + x = np.random.uniform(size=in_shape).astype('int32') tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, 'float32') np.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -98,13 +129,15 @@ def test_reshape_like(): graph = helper.make_graph([ref_node, copy_node, reshape_node], "reshape_like_test", - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(ref_shape))]) + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(ref_shape))]) model = helper.make_model(graph, producer_name='reshape_like_test') for target, ctx in ctx_list(): - x = np.random.uniform(size=in_shape) + x = np.random.uniform(size=in_shape).astype('float32') tvm_out = 
get_tvm_output(model, x, target, ctx, ref_shape, 'float32') np.testing.assert_allclose(ref_shape, tvm_out.shape) @@ -122,31 +155,18 @@ def _test_power_iteration(x_shape, y_shape): graph = helper.make_graph([res], 'power_test', - inputs = [helper.make_tensor_value_info("x", TensorProto.FLOAT, list(x_shape)), - helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(np_res.shape))]) + inputs = [helper.make_tensor_value_info("x", + TensorProto.FLOAT, list(x_shape)), + helper.make_tensor_value_info("y", + TensorProto.FLOAT, list(y_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(np_res.shape))]) model = helper.make_model(graph, producer_name='power_test') for target, ctx in ctx_list(): - new_sym, params = nnvm.frontend.from_onnx(model) - - input_name = model.graph.input[0].name - input_name1 = model.graph.input[1].name - shape_dict = {input_name: x.shape, input_name1: y.shape} - dtype_dict = {input_name: x.dtype, input_name1: y.dtype} - - graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params) - m = graph_runtime.create(graph, lib, ctx) - # set inputs - m.set_input(input_name, tvm.nd.array(x)) - m.set_input(input_name1, tvm.nd.array(y)) - m.set_input(**params) - m.run() - # get outputs - tvm_out = m.get_output(0, tvm.nd.empty(np_res.shape, np_res.dtype)) - - np.testing.assert_allclose(np_res, tvm_out.asnumpy(), rtol=1e-5, atol=1e-5) + tvm_out = get_tvm_output(model, [x, y], target, ctx, np_res.shape) + np.testing.assert_allclose(np_res, tvm_out, rtol=1e-5, atol=1e-5) def test_power(): _test_power_iteration((1, 3), (1)) @@ -160,13 +180,15 @@ def test_squeeze(): graph = helper.make_graph([y], 'squeeze_test', - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + inputs = 
[helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) model = helper.make_model(graph, producer_name='squeeze_test') for target, ctx in ctx_list(): - x = np.random.uniform(size=in_shape) + x = np.random.uniform(size=in_shape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') np.testing.assert_allclose(out_shape, tvm_out.shape) @@ -179,44 +201,47 @@ def test_unsqueeze(): graph = helper.make_graph([y], 'squeeze_test', - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_shape))]) model = helper.make_model(graph, producer_name='squeeze_test') for target, ctx in ctx_list(): - x = np.random.uniform(size=in_shape) + x = np.random.uniform(size=in_shape).astype('float32') tvm_out = get_tvm_output(model, x, target, ctx, out_shape, 'float32') np.testing.assert_allclose(out_shape, tvm_out.shape) -def verify_gather(in_shape, indices, axis=0): - indices_src = np.array(indices, dtype="int32") - - x = np.random.uniform(size=in_shape) - out_np = np.take(x, indices_src, axis=axis) +def verify_gather(in_shape, indices, axis, dtype): + x = np.random.uniform(size=in_shape).astype(dtype) + indices = np.array(indices, dtype="int32") + out_np = np.take(x, indices, axis=axis) - y = helper.make_node("Gather", ['in'], ['out'], indices=indices, axis=axis) + y = helper.make_node("Gather", ['in', 'indices'], ['out'], axis=axis) graph = helper.make_graph([y], 'gather_test', inputs = [helper.make_tensor_value_info("in", - TensorProto.FLOAT, list(in_shape))], + TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("indices", + TensorProto.INT32, 
list(indices.shape))], outputs = [helper.make_tensor_value_info("out", - TensorProto.FLOAT, list(out_np.shape))]) - + TensorProto.FLOAT, list(out_np.shape))]) model = helper.make_model(graph, producer_name='gather_test') for target, ctx in ctx_list(): - tvm_out = get_tvm_output(model, x, target, ctx, out_np.shape, 'float32') - - np.testing.assert_allclose(out_np, tvm_out) + tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape) + np.testing.assert_allclose(out_np, tvm_out) def test_gather(): - verify_gather((4,), [1]) - verify_gather((4,), [0, 1, 2, 3]) - verify_gather((4, 2), [1], 1) - verify_gather((4, 3, 5, 6), [2, 1, 0, 0], -2) + verify_gather((4,), [1], 0, 'int32') + verify_gather((1,4), [0], 0, 'int32') + verify_gather((4,), [[[1,0],[0,1]]], 0, 'float32') + verify_gather((2,2), [[[1,0],[0,1]]], 1, 'int32') + verify_gather((3,3,3), [[[1,0]]], -1, 'int32') + verify_gather((4,3,5,6), [[2,1,0,0]], 0, 'float32') def _test_slice_iteration(indata, outdata, starts, ends, axes=None): if axes: @@ -226,8 +251,10 @@ def _test_slice_iteration(indata, outdata, starts, ends, axes=None): graph = helper.make_graph([y], 'slice_test', - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))]) + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, list(indata.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(outdata.shape))]) model = helper.make_model(graph, producer_name='slice_test') @@ -251,8 +278,10 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs): graph = helper.make_graph([y], opname+'_test', - inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))]) + inputs = [helper.make_tensor_value_info("in", + TensorProto.FLOAT, 
list(indata.shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(outdata.shape))]) model = helper.make_model(graph, producer_name=opname+'_test') @@ -278,40 +307,27 @@ def test_clip(): def test_matmul(): a_shape = (4, 3) b_shape = (3, 4) - out_shape = (4, 4) a_array = np.random.uniform(size=a_shape).astype('float32') b_array = np.random.uniform(size=b_shape).astype('float32') + out_np = np.matmul(a_array, b_array) mul_node = helper.make_node("MatMul", ["a", "b"], ["out"]) graph = helper.make_graph([mul_node], "matmul_test", - inputs = [helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)), - helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape))], - outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + inputs = [helper.make_tensor_value_info("a", + TensorProto.FLOAT, list(a_shape)), + helper.make_tensor_value_info("b", + TensorProto.FLOAT, list(b_shape))], + outputs = [helper.make_tensor_value_info("out", + TensorProto.FLOAT, list(out_np.shape))]) model = helper.make_model(graph, producer_name='matmul_test') for target, ctx in ctx_list(): - new_sym, params = nnvm.frontend.from_onnx(model) - - input_name = model.graph.input[0].name - input_name1 = model.graph.input[1].name - shape_dict = {input_name: a_array.shape, input_name1: b_array.shape} - dtype_dict = {input_name: 'float32', input_name1: 'float32'} - - graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, dtype_dict, params=params) - m = graph_runtime.create(graph, lib, ctx) - # set inputs - m.set_input(input_name, tvm.nd.array(a_array.astype('float32'))) - m.set_input(input_name1, tvm.nd.array(b_array.astype('float32'))) - m.set_input(**params) - m.run() - # get outputs - tvm_out = m.get_output(0, tvm.nd.empty(out_shape, 'float32')) - - np.testing.assert_allclose(np.matmul(a_array, b_array), tvm_out.asnumpy(), rtol=1e-5, atol=1e-5) + tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, 
out_np.shape) + np.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None): in_array = np.random.uniform(size=shape).astype(dtype) From c9f9a3f9be7db611d11b9a28476af62571af9581 Mon Sep 17 00:00:00 2001 From: Siju Date: Wed, 8 Aug 2018 22:56:10 +0530 Subject: [PATCH 04/77] l2normalization operator support for tensorflow (#1528) --- nnvm/python/nnvm/frontend/tensorflow.py | 23 ++++++++++- .../frontend/tensorflow/test_forward.py | 39 ++++++++++++++++++- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index ac47f9c3435f..d761e34c7c59 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -434,6 +434,21 @@ def _impl(inputs, attr, params): return AttrCvt(op_name='lrn')(new_inputs, attr_new) return _impl +def _sum(): + def _impl(inputs, attr, params): + axis = params.pop(inputs[1].list_output_names()[0]).asnumpy() + return AttrCvt( + op_name='sum', + extras={'axis': axis}, + transforms={'keep_dims':'keepdims'}, + ignores=['name', 'Tidx'])(inputs[0], attr) + return _impl + +def _square(): + def _impl(inputs, attr, params): + return _sym.elemwise_mul(inputs[0], inputs[0]) + return _impl + def _gather_v2(): "Tensorflow now support only gatherv2" def _impl(inputs, attr, params): @@ -651,13 +666,17 @@ def _impl(inputs, attr, params): 'Identity' : _identity(), 'MatMul' : _matmul(), 'MaxPool' : _pooling('max_pool'), + 'Add' : _elemwise('add'), + 'Sub' : _elemwise('sub'), 'Mul' : _elemwise('mul'), + 'Maximum' : _elemwise('max'), + 'Minimum' : _elemwise('min'), + 'Sum' : _sum(), + 'Square' : _square(), 'Relu' : AttrCvt('relu'), 'Reshape' : _reshape(), 'ResizeBilinear' : _resize_bilinear(), 'Softmax' : AttrCvt('softmax', {'axis': ('axis', 1)}), - 'Sub' : _elemwise('sub'), - 'Add' : _elemwise('add'), 'Rsqrt' : _rsqrt(), 'Squeeze' : _squeeze(), 'FusedBatchNorm' : 
_fused_batch_norm(), diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 96b472e8b381..495852f9e5d6 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -12,6 +12,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import graph_util from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import nn from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import math_ops @@ -948,7 +949,6 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta): sess, sess.graph.as_graph_def(add_shapes=True), ['lrn'],) - tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0') tvm_output = run_tvm_graph(graph_def, inp_array, @@ -959,6 +959,42 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta): def test_forward_lrn(): _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5) +####################################################################### +# l2_normalize +# ------------ +def _test_l2_normalize(ishape, eps, axis): + """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)""" + + inp_array = np.random.uniform(size=ishape).astype(np.float32) + inp_array.fill(1) + + with tf.Graph().as_default(): + in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder") + nn.l2_normalize(in1, + axis=axis, + epsilon=eps, + name=None, + dim=None) + + with tf.Session() as sess: + graph_def = tf.graph_util.convert_variables_to_constants( + sess, + sess.graph.as_graph_def(add_shapes=True), + ['l2_normalize'], + ) + tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0') + tvm_output = run_tvm_graph(graph_def, + inp_array, + "Placeholder", + tf_output.shape, + tf_output.dtype) + + np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) + sess.close() +def 
test_forward_l2_normalize(): + _test_l2_normalize((1, 3, 20, 20), 0.001, (0,)) + +####################################################################### # Main # ---- if __name__ == '__main__': @@ -981,3 +1017,4 @@ def test_forward_lrn(): test_forward_gather() test_forward_ptb() test_forward_lrn() + test_forward_l2_normalize() From 53d243114d3d35d27c7b5bd90fafccb4d8d9dda0 Mon Sep 17 00:00:00 2001 From: Zhi <5145158+zhiics@users.noreply.github.com> Date: Wed, 8 Aug 2018 13:51:47 -0700 Subject: [PATCH 05/77] Separate fusion and Compilation (#1564) * Separate fusion and compilation * fix description of graph_fuse.h * fix lint * fix @masahi 's comments, move fusion out of target * fix graph passing and make fused_entries singula in graph attr * fix typo * fix some comments * run test again * remove rvalue for graphfuse and graphfindfusiablegroups --- nnvm/python/nnvm/compiler/build_module.py | 4 +- nnvm/src/compiler/graph_compile.cc | 259 +++++++++++++++++ nnvm/src/compiler/graph_fuse.cc | 333 +++------------------- nnvm/src/compiler/graph_fuse.h | 80 ++++++ 4 files changed, 388 insertions(+), 288 deletions(-) create mode 100644 nnvm/src/compiler/graph_compile.cc create mode 100644 nnvm/src/compiler/graph_fuse.h diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py index fd8599bcfa93..217598c9d79a 100644 --- a/nnvm/python/nnvm/compiler/build_module.py +++ b/nnvm/python/nnvm/compiler/build_module.py @@ -298,8 +298,10 @@ def build(graph, target=None, shape=None, dtype="float32", else: graph._set_json_attr("opt_level", 0, "int") graph = graph.apply("InferShape").apply("InferType") + graph = graph.apply("GraphFindFusibleGroups") + graph = graph.apply("GraphFuse") with target: - graph = graph.apply("GraphFusePartition").apply("GraphFuseCompile") + graph = graph.apply("GraphCompile") libmod = graph_attr._move_out_module(graph, "module") # Write variable initial values into params if init_var: diff --git 
a/nnvm/src/compiler/graph_compile.cc b/nnvm/src/compiler/graph_compile.cc new file mode 100644 index 000000000000..e51730c09d66 --- /dev/null +++ b/nnvm/src/compiler/graph_compile.cc @@ -0,0 +1,259 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file graph_compile.cc + * \brief Compile a graph. It lowers the graph nodes into low level IR. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compile_engine.h" +#include "graph_fuse.h" +#include "graph_runtime.h" +#include "pattern_util.h" + +namespace nnvm { +namespace compiler { + +using namespace tvm; + +// Decorate the result of PlanMemory +// This function does two things: +// - Give separate memory to each variable. +// - Tie the memory of output/lhs in assign node properly +// so the execution of assign can have side effect. +nnvm::Graph DecorateMemoryPlan( + nnvm::Graph g, + const std::vector& assign_flag) { + const IndexedGraph& idx = g.indexed_graph(); + StorageVector storage_vec = g.MoveCopyAttr("storage_id"); + g.attrs.erase("storage_allocated_bytes"); + g.attrs.erase("storage_inplace_index"); + size_t num_not_allocated = g.MoveCopyAttr( + "storage_num_not_allocated"); + CHECK_EQ(num_not_allocated, 0U) + << "Can only build inference graph with all statically allocated memory"; + + // Reassign variable id so that they are different. + int max_id = 0; + for (size_t i = 0; i < storage_vec.size(); ++i) { + max_id = std::max(storage_vec[i] + 1, max_id); + } + for (uint32_t nid : idx.input_nodes()) { + storage_vec[idx.entry_id(nid, 0)] = max_id++; + } + // Tie up the assign node storage properly. 
+ for (uint32_t nid = 0 ; nid < idx.num_nodes(); ++nid) { + if (assign_flag[nid] == 0) continue; + const auto& inode = idx[nid]; + int var_storage_id = storage_vec[idx.entry_id(inode.inputs[0])]; + storage_vec[idx.entry_id(nid, 0)] = var_storage_id; + + if (assign_flag[nid] == 2) { + storage_vec[idx.entry_id(inode.inputs[1])] = var_storage_id; + } + } + g.attrs["storage_id"] = std::make_shared(std::move(storage_vec)); + return g; +} + +nnvm::Graph GraphCompile(const nnvm::Graph& g) { + // Get attributes from the graph. + const ShapeVector& shape_vec = g.GetAttr("shape"); + const DTypeVector& dtype_vec = g.GetAttr("dtype"); + const GroupVec& group_vec = g.GetAttr("group_root"); + const MasterVec& master_vec = g.GetAttr("group_master"); + const PatternVec& pattern_vec = g.GetAttr("pattern"); + + CHECK(g.HasAttr("fused_entry")) << "Fusion hasn't been applied yet."; + FuseEntryVec fuse_entries = g.GetAttr("fused_entry"); + + std::string target = g.GetAttr("target"); + std::string target_host; + + if (g.HasAttr("target_host")) { + target_host = g.GetAttr("target_host"); + } + // Specially handle assign. + const nnvm::Op* assign_op = nnvm::Op::Get("_assign"); + + // Start lowering. + Array func_list; + std::unordered_set func_set; + const IndexedGraph& idx = g.indexed_graph(); + + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) continue; + int root_id = group_vec[nid]; + if (static_cast(nid) != root_id) continue; + int master = master_vec[root_id]; + FuseEntry& fe = fuse_entries[root_id]; + + const IndexedGraph& subidx = fe.subgraph.indexed_graph(); + CHECK_EQ(subidx.input_nodes().size(), fe.imap.size()); + CHECK_EQ(subidx.input_nodes().size(), fe.input_info.size()); + + Array inputs; + for (uint32_t sub_input_id : subidx.input_nodes()) { + auto it = fe.input_info.find(subidx[sub_input_id].source); + inputs.push_back(it->second); + } + // Find master idx in the subgraph. 
+ int sub_master_idx = 0; + for (uint32_t i = 0; i < subidx.num_nodes(); i++) { + if (subidx[i].source->op() == idx[master].source->op()) { + sub_master_idx = i; + break; + } + } + fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx); + for (LoweredFunc f : fe.compiled_func->funcs) { + if (!func_set.count(f.get())) { + func_set.insert(f.get()); + func_list.push_back(f); + } + } + } + + const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op"); + + std::unordered_map old_new; + for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { + const auto& inode = idx[nid]; + if (inode.source->is_variable()) { + // Only copy name since that is sufficient. + nnvm::NodePtr np = nnvm::Node::Create(); + np->attrs.name = inode.source->attrs.name; + old_new[nid] = np; + continue; + } + int root_id = group_vec[nid]; + if (static_cast(nid) != root_id) continue; + + // Handle normal op + FuseEntry& fe = fuse_entries[root_id]; + const IndexedGraph& subidx = fe.subgraph.indexed_graph(); + nnvm::NodePtr np = nnvm::Node::Create(); + np->attrs.op = tvm_op; + np->attrs.name = inode.source->attrs.name; + TVMOpParam param; + param.func_name = fe.compiled_func->func_name; + param.num_inputs = static_cast(fe.imap.size()); + param.num_outputs = static_cast(fe.subgraph.outputs.size()); + param.flatten_data = fe.flatten_data; + param.UpdateDict(&(np->attrs.dict)); + np->attrs.parsed = std::move(param); + + for (uint32_t sub_input_id : subidx.input_nodes()) { + // Need to make sure subgraph input order is consistent to the order of + // the graph input. 
+ auto rit = fe.reverse_imap.find(subidx[sub_input_id].source); + CHECK(rit != fe.reverse_imap.end()); + const IndexedGraph::NodeEntry& e = rit->second; + auto it = old_new.find(e.node_id); + CHECK(it != old_new.end()) + << "cannot find node_id=" << e.node_id; + np->inputs.emplace_back( + nnvm::NodeEntry{it->second, e.index, e.version}); + } + for (const uint32_t node_id : inode.control_deps) { + auto it = old_new.find(node_id); + CHECK(it != old_new.end()); + np->control_deps.emplace_back(it->second); + } + old_new[nid] = np; + } + nnvm::Graph ret; + for (const auto& e : idx.outputs()) { + auto it = old_new.find(group_vec[e.node_id]); + CHECK(it != old_new.end()) + << "cannot find node_id=" << e.node_id; + ret.outputs.emplace_back( + nnvm::NodeEntry{it->second, e.index, e.version}); + } + + // Reference counter of each op node. + // For now, always store result when an op is referred more than once. + std::vector ref_count = GetNodeRefCounts(idx); + for (const auto& e : idx.outputs()) { + // This line will realize all the outputs. + ref_count[e.node_id] += 1; + } + + const IndexedGraph& new_idx = ret.indexed_graph(); + + // Handling assign: + // + // assign is a special operator that mutates the variable. + // Currently assign is implemented as output = copy(input[1]) + // Then we run DecorageMemoryPlan to force + // output.storage = input[0].storage + // + std::vector assign_flag(new_idx.num_nodes(), 0); + ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape()); + DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries()); + std::vector new_dltype_vec(new_idx.num_node_entries()); + + for (const auto& kv : old_new) { + uint32_t nid = kv.first; + const auto& inode = idx[nid]; + uint32_t new_nid = new_idx.node_id(kv.second.get()); + if (inode.source->op() == assign_op) { + // Check if rhs of assign can be computed inplace. + // If yes, we can simply set that memory to be assign target + // and change assign to nop. 
+ const IndexedGraph::NodeEntry& rhs = inode.inputs[1]; + if (ref_count[rhs.node_id] <= 1 && + !(idx[rhs.node_id].source->is_variable()) && + pattern_vec[group_vec[rhs.node_id]] <= kBroadcast) { + assign_flag[new_nid] = 2; + TVMOpParam& param = dmlc::get(kv.second->attrs.parsed); + param.func_name = "__nop"; + param.UpdateDict(&(kv.second->attrs.dict)); + } else { + assign_flag[new_nid] = 1; + } + } + for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { + uint32_t new_eid = new_idx.entry_id(new_idx.node_id(kv.second.get()), i); + uint32_t old_eid = idx.entry_id(nid, i); + new_shape_vec[new_eid] = shape_vec[old_eid]; + new_dtype_vec[new_eid] = dtype_vec[old_eid]; + new_dltype_vec[new_eid] = tvm::runtime::TVMType2String( + GetDLType(dtype_vec[old_eid])); + } + } + ret.attrs["shape"] = std::make_shared(std::move(new_shape_vec)); + ret.attrs["dtype"] = std::make_shared(std::move(new_dtype_vec)); + ret.attrs["dltype"] = std::make_shared(std::move(new_dltype_vec)); + + // Setup module + static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target"); + tvm::runtime::Module module = fbuild(func_list, target, target_host); + ret.attrs["module"] = std::make_shared(std::move(module)); + ret = nnvm::ApplyPass(ret, "PlanMemory"); + ret = DecorateMemoryPlan(ret, assign_flag); + return ret; +} + +NNVM_REGISTER_PASS(GraphCompile) + .set_body(GraphCompile) + .depend_graph_attr("shape") + .depend_graph_attr("dtype") + .depend_graph_attr("fused_entry") + .depend_graph_attr("group_root") + .depend_graph_attr("pattern") + .depend_graph_attr("group_master"); + +} // namespace compiler +} // namespace nnvm diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index 247224aa9bfe..52a8ae44f8ee 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -3,18 +3,19 @@ * \file graph_fuse.cc * \brief Fuse the operators together. 
*/ +#include +#include #include +#include #include #include -#include -#include #include #include -#include -#include +#include #include -#include -#include "./compile_engine.h" +#include + +#include "./graph_fuse.h" #include "./graph_runtime.h" #include "./pattern_util.h" @@ -22,28 +23,10 @@ namespace nnvm { namespace compiler { using namespace tvm; -// The single fuse rule. -enum class FuseRule { - kUknown, - kFuseToMaster, - kRealize -}; - -/*! - * \brief Get DLDataType from dtype flag. - * - * \param type_flag The data type flag - * \return corresponding DLDataType - */ -DLDataType GetDLType(int type_flag) { - return Type2TVMType(GetTVMType(type_flag)); -} - // Partition the graph into segments // Each segment will be compiled into one operator. -// Need also mark the property of the segment. -nnvm::Graph GraphFusePartition(nnvm::Graph g) { - // setup ref counter +// Also mark the property of the segment. +nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { const IndexedGraph& idx = g.indexed_graph(); int opt_level = 2; if (g.attrs.count("opt_level") != 0) { @@ -61,7 +44,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { ref_count[e.node_id] += 1; } // Pattern for the subgraph - std::vector pattern_vec(idx.num_nodes(), kOpaque); + PatternVec pattern_vec(idx.num_nodes(), kOpaque); // Whether node can be fused to parent. std::vector fuse_vec(idx.num_nodes(), FuseRule::kUknown); // Master node id of fusion segment. @@ -77,7 +60,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { TOpPattern pt = op_pattern.get(inode.source->op(), kOpaque); if (pt <= kBroadcast) { - // Try to check if we can fuse to the master. + // Check if we can fuse to the master. int chosen_master = -1; bool ewise = inode.source->num_outputs() == 1; for (const auto& e : inode.inputs) { @@ -108,7 +91,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { pt = ewise ? 
kElemWise : kBroadcast; } } else if (pt == kInjective || pt == kCommReduce) { - // fuse to the comm reduce or injective + // Fuse to the comm reduce or injective for (const auto& e : inode.inputs) { if (fuse_vec[e.node_id] == FuseRule::kUknown) { TOpPattern ipt = pattern_vec[e.node_id]; @@ -123,7 +106,7 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { master_vec[nid] = nid; } } else { - // realize + // Realize master_vec[nid] = nid; for (const auto& e : inode.inputs) { if (fuse_vec[e.node_id] == FuseRule::kUknown) { @@ -144,15 +127,15 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { } } - // point to the group root id of each node - std::vector group_vec(idx.num_nodes(), -1); + // Point to the group root id of each node. + GroupVec group_vec(idx.num_nodes(), -1); for (uint32_t i = idx.num_nodes(); i != 0; --i) { uint32_t nid = i - 1; const auto& inode = idx[nid]; if (group_vec[nid] == -1) { group_vec[nid] = nid; } - // propagate the group id. + // Propagate the group id. for (const auto& e : inode.inputs) { if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) { CHECK(group_vec[e.node_id] == -1|| @@ -264,121 +247,43 @@ nnvm::Graph GraphFusePartition(nnvm::Graph g) { return g; } - -NNVM_REGISTER_PASS(GraphFusePartition) -.set_body(GraphFusePartition) +NNVM_REGISTER_PASS(GraphFindFusibleGroups) +.set_body(GraphFindFusibleGroups) .depend_graph_attr("shape") .depend_graph_attr("dtype"); - -// Decorate the result of PlanMemory -// This function does two things: -// - Give separate memory to each variable -// - Tie the memory of output/lhs in assign node properly -// so the execution of assign can have side effect. 
-nnvm::Graph DecorateMemoryPlan( - nnvm::Graph g, - const std::vector& assign_flag) { - // setup ref counter - const IndexedGraph& idx = g.indexed_graph(); - StorageVector storage_vec = g.MoveCopyAttr("storage_id"); - g.attrs.erase("storage_allocated_bytes"); - g.attrs.erase("storage_inplace_index"); - size_t num_not_allocated = g.MoveCopyAttr( - "storage_num_not_allocated"); - CHECK_EQ(num_not_allocated, 0U) - << "Can only build inference graph with all statically allocated memory"; - - // reassign variable id so that they are different. - int max_id = 0; - for (size_t i = 0; i < storage_vec.size(); ++i) { - max_id = std::max(storage_vec[i] + 1, max_id); - } - for (uint32_t nid : idx.input_nodes()) { - storage_vec[idx.entry_id(nid, 0)] = max_id++; - } - // tie up the assign node storage properly - for (uint32_t nid = 0 ; nid < idx.num_nodes(); ++nid) { - if (assign_flag[nid] == 0) continue; - const auto& inode = idx[nid]; - int var_storage_id = storage_vec[idx.entry_id(inode.inputs[0])]; - storage_vec[idx.entry_id(nid, 0)] = var_storage_id; - - if (assign_flag[nid] == 2) { - storage_vec[idx.entry_id(inode.inputs[1])] = var_storage_id; - } - } - g.attrs["storage_id"] = std::make_shared(std::move(storage_vec)); - return g; -} - -struct INodeEntryHash { - size_t operator()(const IndexedGraph::NodeEntry& e) const { - return e.node_id; - } -}; - -struct INodeEntryEqual { - size_t operator()(const IndexedGraph::NodeEntry& a, - const IndexedGraph::NodeEntry& b) const { - return a.node_id == b.node_id && a.index == b.index; - } -}; - -// Auxiliary data structure for representing fused op. -struct FuseEntry { - // subgraph of the fragement - Graph subgraph; - // The input map - std::unordered_map imap; - // reverse map to the old input entry - std::unordered_map reverse_imap; - // TVM Placeholder for inputs - std::unordered_map input_info; - // Whether we can flatten data - bool flatten_data; - // The corresponding function. 
- GraphFunc compiled_func; -}; - // Fuse the partitioned graph into segments. -// Create a new graph with fused noded. -// Also inheritate attribute shape, dltype from previous graph. -nnvm::Graph GraphFuseCompile(nnvm::Graph g) { - // setup ref counter +// Create a new graph with fused nodes. +// Also inherit attribute shape, dltype from the previous graph. +nnvm::Graph GraphFuse(nnvm::Graph g) { + CHECK(g.HasAttr("group_root") && g.HasAttr("pattern")) + << "GraphFindFusibleGroups pass hasn't been applied yet."; + const IndexedGraph& idx = g.indexed_graph(); // Get attributes from the graph const ShapeVector& shape_vec = g.GetAttr("shape"); const DTypeVector& dtype_vec = g.GetAttr("dtype"); - const std::vector& group_vec = g.GetAttr >("group_root"); - const std::vector& master_vec = g.GetAttr >("group_master"); - const std::vector& pattern_vec = - g.GetAttr >("pattern"); - std::string target = g.GetAttr("target"); - std::string target_host; + const GroupVec& group_vec = g.GetAttr("group_root"); + const PatternVec& pattern_vec = g.GetAttr("pattern"); - if (g.HasAttr("target_host")) { - target_host = g.GetAttr("target_host"); - } - // specially handle assign + // Specially handle assign op. const nnvm::Op* assign_op = nnvm::Op::Get("_assign"); - std::vector fuse_vec(idx.num_nodes()); - // setup inputs and placeholder. + FuseEntryVec fuse_entries(idx.num_nodes()); + // Setup inputs and placeholder. 
for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { const auto& inode = idx[nid]; if (inode.source->is_variable()) continue; CHECK_GE(group_vec[nid], 0); int root_id = group_vec[nid]; - FuseEntry& fe = fuse_vec[root_id]; + FuseEntry& fe = fuse_entries[root_id]; fe.flatten_data = (pattern_vec[root_id] == kElemWise || inode.source->op() == assign_op); for (const auto& e : inode.inputs) { if (group_vec[e.node_id] != root_id && fe.imap.count(e) == 0) { Array shape; if (fe.flatten_data) { - // elementwise support flatten + // Elementwise support flatten int64_t prod = 1; for (int64_t x : shape_vec[idx.entry_id(e)]) { prod *= x; @@ -403,17 +308,18 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) { } } } + // Setup the Subgraph std::vector subgraph_vec(idx.num_node_entries()); for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { const auto& inode = idx[nid]; if (inode.source->is_variable()) continue; int root_id = group_vec[nid]; - FuseEntry& fe = fuse_vec[root_id]; - // copy and create subgraph node. + FuseEntry& fe = fuse_entries[root_id]; + // Create a subgraph node. NodePtr gnode = Node::Create(); gnode->attrs = inode.source->attrs; - // input loading + // Set input entries for the subgraph node. 
for (const auto& e : inode.inputs) { if (group_vec[e.node_id] != root_id) { auto it = fe.imap.find(e); @@ -426,7 +332,7 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) { gnode->inputs.push_back(ne); } } - // schedule on root node, and use master's schedule + // Schedule on the root node and use the master's schedule if (static_cast(nid) != root_id) { for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) { uint32_t eid = idx.entry_id(nid, index); @@ -438,165 +344,18 @@ nnvm::Graph GraphFuseCompile(nnvm::Graph g) { } } } - // Start lowering - Array func_list; - std::unordered_set func_set; - - for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { - const auto& inode = idx[nid]; - if (inode.source->is_variable()) continue; - int root_id = group_vec[nid]; - if (static_cast(nid) != root_id) continue; - int master = master_vec[root_id]; - FuseEntry& fe = fuse_vec[root_id]; - - const IndexedGraph& subidx = fe.subgraph.indexed_graph(); - CHECK_EQ(subidx.input_nodes().size(), fe.imap.size()); - CHECK_EQ(subidx.input_nodes().size(), fe.input_info.size()); - - Array inputs; - for (uint32_t sub_input_id : subidx.input_nodes()) { - auto it = fe.input_info.find(subidx[sub_input_id].source); - inputs.push_back(it->second); - } - // find master idx in subgraph - int sub_master_idx = 0; - for (uint32_t i = 0; i < subidx.num_nodes(); i++) { - if (subidx[i].source->op() == idx[master].source->op()) { - sub_master_idx = i; - break; - } - } - fe.compiled_func = GraphLower(fe.subgraph, inputs, target, sub_master_idx); - for (LoweredFunc f : fe.compiled_func->funcs) { - if (!func_set.count(f.get())) { - func_set.insert(f.get()); - func_list.push_back(f); - } - } - } - - const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op"); - - std::unordered_map old_new; - for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) { - const auto& inode = idx[nid]; - if (inode.source->is_variable()) { - // only copy over name since that is sufficient. 
- nnvm::NodePtr np = nnvm::Node::Create(); - np->attrs.name = inode.source->attrs.name; - old_new[nid] = np; - continue; - } - int root_id = group_vec[nid]; - if (static_cast(nid) != root_id) continue; - - // Handle normal op - FuseEntry& fe = fuse_vec[root_id]; - const IndexedGraph& subidx = fe.subgraph.indexed_graph(); - nnvm::NodePtr np = nnvm::Node::Create(); - np->attrs.op = tvm_op; - np->attrs.name = inode.source->attrs.name; - TVMOpParam param; - param.func_name = fe.compiled_func->func_name; - param.num_inputs = static_cast(fe.imap.size()); - param.num_outputs = static_cast(fe.subgraph.outputs.size()); - param.flatten_data = fe.flatten_data; - param.UpdateDict(&(np->attrs.dict)); - np->attrs.parsed = std::move(param); - - for (uint32_t sub_input_id : subidx.input_nodes()) { - // Need to make sure subgraph input order meets order of the graph input - auto rit = fe.reverse_imap.find(subidx[sub_input_id].source); - CHECK(rit != fe.reverse_imap.end()); - const IndexedGraph::NodeEntry& e = rit->second; - auto it = old_new.find(e.node_id); - CHECK(it != old_new.end()) - << "cannot find node_id=" << e.node_id; - np->inputs.emplace_back( - nnvm::NodeEntry{it->second, e.index, e.version}); - } - for (const uint32_t node_id : inode.control_deps) { - auto it = old_new.find(node_id); - CHECK(it != old_new.end()); - np->control_deps.emplace_back(it->second); - } - old_new[nid] = np; - } - nnvm::Graph ret; - for (const auto& e : idx.outputs()) { - auto it = old_new.find(group_vec[e.node_id]); - CHECK(it != old_new.end()) - << "cannot find node_id=" << e.node_id; - ret.outputs.emplace_back( - nnvm::NodeEntry{it->second, e.index, e.version}); - } - - // Reference counter of each op node - // For now, always store result when an op is referred more than once. 
- std::vector ref_count = GetNodeRefCounts(idx); - for (const auto& e : idx.outputs()) { - // this line will realize all the outputs - ref_count[e.node_id] += 1; - } - - const IndexedGraph& new_idx = ret.indexed_graph(); - - // Handling assign: - // - // assign is a special operator that mutates the variable. - // Currently assign is implemented as output = copy(input[1]) - // Then we run DecorageMemoryPlan to force - // output.storage = input[0].storage - // - std::vector assign_flag(new_idx.num_nodes(), 0); - ShapeVector new_shape_vec = ShapeVector(new_idx.num_node_entries(), TShape()); - DTypeVector new_dtype_vec = DTypeVector(new_idx.num_node_entries()); - std::vector new_dltype_vec(new_idx.num_node_entries()); - - for (const auto& kv : old_new) { - uint32_t nid = kv.first; - const auto& inode = idx[nid]; - uint32_t new_nid = new_idx.node_id(kv.second.get()); - if (inode.source->op() == assign_op) { - // Check if rhs of assign can be comute inplace - // If yes, we can simply set that memory to be assign target - // and change assign to nop - const IndexedGraph::NodeEntry& rhs = inode.inputs[1]; - if (ref_count[rhs.node_id] <= 1 && - !(idx[rhs.node_id].source->is_variable()) && - pattern_vec[group_vec[rhs.node_id]] <= kBroadcast) { - assign_flag[new_nid] = 2; - TVMOpParam& param = dmlc::get(kv.second->attrs.parsed); - param.func_name = "__nop"; - param.UpdateDict(&(kv.second->attrs.dict)); - } else { - assign_flag[new_nid] = 1; - } - } - for (uint32_t i = 0; i < inode.source->num_outputs(); ++i) { - uint32_t new_eid = new_idx.entry_id(new_idx.node_id(kv.second.get()), i); - uint32_t old_eid = idx.entry_id(nid, i); - new_shape_vec[new_eid] = shape_vec[old_eid]; - new_dtype_vec[new_eid] = dtype_vec[old_eid]; - new_dltype_vec[new_eid] = tvm::runtime::TVMType2String( - GetDLType(dtype_vec[old_eid])); - } - } - ret.attrs["shape"] = std::make_shared(std::move(new_shape_vec)); - ret.attrs["dtype"] = std::make_shared(std::move(new_dtype_vec)); - ret.attrs["dltype"] = 
std::make_shared(std::move(new_dltype_vec)); - // Setup module - static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target"); - tvm::runtime::Module module = fbuild(func_list, target, target_host); - ret.attrs["module"] = std::make_shared(std::move(module)); - ret = nnvm::ApplyPass(ret, "PlanMemory"); - ret = DecorateMemoryPlan(ret, assign_flag); - return ret; + g.attrs["fused_entry"] = std::make_shared(std::move(fuse_entries)); + return g; } -NNVM_REGISTER_PASS(GraphFuseCompile) -.set_body(GraphFuseCompile); +NNVM_REGISTER_PASS(GraphFuse) + .set_body(GraphFuse) + .set_change_graph(true) + .provide_graph_attr("fused_entry") + .depend_graph_attr("shape") + .depend_graph_attr("dtype") + .depend_graph_attr("group_root") + .depend_graph_attr("group_master"); } // namespace compiler } // namespace nnvm diff --git a/nnvm/src/compiler/graph_fuse.h b/nnvm/src/compiler/graph_fuse.h new file mode 100644 index 000000000000..6faac7d3e162 --- /dev/null +++ b/nnvm/src/compiler/graph_fuse.h @@ -0,0 +1,80 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file graph_fuse.h + * \brief Definition of structs used by graph fusion +*/ +#ifndef NNVM_COMPILER_GRAPH_FUSE_H_ +#define NNVM_COMPILER_GRAPH_FUSE_H_ + +#include +#include + +#include "compile_engine.h" + +namespace nnvm { +namespace compiler { + +// The single fuse rule. +enum class FuseRule { + kUknown, + kFuseToMaster, + kRealize +}; + +/*! + * \brief Get DLDataType from dtype flag. 
+ * + * \param type_flag The data type flag + * \return corresponding DLDataType + */ +inline DLDataType GetDLType(int type_flag) { + return tvm::Type2TVMType(GetTVMType(type_flag)); +} + +struct INodeEntryHash { + size_t operator()(const IndexedGraph::NodeEntry& e) const { + return e.node_id; + } +}; + +struct INodeEntryEqual { + size_t operator()(const IndexedGraph::NodeEntry &a, + const IndexedGraph::NodeEntry &b) const { + return a.node_id == b.node_id && a.index == b.index; + } +}; + +// Auxiliary data structure for representing fused op. +struct FuseEntry { + // Subgraph of the fragment + Graph subgraph; + // The input map + std::unordered_map + imap; + // Reverse map to the old input entry + std::unordered_map reverse_imap; + // TVM Placeholder for inputs + std::unordered_map input_info; + // Whether we can flatten data + bool flatten_data; + // The corresponding function. + GraphFunc compiled_func; +}; + +// GroupVec stores the root node ids of the fused nodes. +using GroupVec = std::vector; + +// MasterVec stores master node ids of fused groups. +using MasterVec = std::vector; + +// FuseEntryVec stores fused entries. +using FuseEntryVec = std::vector; + +// PatternVec stores operator patterns. 
+using PatternVec = std::vector; + +} // namespace compiler +} // namespace nnvm + +#endif // NNVM_COMPILER_GRAPH_FUSE_H_ From 85483c3734ce243a97212b6399229e0fa09f4968 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Thu, 9 Aug 2018 11:38:09 +0900 Subject: [PATCH 06/77] [TOPI] add injective scheduler for HLS backends (#1553) * [TOPI] add injective scheduler for HLS backends * Introduced PrintBinaryExpr --- src/codegen/build_module.cc | 2 ++ src/codegen/codegen_vhls.cc | 41 ++++++++++++++++++++++++ src/codegen/codegen_vhls.h | 2 ++ src/codegen/intrin_rule_vhls.cc | 15 +++++++++ topi/python/topi/__init__.py | 1 + topi/python/topi/hls/__init__.py | 5 +++ topi/python/topi/hls/injective.py | 31 ++++++++++++++++++ topi/tests/python/test_topi_broadcast.py | 2 ++ topi/tests/python/test_topi_clip.py | 2 +- topi/tests/python/test_topi_math.py | 2 +- topi/tests/python/test_topi_relu.py | 2 +- topi/tests/python/test_topi_transform.py | 18 +++++------ 12 files changed, 111 insertions(+), 12 deletions(-) create mode 100644 topi/python/topi/hls/__init__.py create mode 100644 topi/python/topi/hls/injective.py diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index 45d10617de05..0cb0ec3cc4be 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -91,9 +91,11 @@ Target CreateTarget(const std::string& target_name, } else if (target_name == "sdaccel") { t->device_type = kDLOpenCL; t->keys_array.push_back(ir::StringImm::make("sdaccel")); + t->keys_array.push_back(ir::StringImm::make("hls")); } else if (target_name == "aocl") { t->device_type = kDLAOCL; t->keys_array.push_back(ir::StringImm::make("aocl")); + t->keys_array.push_back(ir::StringImm::make("hls")); } else if (target_name == "opengl") { t->device_type = kOpenGL; t->keys_array.push_back(ir::StringImm::make("opengl")); diff --git a/src/codegen/codegen_vhls.cc b/src/codegen/codegen_vhls.cc index 0795670bded6..b9f9f7505978 100644 --- a/src/codegen/codegen_vhls.cc +++ 
b/src/codegen/codegen_vhls.cc @@ -16,6 +16,7 @@ void CodeGenVivadoHLS::Init(bool output_ssa) { CodeGenC::Init(output_ssa); this->stream << "#include \n\n"; + this->stream << "#include \n\n"; } void CodeGenVivadoHLS::PrintType(Type t, std::ostream& os) { @@ -67,6 +68,46 @@ void CodeGenVivadoHLS::PreFunctionBody(LoweredFunc f) { this->stream << "#pragma HLS INTERFACE s_axilite port=return bundle=control\n\n"; } +template +inline void PrintBinaryExpr(const T* op, + const char *opstr, + std::ostream& os, // NOLINT(*) + CodeGenVivadoHLS* p) { + os << opstr << '('; + p->PrintExpr(op->a, os); + os << ", "; + p->PrintExpr(op->b, os); + os << ')'; +} + +void CodeGenVivadoHLS::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) + const char *opstr = "std::min"; + if (op->type.is_float()) { + switch (op->type.bits()) { + case 32: + opstr = "fminf"; break; + case 64: + opstr = "fmin"; break; + } + } + + PrintBinaryExpr(op, opstr, os, this); +} + +void CodeGenVivadoHLS::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) + const char *opstr = "std::max"; + if (op->type.is_float()) { + switch (op->type.bits()) { + case 32: + opstr = "fmaxf"; break; + case 64: + opstr = "fmax"; break; + } + } + + PrintBinaryExpr(op, opstr, os, this); +} + runtime::Module BuildSDAccel(Array funcs, std::string target_str) { using tvm::runtime::Registry; diff --git a/src/codegen/codegen_vhls.h b/src/codegen/codegen_vhls.h index 02a0da2dcfa5..bcb7d6f49d8c 100644 --- a/src/codegen/codegen_vhls.h +++ b/src/codegen/codegen_vhls.h @@ -20,6 +20,8 @@ class CodeGenVivadoHLS final : public CodeGenC { void PrintType(Type t, std::ostream& os); void AddFunction(LoweredFunc f); void PreFunctionBody(LoweredFunc f); + void VisitExpr_(const Min *op, std::ostream& os); + void VisitExpr_(const Max *op, std::ostream& os); }; } // namespace codegen diff --git a/src/codegen/intrin_rule_vhls.cc b/src/codegen/intrin_rule_vhls.cc index 32869eec1db5..b360142cd985 100644 --- a/src/codegen/intrin_rule_vhls.cc 
+++ b/src/codegen/intrin_rule_vhls.cc @@ -9,6 +9,21 @@ namespace tvm { namespace codegen { namespace intrin { +TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.floor") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.ceil") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.trunc") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.fabs") +.set_body(DispatchExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.round") +.set_body(DispatchExtern); + TVM_REGISTER_GLOBAL("tvm.intrin.rule.sdaccel.exp") .set_body(DispatchExtern); diff --git a/topi/python/topi/__init__.py b/topi/python/topi/__init__.py index 349f805cc7f2..3ef59913e07b 100644 --- a/topi/python/topi/__init__.py +++ b/topi/python/topi/__init__.py @@ -32,6 +32,7 @@ from . import rocm from . import vision from . import image +from . import hls # not import testing by default # because testing can have extra deps that are not necessary # we can import them from test cases explicitly diff --git a/topi/python/topi/hls/__init__.py b/topi/python/topi/hls/__init__.py new file mode 100644 index 000000000000..69b80514ff56 --- /dev/null +++ b/topi/python/topi/hls/__init__.py @@ -0,0 +1,5 @@ +# pylint: disable=redefined-builtin, wildcard-import +"""HLS specific declaration and schedules.""" +from __future__ import absolute_import as _abs + +from .injective import schedule_injective, schedule_elemwise, schedule_broadcast diff --git a/topi/python/topi/hls/injective.py b/topi/python/topi/hls/injective.py new file mode 100644 index 000000000000..21dfeb95df1b --- /dev/null +++ b/topi/python/topi/hls/injective.py @@ -0,0 +1,31 @@ +# pylint: disable=invalid-name, unused-variable, +"""Schedule for composition of injective operator""" +import tvm +from .. import generic + +@generic.schedule_injective.register(["hls"]) +def schedule_injective(outs): + """Schedule for injective op. 
+ + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + for out in outs: + fused = s[out].fuse(*s[out].op.axis) + px, x = s[out].split(fused, nparts=1) + s[out].bind(px, tvm.thread_axis("pipeline")) + return s + +schedule_elemwise = schedule_injective +schedule_broadcast = schedule_injective diff --git a/topi/tests/python/test_topi_broadcast.py b/topi/tests/python/test_topi_broadcast.py index c5720050e538..f888033b3914 100644 --- a/topi/tests/python/test_topi_broadcast.py +++ b/topi/tests/python/test_topi_broadcast.py @@ -31,6 +31,7 @@ def check_device(device): check_device("metal") check_device("rocm") check_device("nvptx") + check_device("sdaccel") def verify_broadcast_binary_ele(lhs_shape, rhs_shape, @@ -87,6 +88,7 @@ def check_device(device): check_device("metal") check_device("rocm") check_device("nvptx") + check_device("sdaccel") def test_broadcast_to(): verify_broadcast_to_ele((1,), (10,), topi.broadcast_to) diff --git a/topi/tests/python/test_topi_clip.py b/topi/tests/python/test_topi_clip.py index 041565433bcc..ffc89aeb9bc3 100644 --- a/topi/tests/python/test_topi_clip.py +++ b/topi/tests/python/test_topi_clip.py @@ -34,7 +34,7 @@ def check_device(device): f(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['llvm', 'opencl']: + for device in ['llvm', 'opencl', 'sdaccel']: check_device(device) def test_clip(): diff --git a/topi/tests/python/test_topi_math.py b/topi/tests/python/test_topi_math.py index 4190c8e1d213..8d82dbe5bf82 100644 --- a/topi/tests/python/test_topi_math.py +++ b/topi/tests/python/test_topi_math.py @@ -39,7 +39,7 @@ def check_device(device): foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, 
rtol=1e-5, atol=1e-5) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx']: + for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'llvm', 'nvptx', 'sdaccel']: check_device(device) diff --git a/topi/tests/python/test_topi_relu.py b/topi/tests/python/test_topi_relu.py index 9feeabbf122f..7c75a9b08975 100644 --- a/topi/tests/python/test_topi_relu.py +++ b/topi/tests/python/test_topi_relu.py @@ -27,7 +27,7 @@ def check_device(device): foo(a, b) np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx']: + for device in ['cuda', 'opencl', 'metal', 'rocm', 'vulkan', 'nvptx', 'sdaccel']: check_device(device) diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index b62f6b43c3de..4788d758cf45 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -22,7 +22,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) @@ -45,7 +45,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) @@ -68,7 +68,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) @@ -96,7 +96,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - 
for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) def verify_concatenate(shapes, axis): @@ -121,7 +121,7 @@ def check_device(device): foo(*(data_nds + [out_nd])) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) @@ -146,7 +146,7 @@ def check_device(device): for out_nd, out_npy in zip(out_nds, out_npys): np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan"]: + for device in ["llvm", "nvptx", "cuda", "opencl", "metal", "rocm", "vulkan", "sdaccel"]: check_device(device) @@ -204,7 +204,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "cuda", "opencl"]: + for device in ["llvm", "cuda", "opencl", "sdaccel"]: check_device(device) def verify_take(src_shape, indices_src, axis=None): @@ -243,7 +243,7 @@ def check_device(device): foo(data_nd, indices_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npys) - for device in ["llvm", "opencl"]: + for device in ["llvm", "opencl", "sdaccel"]: check_device(device) def verify_strided_slice(in_shape, begin, end, stride=None): @@ -270,7 +270,7 @@ def check_device(device): foo(data_nd, out_nd) np.testing.assert_allclose(out_nd.asnumpy(), out_npy) - for device in ["llvm", "opencl"]: + for device in ["llvm", "opencl", "sdaccel"]: check_device(device) def test_strided_slice(): From 41d4dd6ef16445c4ea7c7835f139d9f30398f027 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 9 Aug 2018 11:40:28 +0800 Subject: [PATCH 07/77] Use int for int8x4 due to performance overhead of char4 (#1569) * Use int for int8x4 due to performance overhead of char4 
* Add a comment about using int * Remove invalid test --- src/codegen/codegen_cuda.cc | 6 +++++- tests/python/unittest/test_codegen_cuda.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/codegen/codegen_cuda.cc b/src/codegen/codegen_cuda.cc index 70ab807c9509..44c02830d0fc 100644 --- a/src/codegen/codegen_cuda.cc +++ b/src/codegen/codegen_cuda.cc @@ -90,7 +90,11 @@ void CodeGenCUDA::PrintType(Type t, std::ostream& os) { // NOLINT(*) if (t.lanes() == 4) { // directly 4 8 bit int in integer. enable_int8_ = true; - os << "char4"; return; + + // We use int for int8x4 instead of char4 because using char4 is + // likely to produce extra instructions to pack four int8 elements + // into 32-bit data. + os << "int"; return; } else if (t.lanes() == 8) { enable_int8_ = true; os << "int2"; return; diff --git a/tests/python/unittest/test_codegen_cuda.py b/tests/python/unittest/test_codegen_cuda.py index b0ed92a30281..0b54863d1aab 100644 --- a/tests/python/unittest/test_codegen_cuda.py +++ b/tests/python/unittest/test_codegen_cuda.py @@ -31,7 +31,6 @@ def check_cuda(dtype, n, lanes): check_cuda("float32", 64, 2) check_cuda("float16", 64, 2) - check_cuda("int8", 64, 4) def test_cuda_multiply_add(): From 6d4cf44855108f0bf0465e45182199a9a224328b Mon Sep 17 00:00:00 2001 From: Siva Date: Thu, 9 Aug 2018 23:36:03 +0530 Subject: [PATCH 08/77] [DOCS] Neural network Deployment Guide with System Module Mode #1523 (#1533) --- docs/deploy/nnvm.md | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/deploy/nnvm.md b/docs/deploy/nnvm.md index aa6c39facd1f..e4ce14528b53 100644 --- a/docs/deploy/nnvm.md +++ b/docs/deploy/nnvm.md @@ -116,3 +116,60 @@ int main() return 0; } ``` + +## Deploy as System Module +C++ additionally supports deployment as a system module. +This process needs a few additional options, given below, for the NNVM build. 
+ +- For target llvm append --system-lib as ```target=llvm --system-lib``` +- For a GPU build (or non llvm) the additional option should be given to target_host as ```target_host=llvm --system-lib``` + +Module export requires an additional option so that the library is saved without being compiled: ```lib.export_library (path, fcompile=False)``` + +The output of the above API is a tar compressed file containing an object file ```(lib.o)``` and a cpp source file ```(devc.cc)``` which embeds the device blob. These two files should be compiled along with other files or objects while building the C++ application. +Please refer to [Makefile](https://github.com/dmlc/tvm/tree/master/apps/howto_deploy/Makefile#L32) for a reference. + +The C++ code to load this system module requires the change below. + +```cpp + // tvm module for compiled functions + tvm::runtime::Module mod_syslib = (*tvm::runtime::Registry::Get("module._GetSystemLib"))(); +``` + +Depending on the build environment, the system object and the device blob source should be included in the final executable. An example with a bazel build is given below. +```bash +cc_library( + name = "host_module", + srcs = ["lib.o"], + alwayslink=1 +) + +cc_library( + name = "device_module", + srcs = ["devc.cc"], + alwayslink=1 +) + +cc_library( + name = "tvm_runtime", + srcs = ["libtvm_runtime_pack.cc"], +) + +cc_binary( + name = "bazel_deploy", + srcs = ["cpp_deploy.cc"], + deps = [ + ":tvm_runtime", ":host_module", ":device_module" + ], + linkopts = [ "-lpthread -ldl" ] +) + +``` + +This build directive creates +- new library ```host_module``` out of ```lib.o``` +- new library ```device_module``` out of ```devc.cc``` + +These intermediate modules can be used as dependencies of the final deploy application. + +In bazel, ```alwayslink=1``` enforces embedding the entire lib into the application (even though it doesn't call any API from this module). 
From 672147c8e183de76a7fe34fa2aeace7c8829cabd Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 9 Aug 2018 11:17:37 -0700 Subject: [PATCH 09/77] add conv2d transpose and fix bugs (#1566) --- nnvm/include/nnvm/top/nn.h | 5 + nnvm/python/nnvm/testing/dcgan.py | 19 ++- nnvm/python/nnvm/top/nn.py | 6 +- nnvm/src/top/nn/convolution.cc | 2 +- .../python/frontend/mxnet/model_zoo/dcgan.py | 19 ++- python/tvm/autotvm/measure/measure_methods.py | 4 +- python/tvm/autotvm/task/dispatcher.py | 31 +++- python/tvm/autotvm/task/nnvm_integration.py | 13 +- python/tvm/autotvm/tophub.py | 18 ++- python/tvm/autotvm/tuner/callback.py | 4 +- topi/python/topi/arm_cpu/__init__.py | 1 + topi/python/topi/arm_cpu/conv2d.py | 24 +-- topi/python/topi/arm_cpu/conv2d_transpose.py | 144 ++++++++++++++++++ topi/python/topi/arm_cpu/depthwise_conv2d.py | 17 ++- topi/python/topi/nn/conv2d_transpose.py | 8 +- topi/tests/python/test_topi_conv2d.py | 2 +- .../python/test_topi_conv2d_transpose_nchw.py | 2 +- tutorials/autotvm/tune_nnvm_arm.py | 58 ++++--- tutorials/nnvm_quick_start.py | 2 +- 19 files changed, 299 insertions(+), 80 deletions(-) create mode 100644 topi/python/topi/arm_cpu/conv2d_transpose.py diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index c9baa116e8aa..865024733494 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -254,6 +254,7 @@ struct Conv2DTransposeParam : public dmlc::Parameter { int groups; std::string layout; std::string kernel_layout; + int out_dtype; bool use_bias; DMLC_DECLARE_PARAMETER(Conv2DTransposeParam) { @@ -286,6 +287,10 @@ struct Conv2DTransposeParam : public dmlc::Parameter { .describe("Dimension ordering of data and weight. Can be 'OIHW', 'OIHW16o16i', etc." 
"'O', 'I', 'H', 'W' stands for num_filter, input_channel, height, and width" "dimensions respectively."); + DMLC_DECLARE_DTYPE_FIELD(out_dtype) + .add_enum("same", -1) + .set_default(-1) + .describe("Output data type, set to explicit type under mixed precision setting"); DMLC_DECLARE_FIELD(use_bias).set_default(true) .describe("Whether the layer uses a bias vector."); } diff --git a/nnvm/python/nnvm/testing/dcgan.py b/nnvm/python/nnvm/testing/dcgan.py index 421699ad4c14..4adc63bd1f2a 100644 --- a/nnvm/python/nnvm/testing/dcgan.py +++ b/nnvm/python/nnvm/testing/dcgan.py @@ -42,28 +42,31 @@ def deconv2d_bn_relu(data, prefix, **kwargs): def get_symbol(oshape, ngf=128, code=None): """get symbol of dcgan generator""" - assert oshape[-1] == 32, "Only support 32x32 image" - assert oshape[-2] == 32, "Only support 32x32 image" + assert oshape[-1] == 64, "Only support 64x64 image" + assert oshape[-2] == 64, "Only support 64x64 image" code = sym.Variable("data") if code is None else code - net = sym.dense(code, name="g1", units=4*4*ngf*4, use_bias=False) + net = sym.dense(code, name="g1", units=4*4*ngf*8, use_bias=False) net = sym.relu(net) # 4 x 4 - net = sym.reshape(net, shape=(-1, ngf * 4, 4, 4)) + net = sym.reshape(net, shape=(-1, ngf * 8, 4, 4)) # 8 x 8 net = deconv2d_bn_relu( - net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2") + net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2") # 16x16 net = deconv2d_bn_relu( - net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3") + net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3") # 32x32 + net = deconv2d_bn_relu( + net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4") + # 64x64 net = deconv2d( - net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv") + net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv") net = sym.tanh(net) return net -def 
get_workload(batch_size, oshape=(3, 32, 32), ngf=128, random_len=100, dtype="float32"): +def get_workload(batch_size, oshape=(3, 64, 64), ngf=128, random_len=100, dtype="float32"): """Get benchmark workload for a DCGAN generator Parameters diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index f59424203402..f9a2c2813a04 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -251,11 +251,15 @@ def compute_conv2d_transpose(attrs, inputs, _): strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") groups = attrs.get_int("groups") + out_dtype = attrs.get_string("out_dtype") layout = attrs["layout"] + out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype + assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" assert groups == 1, "only support groups == 1 for now" - out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding) + + out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides, padding, out_dtype) if attrs.get_bool("use_bias"): bias = inputs[2] bias = topi.expand_dims(bias, axis=1, num_newaxis=2) diff --git a/nnvm/src/top/nn/convolution.cc b/nnvm/src/top/nn/convolution.cc index 2843bea1f4ad..229d4ac30f78 100644 --- a/nnvm/src/top/nn/convolution.cc +++ b/nnvm/src/top/nn/convolution.cc @@ -556,7 +556,7 @@ v (batch_size, channels, out_height, out_width) if `layout` is `NCHW` .set_attr("FGetAttrDict", ParamGetAttrDict) .set_attr("FListInputNames", UseBiasListInputNames) .set_attr("FInferShape", Conv2DTransposeInferShape) -.set_attr("FInferType", ElemwiseType<-1, 1>) +.set_attr("FInferType", Conv2DInferType) .set_attr("FCorrectLayout", Conv2DTransposeCorrectLayout) .set_num_outputs(1) .set_num_inputs(UseBiasNumInputs) diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py index 98133d369b13..8af030b6b184 100644 --- 
a/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/dcgan.py @@ -40,24 +40,27 @@ def deconv2d_bn_relu(data, prefix, **kwargs): net = mx.sym.Activation(net, name="%s_act" % prefix, act_type='relu') return net -def get_symbol(oshape=(3, 32, 32), ngf=128, code=None): +def get_symbol(oshape=(3, 64, 64), ngf=128, code=None): """get symbol of dcgan generator""" - assert oshape[-1] == 32, "Only support 32x32 image" - assert oshape[-2] == 32, "Only support 32x32 image" + assert oshape[-1] == 64, "Only support 64x64 image" + assert oshape[-2] == 64, "Only support 64x64 image" code = mx.sym.Variable("data") if code is None else code - net = mx.sym.FullyConnected(code, name="g1", num_hidden=4*4*ngf*4, no_bias=True, flatten=False) + net = mx.sym.FullyConnected(code, name="g1", num_hidden=ngf*8*4*4, no_bias=True, flatten=False) net = mx.sym.Activation(net, act_type='relu') # 4 x 4 - net = mx.sym.reshape(net, shape=(-1, ngf * 4, 4, 4)) + net = mx.sym.reshape(net, shape=(-1, ngf * 8, 4, 4)) # 8 x 8 net = deconv2d_bn_relu( - net, ishape=(ngf * 4, 4, 4), oshape=(ngf * 2, 8, 8), kshape=(4, 4), prefix="g2") + net, ishape=(ngf * 8, 4, 4), oshape=(ngf * 4, 8, 8), kshape=(4, 4), prefix="g2") # 16x16 net = deconv2d_bn_relu( - net, ishape=(ngf * 2, 8, 8), oshape=(ngf, 16, 16), kshape=(4, 4), prefix="g3") + net, ishape=(ngf * 4, 8, 8), oshape=(ngf * 2, 16, 16), kshape=(4, 4), prefix="g3") # 32x32 + net = deconv2d_bn_relu( + net, ishape=(ngf * 2, 16, 16), oshape=(ngf, 32, 32), kshape=(4, 4), prefix="g4") + # 64x64 net = deconv2d( - net, ishape=(ngf, 16, 16), oshape=oshape[-3:], kshape=(4, 4), name="g4_deconv") + net, ishape=(ngf, 32, 32), oshape=oshape[-3:], kshape=(4, 4), name="g5_deconv") net = mx.sym.Activation(net, act_type='tanh') return net diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 30802dd8198e..6e95a6e435d0 100644 --- 
a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -345,7 +345,9 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat, msg = msg.split('\n')[-2].split(": ")[1] except Exception: # pylint: disable=broad-except pass - raise InstantiationError(msg) + res_pack.append(MeasureResult((InstantiationError(msg),), + MeasureErrorNo.INSTANTIATION_ERROR, + tstamp - tic, tstamp)) else: res_pack.append(MeasureResult((RuntimeError(msg),), MeasureErrorNo.COMPILE_HOST, diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index 2304b425f34b..93f6d584abfa 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -21,6 +21,8 @@ from tvm import target as _target +from .space import ConfigSpace + logger = logging.getLogger('autotvm') class DispatchContext(object): @@ -120,7 +122,12 @@ def dispatch_func(func, *args, **kwargs): raise RuntimeError("DispatchContext is not initialized") workload = func(*args, **kwargs) cfg = context.query(tgt, workload) - return dispatch_dict[cfg.template_key](cfg, *args, **kwargs) + if cfg.template_key: + return dispatch_dict[cfg.template_key](cfg, *args, **kwargs) + else: + assert dispatch_dict, "No func registered for this dispatcher" + for v in dispatch_dict.values(): + return v(cfg, *args, **kwargs) fdecorate = decorate(fworkload, dispatch_func) fdecorate.register = register @@ -159,13 +166,18 @@ class ApplyHistoryBest(DispatchContext): Otherwise, it is an iterator. default: ConfigEntity, optional The default config to return when no history records + allow_fallback: bool + Whether allow to use a fallback configuration if cannot find + tuned result. 
""" - def __init__(self, records, default=None): + def __init__(self, records, default=None, allow_fallback=False): super(ApplyHistoryBest, self).__init__() self.best_by_targetkey = {} self.best_by_model = {} self._default = default + self._allow_fallback = allow_fallback + self.fallback = {} if records: self.load(records) @@ -244,5 +256,18 @@ def query(self, target, workload): if self._default: return self._default + + if self._allow_fallback: + key = (target, workload) + if key in self.fallback: + return self.fallback[key] + logger.warning( + "Cannot find config for target=%s, workload=%s. A fallback configuration " + "is used, which may bring great performance regression.", target, workload) + cfg = ConfigSpace() + self.fallback[key] = cfg + return cfg + raise RuntimeError( - "Cannot find config for target=%s, workload=%s" % (target, workload)) + "Cannot find config for target=%s, workload=%s. You need to do tuning " + "for this workload to get the config." % (target, workload)) diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index a16527f9cb01..338b46784a75 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -53,12 +53,14 @@ def __init__(self): import nnvm self.symbol2topi = { - nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw] + nnvm.sym.conv2d: [topi.nn.conv2d, topi.nn.depthwise_conv2d_nchw], + nnvm.sym.conv2d_transpose: [topi.nn.conv2d_transpose], } self.topi_to_task = { topi.nn.conv2d: "topi_nn_conv2d", topi.nn.depthwise_conv2d_nchw: "topi_nn_depthwise_conv2d_nchw", + topi.nn.conv2d_transpose_nchw: "topi_nn_conv2d_transpose_nchw", } self._register_dummy() @@ -110,6 +112,15 @@ def _topi_nn_depthwise_conv2d_nchw(*args, **kwargs): s = topi.generic.schedule_depthwise_conv2d_nchw([C]) return s, [A, W, C] + @register("topi_nn_conv2d_transpose_nchw") + def _topi_nn_conv2d_transpose_nchw(*args, **kwargs): + assert not kwargs, "Do not 
support kwargs in template function call" + args = deserialize_args(args) + A, W = args[:2] + C = topi.nn.conv2d_transpose_nchw(*args, **kwargs) + s = topi.generic.schedule_conv2d_transpose_nchw([C]) + return s, [A, W, C] + def reset(self): """Reset task collections""" self.task_collection = [] diff --git a/python/tvm/autotvm/tophub.py b/python/tvm/autotvm/tophub.py index 94ff011f4f28..e11bb7a4fc92 100644 --- a/python/tvm/autotvm/tophub.py +++ b/python/tvm/autotvm/tophub.py @@ -9,6 +9,7 @@ import logging import os import json +import sys from .task import ApplyHistoryBest from .. import target as _target @@ -27,7 +28,7 @@ def _alias(name): return table.get(name, name) -def context(target, extra_files=None): +def context(target, extra_files=None, allow_fallback=False): """Return the dispatch context with pre-tuned parameters. The corresponding downloaded *.log files under tophub root path will be loaded. Users can also add their own files in argument `extra_files`. @@ -38,9 +39,12 @@ def context(target, extra_files=None): The compilation target extra_files: list of str, optional Extra log files to load + allow_fallback: bool + Whether allow to use a fallback configuration if cannot find + tuned result. 
""" rootpath = AUTOTVM_TOPHUB_ROOT_PATH - best_context = ApplyHistoryBest([]) + best_context = ApplyHistoryBest([], allow_fallback=allow_fallback) if isinstance(target, str): target = _target.create(target) @@ -99,7 +103,15 @@ def check_package(backend): if os.path.isfile(os.path.join(AUTOTVM_TOPHUB_ROOT_PATH, backend + ".log")): return - download_package(backend) + + if sys.version_info >= (3,): + import urllib.request as urllib2 + else: + import urllib2 + try: + download_package(backend) + except urllib2.URLError: + logging.warning("Failed to download tophub package for %s", backend) def list_packages(): diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index a777f9c7ceb8..15d5ac1c9689 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -118,8 +118,8 @@ def _callback(tuner, inputs, results): ctx.cur_flops = flops ctx.best_flops = tuner.best_flops - sys.stdout.write('%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' - '| %.2f s\r' % + sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + '| %.2f s' % (prefix, ctx.cur_flops/1e9, ctx.best_flops/1e9, ctx.ct, ctx.total, time.time() - tic)) sys.stdout.flush() diff --git a/topi/python/topi/arm_cpu/__init__.py b/topi/python/topi/arm_cpu/__init__.py index bb79769c1adc..8d78f67ac0b6 100644 --- a/topi/python/topi/arm_cpu/__init__.py +++ b/topi/python/topi/arm_cpu/__init__.py @@ -2,4 +2,5 @@ from . import conv2d from . import depthwise_conv2d +from . import conv2d_transpose from . 
import bitserial_conv2d diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index f5dbec8e552b..48bb4fb022c7 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -42,7 +42,7 @@ def schedule_conv2d_nchw_arm_cpu(cfg, outs): def _callback(op): # schedule conv2d - if 'spatial_conv_output' in op.tag: + if 'spatial_conv2d_output' in op.tag: output = op.output(0) conv = op.input_tensors[0] @@ -60,7 +60,7 @@ def _callback(op): _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) - if 'winograd_conv_output' in op.tag: + if 'winograd_conv2d_output' in op.tag: output = op.output(0) _schedule_winograd(cfg, s, output, outs[0]) @@ -72,7 +72,7 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n assert layout == "NCHW", "Only support NCHW" out_dtype = out_dtype or data.dtype - _, CI, IH, IW = get_const_tuple(data.shape) + N, CI, IH, IW = get_const_tuple(data.shape) if len(kernel.shape) == 4: pre_packed = False CO, _, KH, KW = get_const_tuple(kernel.shape) @@ -81,13 +81,12 @@ def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, n CO, _, KH, KW, VC = get_const_tuple(kernel.shape) CO = CO * VC - pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (KH, KW)) + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW)) HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) - N = 1 - OH = (IH + pad_top + pad_down - KH) // HSTR + 1 + OH = (IH + pad_top + pad_bottom - KH) // HSTR + 1 OW = (IW + pad_left + pad_right - KW) // WSTR + 1 - data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_down, pad_right]) + data_pad = pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right]) # ==================== define configuration space ==================== n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) @@ -145,7 +144,7 @@ def _decl_spatial_pack(cfg, data, 
kernel, strides, padding, layout, out_dtype, n output = tvm.compute(oshape, lambda n, co, h, w: conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], - name='output_unpack', tag='spatial_conv_output', + name='output_unpack', tag='spatial_conv2d_output', attrs={'workload': _conv_arg_to_workload(data, kernel, strides, padding, layout, out_dtype)}) return output @@ -195,11 +194,14 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, if kernel_vec.op.name == 'kernel_vec': co, _, _, _, _ = s[kernel_vec].op.axis if autotvm.GLOBAL_SCOPE.in_tuning: - # kernel packing will be pre-computed during compliation, so we skip + # kernel packing will be pre-computed during compilation, so we skip # this part to make tuning records correct s[kernel_vec].pragma(co, 'debug_skip_region') else: s[kernel_vec].parallel(co) + elif kernel_vec.op.name == 'kernel_vec_conv2d_transpose': # for conv2d transpose + co, _, _, _, _ = s[kernel_vec].op.axis + s[kernel_vec].parallel(co) return s @@ -330,7 +332,7 @@ def _decl_winograd(cfg, data, kernel, strides, padding, layout, out_dtype, tile_ # unpack output output = tvm.compute((N, K, H, W), lambda n, k, h, w: Y[k][n * nH * nW + (h//m) * nW + w//m][h % m][w % m], - name='output', tag='winograd_conv_output', + name='output', tag='winograd_conv2d_output', attrs={'workload': _winograd_conv_arg_to_workload( data, kernel, strides, padding, layout, out_dtype, tile_size)}) @@ -462,7 +464,7 @@ def schedule_conv2d_winograd_without_weight_transform_(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): - if 'winograd_conv_output' in op.tag: + if 'winograd_conv2d_output' in op.tag: output = op.output(0) _schedule_winograd(cfg, s, output, outs[0]) diff --git a/topi/python/topi/arm_cpu/conv2d_transpose.py b/topi/python/topi/arm_cpu/conv2d_transpose.py new file mode 100644 index 000000000000..99f199782ed2 --- /dev/null +++ b/topi/python/topi/arm_cpu/conv2d_transpose.py @@ -0,0 +1,144 @@ +# pylint: disable=invalid-name, unused-variable 
+"""Transposed 2D convolution operators (sometimes called Deconvolution).""" +from __future__ import absolute_import as _abs + +import tvm +from tvm import autotvm + +from ..generic import schedule_conv2d_transpose_nchw +from ..nn import conv2d_transpose_nchw, dilate, pad, get_pad_tuple +from ..util import get_const_tuple, traverse_inline +from .conv2d import _schedule_spatial_pack + +@autotvm.task.register_topi_compute(conv2d_transpose_nchw, "arm_cpu", "direct") +def conv2d_transpose_nchw_arm(cfg, Input, Filter, strides, padding, out_dtype): + """Transposed 2D convolution nchw forward operator. + + Parameters + ---------- + Input : tvm.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + Filter : tvm.Tensor + 4-D with shape [in_channel, num_filter, filter_height, filter_width] + + strides : tuple of two ints + The spatial stride along height and width + + padding : int or str + Padding size, or ['VALID', 'SAME'] + + out_dtype: str + The output data type. This is used for mixed precision. 
+ + Returns + ------- + Output : tvm.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + return _decl_spatial_pack(cfg, Input, Filter, strides, padding, "NCHW", out_dtype, 2) + +def _decl_spatial_pack(cfg, data, kernel, strides, padding, layout, out_dtype, num_tile): + assert layout == "NCHW", "Only support NCHW" + out_dtype = out_dtype or data.dtype + + N, CI, IH, IW = get_const_tuple(data.shape) + _, CO, KH, KW = get_const_tuple(kernel.shape) + + pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (KH, KW)) + bpad_top, bpad_bottom = KH - 1 - pad_top, KH - 1 - pad_bottom + bpad_left, bpad_right = KW - 1 - pad_left, KW - 1 - pad_right + HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides) + + OH = (IH - 1) * HSTR - pad_top - pad_bottom + KH + OW = (IW - 1) * WSTR - pad_left - pad_right + KW + + dilated_input = dilate(data, [1, 1, HSTR, WSTR]) + data_pad = pad(dilated_input, [0, 0, bpad_top, bpad_left], [0, 0, bpad_bottom, bpad_right]) + + # ==================== define configuration space ==================== + n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW) + ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW) + + if num_tile == 2: # for arm cpu + co, vc = cfg.define_split('tile_co', co, num_outputs=2) + oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2) + ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2) + elif num_tile == 3: # for mali gpu + co, _, vc = cfg.define_split('tile_co', co, num_outputs=3) + oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3) + ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3) + else: + raise RuntimeError("Invalid num_tile") + + cfg.define_reorder("reorder_0", + [n, co, oh, ow, ci, kh, kw, vh, vw, vc], + policy='candidate', candidate=[ + [n, co, oh, ow, ci, kh, kw, vh, vw, vc], + [n, co, oh, ow, ci, kh, kw, vc, vh, vw]]) + + cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll') + 
cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec') + # ==================================================================== + + VC = cfg["tile_co"].size[-1] + VH = cfg["tile_oh"].size[-1] + VW = cfg["tile_ow"].size[-1] + + dvshape = (N, OH // VH, OW // VW, CI, VH + KH-1, VW + KW-1) + kvshape = (CO // VC, CI, KH, KW, VC) + ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC) + oshape = (N, CO, OH, OW) + + data_vec = tvm.compute(dvshape, lambda n, h, w, ci, vh, vw: + data_pad[n][ci][h*VH + vh][w*VW + vw], + name='data_vec') + + kernel_vec = tvm.compute(kvshape, lambda co, ci, kh, kw, vc: + kernel[ci][co*VC+vc][kh][kw], + name='kernel_vec_conv2d_transpose') + + ci = tvm.reduce_axis((0, CI), name='ci') + kh = tvm.reduce_axis((0, KH), name='kh') + kw = tvm.reduce_axis((0, KW), name='kw') + + conv = tvm.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \ + tvm.sum(data_vec[n, h, w, ci, vh + kh, vw + kw].astype(out_dtype) * + kernel_vec[co, ci, KH - 1 - kh, KW - 1 - kw, vc].astype(out_dtype), + axis=[ci, kh, kw]), name='conv') + + output = tvm.compute(oshape, lambda n, co, h, w: + conv[n][co//VC][h//VH][w//VW][h%VH][w%VW][co%VC], + name='output_unpack', tag='spatial_conv2d_transpose_output') + return output + + +# register customized schedule for arm cpu. 
+@autotvm.task.register_topi_schedule(schedule_conv2d_transpose_nchw, "arm_cpu", "direct") +def schedule_conv2d_transpose_arm(cfg, outs): + """Schedule conv2d transpose for arm cpu""" + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'spatial_conv2d_transpose_output' in op.tag: + output = op.output(0) + conv = op.input_tensors[0] + + data_vec = conv.op.input_tensors[0] + data_pad = data_vec.op.input_tensors[0] + dilated_input = data_pad.op.input_tensors[0] + s[data_pad].compute_inline() + s[dilated_input].compute_inline() + + kernel_vec = conv.op.input_tensors[1] + if kernel_vec.op.name == 'kernel_vec': + kernel = kernel_vec.op.input_tensors[0] + else: + kernel = kernel_vec + if isinstance(kernel.op, tvm.tensor.ComputeOp) and "dilate" in kernel.op.tag: + s[kernel].compute_inline() + + _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, outs[0]) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/arm_cpu/depthwise_conv2d.py b/topi/python/topi/arm_cpu/depthwise_conv2d.py index c4dbeae38eeb..8aafc436319f 100644 --- a/topi/python/topi/arm_cpu/depthwise_conv2d.py +++ b/topi/python/topi/arm_cpu/depthwise_conv2d.py @@ -15,7 +15,16 @@ # register customized schedule for arm cpu. @autotvm.task.register_topi_schedule(schedule_depthwise_conv2d_nchw, 'arm_cpu', 'direct') def schedule_depthwise_conv2d_nchw_(cfg, outs): - """Schedule depthwise conv2d""" + """Schedule depthwise conv2d + + Parameters + ---------- + cfg: ConfigEntity + The configuration of this template + outs: Array of Tensor + The computation graph description of depthwise convolution2d + in the format of an array of tensors. 
+ """ outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) @@ -79,10 +88,8 @@ def _schedule(cfg, s, data, data_pad, kernel, output): return s - scheduled_ops = [] - def _callback(op): - if op.tag == 'depthwise_conv2d_nchw' and op not in scheduled_ops: + if op.tag == 'depthwise_conv2d_nchw': output = op.output(0) kernel = op.input_tensors[1] data = op.input_tensors[0] @@ -92,7 +99,5 @@ def _callback(op): data = data_pad.op.input_tensors[0] _schedule(cfg, s, data, data_pad, kernel, output) - scheduled_ops.append(op) - traverse_inline(s, outs[0].op, _callback) return s diff --git a/topi/python/topi/nn/conv2d_transpose.py b/topi/python/topi/nn/conv2d_transpose.py index 270409ad099b..f9fdd3f1b2dc 100644 --- a/topi/python/topi/nn/conv2d_transpose.py +++ b/topi/python/topi/nn/conv2d_transpose.py @@ -10,7 +10,7 @@ @tvm.target.generic_func -def conv2d_transpose_nchw(Input, Filter, strides, padding): +def conv2d_transpose_nchw(Input, Filter, strides, padding, out_dtype): """Transposed 2D convolution nchw forward operator. Parameters @@ -27,6 +27,9 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): padding : int or str Padding size, or ['VALID', 'SAME'] + out_dtype : str + The output data type. This is used for mixed precision. 
+ Returns ------- Output : tvm.Tensor @@ -58,7 +61,8 @@ def conv2d_transpose_nchw(Input, Filter, strides, padding): Output = tvm.compute( (batch, out_c, out_h, out_w), lambda b, c, h, w: tvm.sum( - PaddedInput[b, dc, h+dh, w+dw] * Filter[dc, c, filter_h-1-dh, filter_w-1-dw], + PaddedInput[b, dc, h+dh, w+dw].astype(out_dtype) * + Filter[dc, c, filter_h-1-dh, filter_w-1-dw].astype(out_dtype), axis=[dc, dh, dw]), tag="conv2d_transpose_nchw") return Output diff --git a/topi/tests/python/test_topi_conv2d.py b/topi/tests/python/test_topi_conv2d.py index 124c98c65c7a..365fdf551c4f 100644 --- a/topi/tests/python/test_topi_conv2d.py +++ b/topi/tests/python/test_topi_conv2d.py @@ -40,7 +40,7 @@ def get_ref_data(): np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) def test_conv2d(): - with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b')): + with autotvm.tophub.context(tvm.target.arm_cpu('rasp3b'), allow_fallback=True): verify_conv2d(1, 56, 64, 64, 3, 1, 1) if __name__ == "__main__": diff --git a/topi/tests/python/test_topi_conv2d_transpose_nchw.py b/topi/tests/python/test_topi_conv2d_transpose_nchw.py index 674fe9fd8953..0c985400031a 100644 --- a/topi/tests/python/test_topi_conv2d_transpose_nchw.py +++ b/topi/tests/python/test_topi_conv2d_transpose_nchw.py @@ -12,7 +12,7 @@ def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel, A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') W = tvm.placeholder((in_channel, num_filter, kernel, kernel), name='W') - B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding) + B = topi.nn.conv2d_transpose_nchw(A, W, [stride, stride], padding, A.dtype) C = topi.nn.relu(B) a_shape = get_const_tuple(A.shape) diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py index 37779403fd0a..d11823f204e1 100644 --- a/tutorials/autotvm/tune_nnvm_arm.py +++ b/tutorials/autotvm/tune_nnvm_arm.py @@ -62,7 +62,7 @@ def get_network(name, batch_size): """Get the symbol 
definition and random weight of a network""" - shape = {"data": (batch_size, 3, 224, 224)} + input_shape = (batch_size, 3, 224, 224) output_shape = (batch_size, 1000) if name =='resnet-18': @@ -90,7 +90,7 @@ def get_network(name, batch_size): else: raise ValueError("Unsupported network: " + name) - return net, params, shape, output_shape + return net, params, input_shape, output_shape ################################################################# # Start RPC Tracker @@ -226,8 +226,8 @@ def get_network(name, batch_size): def tune_tasks(tasks, measure_option, tuner='xgb', - n_trial=500, - early_stopping=200, + n_trial=1000, + early_stopping=None, log_filename='tuning.log', use_transfer_learning=True, try_winograd=True): @@ -283,10 +283,10 @@ def tune_tasks(tasks, def tune_and_evaluate(): # extract workloads from nnvm graph print("Extract tasks...") - net, params, shape, out_shape = get_network(network, batch_size=1) - tasks = autotvm.task.extract_from_graph(net, shape=shape, dtype=dtype, - symbols=(nnvm.sym.conv2d,), - target=target) + net, params, input_shape, out_shape = get_network(network, batch_size=1) + tasks = autotvm.task.extract_from_graph(net, target=target, + shape={'data': input_shape}, dtype=dtype, + symbols=(nnvm.sym.conv2d,)) # run tuning tasks print("Tuning...") @@ -298,7 +298,7 @@ def tune_and_evaluate(): with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']): graph, lib, params = nnvm.compiler.build( net, target=target, - shape=shape, params=params, dtype=dtype) + shape={'data': input_shape}, params=params, dtype=dtype) # export library tmp = tempdir() @@ -319,7 +319,7 @@ def tune_and_evaluate(): # upload parameters to device ctx = remote.context(str(target), 0) rparams = {k: tvm.nd.array(v, ctx) for k, v in params.items()} - data_tvm = tvm.nd.array((np.random.uniform(size=shape['data'])).astype(dtype)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) module = runtime.create(graph, rlib, ctx) 
module.set_input('data', data_tvm) module.set_input(**rparams) @@ -341,35 +341,33 @@ def tune_and_evaluate(): # ------------- # The tuning needs to train xgboost models and use them for prediction. # So a high performance CPU is recommended. -# It takes about 1.5 hour on a 32T AMD Ryzen CPU. +# It takes about 2 hours on a 32T AMD Ryzen CPU. # One sample output is # # .. code-block:: bash # # Extract tasks... # Tuning... -# [Task 1/16] Current/Best: 13.15/ 20.49 GFLOPS | Progress: (297/1000) | 348.51 s Done. -# [Task 2/16] Current/Best: 16.66/ 22.64 GFLOPS | Progress: (475/1000) | 415.42 s Done. -# [Task 3/16] Current/Best: 10.33/ 14.19 GFLOPS | Progress: (306/1000) | 239.61 s Done. -# [Task 4/16] Current/Best: 13.29/ 20.88 GFLOPS | Progress: (242/1000) | 227.48 s Done. -# [Task 5/16] Current/Best: 13.28/ 15.61 GFLOPS | Progress: (237/1000) | 191.56 s Done. -# [Task 6/16] Current/Best: 20.16/ 23.86 GFLOPS | Progress: (315/1000) | 304.31 s Done. -# [Task 7/16] Current/Best: 9.22/ 22.00 GFLOPS | Progress: (458/1000) | 433.26 s Done. -# [Task 8/16] Current/Best: 14.12/ 17.80 GFLOPS | Progress: (270/1000) | 240.73 s Done. -# [Task 9/16] Current/Best: 14.59/ 24.02 GFLOPS | Progress: (209/1000) | 213.61 s Done. -# [Task 10/16] Current/Best: 9.86/ 21.74 GFLOPS | Progress: (367/1000) | 359.93 s Done. -# [Task 11/16] Current/Best: 5.01/ 18.86 GFLOPS | Progress: (202/1000) | 191.18 s Done. -# [Task 12/16] Current/Best: 8.61/ 25.23 GFLOPS | Progress: (220/1000) | 220.74 s Done. -# [Task 13/16] Current/Best: 10.87/ 25.79 GFLOPS | Progress: (465/1000) | 902.14 s Done. -# [Task 14/16] Current/Best: 15.33/ 29.38 GFLOPS | Progress: (239/1000) | 481.33 s Done. -# [Task 15/16] Current/Best: 12.09/ 38.60 GFLOPS | Progress: (476/1000) | 928.35 s Done. -# [Task 16/16] Current/Best: 16.77/ 47.08 GFLOPS | Progress: (255/1000) | 439.91 s Done. +# [Task 1/16] Current/Best: 18.85/ 19.67 GFLOPS | Progress: (353/1000) | 387.05 s Done. 
+# [Task 2/16] Current/Best: 16.10/ 23.50 GFLOPS | Progress: (444/1000) | 379.99 s Done. +# [Task 3/16] Current/Best: 5.49/ 13.96 GFLOPS | Progress: (610/1000) | 485.87 s Done. +# [Task 4/16] Current/Best: 10.07/ 20.48 GFLOPS | Progress: (430/1000) | 391.66 s Done. +# [Task 5/16] Current/Best: 11.50/ 15.50 GFLOPS | Progress: (374/1000) | 356.03 s Done. +# [Task 6/16] Current/Best: 10.76/ 23.77 GFLOPS | Progress: (526/1000) | 526.42 s Done. +# [Task 7/16] Current/Best: 12.71/ 22.03 GFLOPS | Progress: (341/1000) | 322.96 s Done. +# [Task 8/16] Current/Best: 8.60/ 17.91 GFLOPS | Progress: (272/1000) | 236.08 s Done. +# [Task 9/16] Current/Best: 15.37/ 23.62 GFLOPS | Progress: (275/1000) | 275.18 s Done. +# [Task 10/16] Current/Best: 6.62/ 23.01 GFLOPS | Progress: (330/1000) | 315.02 s Done. +# [Task 11/16] Current/Best: 1.85/ 21.39 GFLOPS | Progress: (281/1000) | 239.19 s Done. +# [Task 12/16] Current/Best: 15.41/ 24.02 GFLOPS | Progress: (258/1000) | 270.82 s Done. +# [Task 13/16] Current/Best: 17.96/ 25.79 GFLOPS | Progress: (380/1000) | 738.29 s Done. +# [Task 14/16] Current/Best: 14.81/ 31.17 GFLOPS | Progress: (413/1000) | 799.21 s Done. +# [Task 15/16] Current/Best: 24.39/ 40.97 GFLOPS | Progress: (355/1000) | 700.25 s Done. +# [Task 16/16] Current/Best: 9.42/ 49.90 GFLOPS | Progress: (348/1000) | 603.84 s Done. # Compile... # Upload... # Evaluate inference time cost... -# Mean inference time (std dev): 156.51 ms (0.89 ms) -# - +# Mean inference time (std dev): 157.29 ms (1.74 ms) ###################################################################### # diff --git a/tutorials/nnvm_quick_start.py b/tutorials/nnvm_quick_start.py index c9f6c33591d0..c171823604cd 100644 --- a/tutorials/nnvm_quick_start.py +++ b/tutorials/nnvm_quick_start.py @@ -109,7 +109,7 @@ # Save and Load Compiled Module # ----------------------------- # We can also save the graph, lib and parameters into files and load them -# back in development environment. +# back in deploy environment. 
#################################################### From 764516a6b3375d611a1dfa7f8735ac3f185a2f18 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 9 Aug 2018 14:21:05 -0700 Subject: [PATCH 10/77] [TEST] force openblas threads to be 1 (#1580) --- tests/scripts/task_python_nnvm.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh index 2fc41980fb3d..790073a2fe8b 100755 --- a/tests/scripts/task_python_nnvm.sh +++ b/tests/scripts/task_python_nnvm.sh @@ -1,6 +1,8 @@ #!/bin/bash export PYTHONPATH=nnvm/python:python:topi/python +# to avoid openblas threading error +export OMP_NUM_THREADS=1 echo "Running unittest..." python -m nose -v nnvm/tests/python/unittest || exit -1 From 2afe024809abd22a76ac8072fb0a0310df7360f2 Mon Sep 17 00:00:00 2001 From: Dayananda V Date: Fri, 10 Aug 2018 07:11:49 +0530 Subject: [PATCH 11/77] Vulkan TVM Android Support (#1571) --- apps/android_rpc/README.md | 25 +++-- .../app/src/main/jni/Application.mk | 16 ++-- apps/android_rpc/tests/android_rpc_test.py | 91 +++++++++++++------ .../src/main/java/ml/dmlc/tvm/TVMContext.java | 15 +++ .../main/java/ml/dmlc/tvm/rpc/RPCSession.java | 18 ++++ python/tvm/rpc/client.py | 4 + web/tvm_runtime.js | 2 + 7 files changed, 125 insertions(+), 46 deletions(-) diff --git a/apps/android_rpc/README.md b/apps/android_rpc/README.md index 41d361c823ed..eef22f3c7010 100644 --- a/apps/android_rpc/README.md +++ b/apps/android_rpc/README.md @@ -123,18 +123,25 @@ export TVM_NDK_CC=/opt/android-toolchain-arm64/bin/aarch64-linux-android-g++ python android_rpc_test.py ``` -This will compile TVM IR to shared libraries (CPU and OpenCL) and run vector addition on your Android device. On my test device, it gives following results. +This will compile TVM IR to shared libraries (CPU, OpenCL and Vulkan) and run vector addition on your Android device. 
To verify compiled TVM IR shared libraries on OpenCL target set [`'test_opencl = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L25) and on Vulkan target set [`'test_vulkan = True'`](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py#L27) in [tests/android_rpc_test.py](https://github.com/dmlc/tvm/blob/master/apps/android_rpc/tests/android_rpc_test.py); by default only the CPU target is executed. +On my test device, it gives the following results. ```bash -TVM: Initializing cython mode... -[01:21:43] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64 -[01:21:43] src/runtime/opencl/opencl_device_api.cc:194: Initialize OpenCL platform 'Apple' -[01:21:43] src/runtime/opencl/opencl_device_api.cc:214: opencl(0)='Iris' cl_device_id=0x1024500 -[01:21:44] src/codegen/llvm/codegen_llvm.cc:75: set native vector to be 32 for target aarch64 -Run GPU test ... -0.000155807 secs/op Run CPU test ... -0.00139824 secs/op +0.000962932 secs/op + +Run GPU(OpenCL Flavor) test ... +0.000155807 secs/op + +[23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:674: Cannot initialize vulkan: [23:29:34] /home/tvm/src/runtime/vulkan/vulkan_device_api.cc:512: Check failed: __e == VK_SUCCESS Vulan Error, code=-9: VK_ERROR_INCOMPATIBLE_DRIVER + +Stack trace returned 10 entries: +[bt] (0) /home/user/.local/lib/python3.6/site-packages/tvm-0.4.0-py3.6-linux-x86_64.egg/tvm/libtvm.so(dmlc::StackTrace[abi:cxx11]()+0x53) [0x7f477f5399f3] +......... + +You can still compile vulkan module but cannot run locally +Run GPU(Vulkan Flavor) test ... +0.000225198 secs/op ``` You can define your own TVM operators and test via this RPC app on your Android device to find the most optimized TVM schedule.
diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 5bf52bdaffc0..f142e2995777 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -1,9 +1,9 @@ ifndef config - ifneq ("$(wildcard ./config.mk)","") - config ?= config.mk - else - config ?= make/config.mk - endif + ifneq ("$(wildcard ./config.mk)","") + config ?= config.mk + else + config ?= make/config.mk + endif endif include $(config) @@ -16,10 +16,10 @@ APP_STL := c++_static APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++11 -Oz -frtti ifeq ($(USE_OPENCL), 1) - APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 + APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif ifeq ($(USE_VULKAN), 1) - APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1 - APP_LDFLAGS += -lvulkan + APP_CPPFLAGS += -DTVM_VULKAN_RUNTIME=1 + APP_LDFLAGS += -lvulkan endif diff --git a/apps/android_rpc/tests/android_rpc_test.py b/apps/android_rpc/tests/android_rpc_test.py index cfb04c1ca9a9..44618efd45c1 100644 --- a/apps/android_rpc/tests/android_rpc_test.py +++ b/apps/android_rpc/tests/android_rpc_test.py @@ -21,59 +21,92 @@ arch = "arm64" target = "llvm -target=%s-linux-android" % arch +# whether enable to execute test on OpenCL target +test_opencl = False +# whether enable to execute test on Vulkan target +test_vulkan = False + def test_rpc_module(): # graph n = tvm.convert(1024) A = tvm.placeholder((n,), name='A') B = tvm.compute(A.shape, lambda *i: A(*i) + 1.0, name='B') + a_np = np.random.uniform(size=1024).astype(A.dtype) temp = util.tempdir() - s = tvm.create_schedule(B.op) - xo, xi = s[B].split(B.op.axis[0], factor=64) - s[B].bind(xi, tvm.thread_axis("threadIdx.x")) - s[B].bind(xo, tvm.thread_axis("blockIdx.x")) - # Build the dynamic lib. 
- # If we don't want to do metal and only use cpu, just set target to be target - f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") - path_dso1 = temp.relpath("dev_lib2.so") - f.export_library(path_dso1, ndk.create_shared) + # Establish remote connection with target hardware + tracker = rpc.connect_tracker(tracker_host, tracker_port) + remote = tracker.request(key, priority=0, + session_timeout=60) + + # Compile the Graph for CPU target s = tvm.create_schedule(B.op) xo, xi = s[B].split(B.op.axis[0], factor=64) s[B].parallel(xi) s[B].pragma(xo, "parallel_launch_point") s[B].pragma(xi, "parallel_barrier_when_finish") f = tvm.build(s, [A, B], target, name="myadd_cpu") - path_dso2 = temp.relpath("cpu_lib.so") - f.export_library(path_dso2, ndk.create_shared) - - tracker = rpc.connect_tracker(tracker_host, tracker_port) - remote = tracker.request(key, priority=0, - session_timeout=60) + path_dso_cpu = temp.relpath("cpu_lib.so") + f.export_library(path_dso_cpu, ndk.create_shared) + # Execute the portable graph on cpu target print('Run CPU test ...') ctx = remote.cpu(0) - remote.upload(path_dso2) + remote.upload(path_dso_cpu) f2 = remote.load_module("cpu_lib.so") - a_np = np.random.uniform(size=1024).astype(A.dtype) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) time_f = f2.time_evaluator(f2.entry_name, ctx, number=10) cost = time_f(a, b).mean - print('%g secs/op' % cost) + print('%g secs/op\n' % cost) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + # Compile the Graph for OpenCL target + if test_opencl: + s = tvm.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=64) + s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + # Build the dynamic lib. 
+ # If we don't want to do metal and only use cpu, just set target to be target + f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd") + path_dso_cl = temp.relpath("dev_lib_cl.so") + f.export_library(path_dso_cl, ndk.create_shared) + + print('Run GPU(OpenCL Flavor) test ...') + ctx = remote.cl(0) + remote.upload(path_dso_cl) + f1 = remote.load_module("dev_lib_cl.so") + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + cost = time_f(a, b).mean + print('%g secs/op\n' % cost) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + + # Compile the Graph for Vulkan target + if test_vulkan: + s = tvm.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=64) + s[B].bind(xi, tvm.thread_axis("threadIdx.x")) + s[B].bind(xo, tvm.thread_axis("blockIdx.x")) + # Build the dynamic lib. + # If we don't want to do metal and only use cpu, just set target to be target + f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd") + path_dso_vulkan = temp.relpath("dev_lib_vulkan.so") + f.export_library(path_dso_vulkan, ndk.create_shared) + + print('Run GPU(Vulkan Flavor) test ...') + ctx = remote.vulkan(0) + remote.upload(path_dso_vulkan) + f1 = remote.load_module("dev_lib_vulkan.so") + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) + cost = time_f(a, b).mean + print('%g secs/op\n' % cost) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) - print('Run GPU test ...') - ctx = remote.cl(0) - remote.upload(path_dso1) - f1 = remote.load_module("dev_lib2.so") - a_np = np.random.uniform(size=1024).astype(A.dtype) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) - time_f = f1.time_evaluator(f1.entry_name, ctx, number=10) - cost = time_f(a, b).mean - print('%g secs/op' % cost) - np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 
1) if __name__ == "__main__": test_rpc_module() diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java index 0d108e0a2943..d9051f0d9d4d 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/TVMContext.java @@ -30,6 +30,7 @@ public class TVMContext { MASK2STR.put(1, "cpu"); MASK2STR.put(2, "gpu"); MASK2STR.put(4, "opencl"); + MASK2STR.put(7, "vulkan"); MASK2STR.put(8, "metal"); MASK2STR.put(9, "vpi"); @@ -38,6 +39,7 @@ public class TVMContext { STR2MASK.put("cuda", 2); STR2MASK.put("cl", 4); STR2MASK.put("opencl", 4); + STR2MASK.put("vulkan", 7); STR2MASK.put("metal", 8); STR2MASK.put("vpi", 9); } @@ -81,6 +83,19 @@ public static TVMContext opencl() { return opencl(0); } + /** + * Construct a Vulkan device. + * @param devId The device id + * @return The created context + */ + public static TVMContext vulkan(int devId) { + return new TVMContext(7, devId); + } + + public static TVMContext vulkan() { + return vulkan(0); + } + /** * Construct a metal device. * @param devId The device id diff --git a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java index 0eec9224a40c..8ebf188b0667 100644 --- a/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java +++ b/jvm/core/src/main/java/ml/dmlc/tvm/rpc/RPCSession.java @@ -143,6 +143,24 @@ public TVMContext cl() { return cl(0); } + /** + * Construct remote Vulkan device. + * @param devId device id. + * @return Remote Vulkan context. + */ + public TVMContext vulkan(int devId) { + return context(7, devId); + } + + /** + * Construct remote Vulkan device. + * @return Remote Vulkan context. + */ + public TVMContext vulkan() { + return vulkan(0); + } + + /** * Construct remote Metal device. * @param devId device id.
diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index 57f368b0e660..ffbe6eeab6ee 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -130,6 +130,10 @@ def cl(self, dev_id=0): """Construct OpenCL device.""" return self.context(4, dev_id) + def vulkan(self, dev_id=0): + """Construct Vulkan device.""" + return self.context(7, dev_id) + def metal(self, dev_id=0): """Construct Metal device.""" return self.context(8, dev_id) diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index ef594e9433fb..786745d3ce88 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -696,6 +696,7 @@ var tvm_runtime = tvm_runtime || {}; 1 : "cpu", 2 : "gpu", 4 : "opencl", + 7 : "vulkan", 8 : "metal", 9 : "vpi", 11 : "opengl", @@ -706,6 +707,7 @@ var tvm_runtime = tvm_runtime || {}; "cuda": 2, "cl": 4, "opencl": 4, + "vulkan": 7, "metal": 8, "vpi": 9, "opengl": 11, From 48fc410e0102e22ab9fdd5ec8879e10c6a3f1580 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 9 Aug 2018 18:55:48 -0700 Subject: [PATCH 12/77] [TEAM] merrymercy->code owner (#1581) --- CONTRIBUTORS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6e3cf55b94b0..2d571ba668ea 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,6 +18,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Yuwei Hu](https://github.com/Huyuwei) TOPI - [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend - [Nick Hynes](https://github.com/nhynes) SGX and secured computing +- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM ## Reviewers - [Masahiro Masuda](https://github.com/masahi) @@ -27,7 +28,6 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Alex Weaver](https://github.com/alex-weaver) - [Eddie Yan](https://github.com/eqy) - [Joshua Z. 
Zhang](https://github.com/zhreshold) -- [Lianmin Zheng](https://github.com/merrymercy) ## List of Contributors - [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors) From 48ff777aa347036af5c22d25aa9096b61a9519ad Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Fri, 10 Aug 2018 11:45:09 -0700 Subject: [PATCH 13/77] [AUTOTVM] API change (#1583) --- python/tvm/autotvm/__init__.py | 2 +- python/tvm/autotvm/measure/__init__.py | 2 +- python/tvm/autotvm/measure/measure.py | 10 ++++---- python/tvm/autotvm/measure/measure_methods.py | 24 +++++++++---------- python/tvm/autotvm/tuner/tuner.py | 4 ++-- tutorials/autotvm/tune_conv2d_cuda.py | 2 +- tutorials/autotvm/tune_nnvm_arm.py | 6 ++--- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/python/tvm/autotvm/__init__.py b/python/tvm/autotvm/__init__.py index 20426be84aa1..5b312d93d288 100644 --- a/python/tvm/autotvm/__init__.py +++ b/python/tvm/autotvm/__init__.py @@ -22,7 +22,7 @@ from . import tophub # some shortcuts -from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo, use_rpc +from .measure import measure_option, MeasureInput, MeasureResult, MeasureErrorNo from .tuner import callback from .task import template, get_config, create, ConfigSpace, ConfigEntity, \ ApplyHistoryBest as apply_history_best diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py index b9bd3c37b01d..880dfd1ffe29 100644 --- a/python/tvm/autotvm/measure/__init__.py +++ b/python/tvm/autotvm/measure/__init__.py @@ -1,7 +1,7 @@ """Distributed executor infrastructure to scale up the tuning""" from .measure import MeasureInput, MeasureResult, MeasureErrorNo, measure_option -from .measure_methods import request_remote, check_remote, create_measure_batch, use_rpc +from .measure_methods import request_remote, check_remote, create_measure_batch, rpc from .local_executor import LocalExecutor from .executor import Future, Executor diff --git 
a/python/tvm/autotvm/measure/measure.py b/python/tvm/autotvm/measure/measure.py index 6a05e1a6a349..2325a970bc45 100644 --- a/python/tvm/autotvm/measure/measure.py +++ b/python/tvm/autotvm/measure/measure.py @@ -49,7 +49,7 @@ def measure_option(measure_func, number=1, repeat=1, timeout=60, - parallel_num=1, + n_parallel=1, do_fork=True, build_func='default', check_correctness=False, @@ -63,7 +63,7 @@ def measure_option(measure_func, and a RPC server silently for the user. callable: It is a callable function for measurement. - See the return value of measure/measure_methods.py::use_rpc for example. + See the return value of measure/measure_methods.py::rpc for example. number : int, optional Number of times to do the measurement for average repeat : int, optional @@ -74,7 +74,7 @@ def measure_option(measure_func, timeout: int, optional Timeout for a whole batch. TimeoutError will be returned as the result if a task timeouts. - parallel_num: int, optional + n_parallel: int, optional The number of measurement task that can run in parallel. Set this according to the number of cpu cores (for compilation) and the number of devices you have (for measuring generate code). @@ -106,7 +106,7 @@ def measure_option(measure_func, and handle the logic of measurement. 
Signature: - * measure_func (see the return value of measure/measure_methods.py::use_rpc for example) + * measure_func (see the return value of measure/measure_methods.py::rpc for example) def measure_func(input_pack, build_func, build_kwargs, number, repeat, ref_input, ref_output): return measure_results @@ -119,7 +119,7 @@ def build_func(inp, tmp_dir, **kwargs): 'number': number, 'repeat': repeat, 'timeout': timeout, - 'parallel_num': parallel_num, + 'n_parallel': n_parallel, 'do_fork': do_fork, 'build_func': build_func, 'check_correctness': check_correctness, diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 6e95a6e435d0..e192ee26ee3e 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -13,8 +13,8 @@ import numpy as np -from ... import rpc, ir_pass, build, build_config, nd, context, TVMError, register_func, \ - target as _target +from ... import ir_pass, build, build_config, nd, context, TVMError, register_func, \ + target as _target, rpc as _rpc from ...contrib import nvcc, util, ndk from ..util import get_const_tuple @@ -60,7 +60,7 @@ def request_remote(device_key, tracker_addr=None, priority=1, timeout=60): host = os.environ['TVM_TRACKER_HOST'] port = int(os.environ['TVM_TRACKER_PORT']) - tracker = rpc.connect_tracker(host, port) + tracker = _rpc.connect_tracker(host, port) remote = tracker.request(device_key, priority=priority, session_timeout=timeout) return remote @@ -113,7 +113,7 @@ def create_measure_batch(task, option): measure_func = option['measure_func'] number, repeat = option['number'], option['repeat'] - timeout, parallel_num, do_fork = option['timeout'], option['parallel_num'], option['do_fork'] + timeout, n_parallel, do_fork = option['timeout'], option['n_parallel'], option['do_fork'] build_func = option['build_func'] check_correctness = option['check_correctness'] replay_db = option['replay_db'] @@ -134,7 +134,7 @@ def 
create_measure_batch(task, option): use_popen=True, silent=True, tracker_addr=(tracker.host, tracker.port)) - measure_func = use_rpc(device_key, tracker.host, tracker.port) + measure_func = rpc(device_key, tracker.host, tracker.port) attach_objects = (server, tracker) build_kwargs = {} @@ -218,18 +218,18 @@ def measure_batch(measure_inputs): return partial_results return results - measure_batch.parallel_num = parallel_num + measure_batch.n_parallel = n_parallel # attach server and tracker object to avoid them of being garbage-collected measure_batch.attach_objects = attach_objects return measure_batch -def use_rpc(key, - host=None, - port=None, - priority=1, - session_timeout=60, - pack_size=1): +def rpc(key, + host=None, + port=None, + priority=1, + session_timeout=60, + pack_size=1): """ Create a standard measure_func which uses RPC Tracker for measurement. This measure_func will request a device from the RPC Tracker and diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index 5d1fc1507e58..91004cba4603 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -85,7 +85,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): every measurement pair. See autotvm/tuner/callback.py for some examples. 
""" measure_batch = create_measure_batch(self.task, measure_option) - parallel_num = getattr(measure_batch, 'parallel_num', 1) + n_parallel = getattr(measure_batch, 'n_parallel', 1) early_stopping = early_stopping or 1e9 old_level = logger.level @@ -95,7 +95,7 @@ def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): if not self.has_next(): break - configs = self.next_batch(min(parallel_num, n_trial - i)) + configs = self.next_batch(min(n_parallel, n_trial - i)) inputs = [MeasureInput(self.task.target, self.task, config) for config in configs] results = measure_batch(inputs) diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index 179ac811ab70..375d1a9b755e 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -168,7 +168,7 @@ def conv2d_no_batching(N, H, W, CI, CO, KH, KW, stride, padding): # run 8 parallel threads for compilation measure_option = autotvm.measure_option('local', number=5, - parallel_num=8, + n_parallel=8, timeout=20) # begin tuning, log records to file `conv2d.log` diff --git a/tutorials/autotvm/tune_nnvm_arm.py b/tutorials/autotvm/tune_nnvm_arm.py index d11823f204e1..f3d1c62bdaf2 100644 --- a/tutorials/autotvm/tune_nnvm_arm.py +++ b/tutorials/autotvm/tune_nnvm_arm.py @@ -191,9 +191,9 @@ def get_network(name, batch_size): 'early_stopping': 250, 'measure_option': autotvm.measure_option( - autotvm.use_rpc(device_key, host='localhost', port=9190), + autotvm.measure.rpc(device_key, host='localhost', port=9190), number=4, - parallel_num=1, + n_parallel=1, timeout=10, build_func='ndk' if use_android else 'default', ), @@ -205,7 +205,7 @@ def get_network(name, batch_size): # # In general, the default value provided here works well. It is the same # value that we used to generate pre-tuned parameters. 
-# If you have multiple devices, you can set :code:`parallel_num` to +# If you have multiple devices, you can set :code:`n_parallel` to # the number of devices you have. (e.g. set it to 3 if you register 3 rk3399 # boards to the tracker). # If you have large time budget, you can set :code:`n_trial`, :code:`early_stopping` larger, From b0368e338f7e1df63dfc5a697c8c9694e97d10a0 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Fri, 10 Aug 2018 18:01:55 -0400 Subject: [PATCH 14/77] update dmlc-core for security reason (#1584) --- dmlc-core | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dmlc-core b/dmlc-core index e864aa6757cd..4f0564ec7694 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit e864aa6757cdbe78b1296fe5231fd3050b7802c3 +Subproject commit 4f0564ec769477c66d480dd966088f172050c874 From f52255b9b50457e8a6b747a6a2352d3b0d444236 Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 10 Aug 2018 15:02:10 -0700 Subject: [PATCH 15/77] DLPack Conversion API (#1573) --- include/tvm/runtime/c_runtime_api.h | 26 ++++++++++++ include/tvm/runtime/ndarray.h | 2 +- python/tvm/_ffi/ndarray.py | 62 ++++++++++++++++++++++++++++- python/tvm/contrib/dlpack.py | 43 ++++++++++++++++++++ python/tvm/ndarray.py | 2 +- src/runtime/ndarray.cc | 36 +++++++++++++---- tests/python/contrib/test_dlpack.py | 44 ++++++++++++++++++++ 7 files changed, 205 insertions(+), 10 deletions(-) create mode 100644 python/tvm/contrib/dlpack.py create mode 100644 tests/python/contrib/test_dlpack.py diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 17d00bf479aa..dca0d5ed4a30 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -445,6 +445,32 @@ TVM_DLL int TVMArrayCopyFromTo(TVMArrayHandle from, TVMArrayHandle to, TVMStreamHandle stream); +/*! + * \brief Produce an array from the DLManagedTensor that shares data memory + * with the DLManagedTensor. + * \param from The source DLManagedTensor. 
+ * \param out The output array handle. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMArrayFromDLPack(DLManagedTensor* from, + TVMArrayHandle* out); + +/*! + * \brief Produce a DLManagedTensor from the array that shares data memory with + * the array. + * \param from The source array. + * \param out The DLManagedTensor handle. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from, + DLManagedTensor** out); + +/*! + * \brief Delete (free) a DLManagedTensor's data. + * \param dltensor Pointer to the DLManagedTensor. + */ +TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor); + /*! * \brief Create a new runtime stream. * diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 2b51b2e0fcfe..d3ecce8ba9d0 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -155,7 +155,7 @@ class NDArray { * that is DLPack compatible. * * The memory is retained until the NDArray went out of scope. - * + * \param tensor The DLPack tensor to copy from. * \return The created NDArray view.
*/ TVM_DLL static NDArray FromDLPack(DLManagedTensor* tensor); diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index 3788c07ac440..d994d7c2e4a5 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -5,7 +5,7 @@ import sys import ctypes import numpy as np -from .base import _LIB, check_call, c_array, string_types, _FFI_MODE +from .base import _LIB, check_call, c_array, string_types, _FFI_MODE, c_str from .runtime_ctypes import TVMType, TVMContext, TVMArray, TVMArrayHandle from .runtime_ctypes import TypeCode, tvm_shape_index_t @@ -28,6 +28,17 @@ from ._ctypes.ndarray import NDArrayBase as _NDArrayBase +TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) +_c_str_dltensor = c_str('dltensor') + + +# used for PyCapsule manipulation +if hasattr(ctypes, 'pythonapi'): + ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + + def context(dev_type, dev_id=0): """Construct a TVM context with given device type and id. @@ -62,6 +73,7 @@ def context(dev_type, dev_id=0): dev_type = TVMContext.STR2MASK[dev_type] return TVMContext(dev_type, dev_id) + def numpyasarray(np_data): """Return a TVMArray representation of a numpy array. """ @@ -112,6 +124,42 @@ def empty(shape, dtype="float32", ctx=context(1, 0)): ctypes.byref(handle))) return _make_array(handle, False) + +def from_dlpack(dltensor): + """Produce an array from a DLPack tensor without memory copy. + Retrieves the underlying DLPack tensor's pointer to create an array from the + data. Removes the original DLPack tensor's destructor as now the array is + responsible for destruction. + + Parameters + ---------- + dltensor : DLPack tensor + + Returns + ------- + arr: tvm.nd.NDArray + The array view of the tensor data.
+ """ + dltensor = ctypes.py_object(dltensor) + name = ctypes.pythonapi.PyCapsule_GetName(dltensor) + ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name) + handle = TVMArrayHandle() + check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None) + return _make_array(handle, False) + + +def _dlpack_deleter(pycapsule): + pycapsule = ctypes.py_object(pycapsule) + if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) + _LIB.TVMDLManagedTensorCallDeleter(ptr) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + + +_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + + class NDArrayBase(_NDArrayBase): """A simple Device/CPU Array object in runtime.""" @property @@ -260,6 +308,18 @@ def copyto(self, target): raise ValueError("Unsupported target type %s" % str(type(target))) return target + def to_dlpack(self): + """Produce a DLPack tensor from the array without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + handle = ctypes.c_void_p() + check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) + return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) + + def free_extension_handle(handle, type_code): """Free c++ extension type handle diff --git a/python/tvm/contrib/dlpack.py b/python/tvm/contrib/dlpack.py new file mode 100644 index 000000000000..11db29f98b3e --- /dev/null +++ b/python/tvm/contrib/dlpack.py @@ -0,0 +1,43 @@ +"""Wrapping functions to bridge frameworks with DLPack support to TVM""" +from ..
import ndarray + +def convert_func(tvm_func, tensor_type, to_dlpack_func): + """Convert a tvm function into one that accepts a tensor from another + framework, provided the other framework supports DLPACK + + Parameters + ---------- + tvm_func: Function + Built tvm function operating on arrays + + tensor_type: Type + Type of the tensors of the target framework + + to_dlpack_func: Function + Function to convert the source tensors to DLPACK + """ + assert callable(tvm_func) + + def _wrapper(*args): + args = tuple(ndarray.from_dlpack(to_dlpack_func(arg))\ + if isinstance(arg, tensor_type) else arg for arg in args) + return tvm_func(*args) + + return _wrapper + +def to_pytorch_func(tvm_func): + """Convert a tvm function into one that accepts PyTorch tensors + + Parameters + ---------- + tvm_func: Function + Built tvm function operating on arrays + + Returns + ------- + wrapped_func: Function + Wrapped tvm function that operates on PyTorch tensors + """ + import torch + import torch.utils.dlpack + return convert_func(tvm_func, torch.Tensor, torch.utils.dlpack.to_dlpack) diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index 18e958973d94..448e5f6d8bdb 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -8,7 +8,7 @@ import numpy as _np from ._ffi.ndarray import TVMContext, TVMType, NDArrayBase -from ._ffi.ndarray import context, empty +from ._ffi.ndarray import context, empty, from_dlpack from ._ffi.ndarray import _set_class_ndarray from ._ffi.ndarray import register_extension, free_extension_handle diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index f862f32f6e99..424a2b09cb15 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -93,6 +93,16 @@ struct NDArray::Internal { arr.data_ = nullptr; return tensor; } + // Container to DLManagedTensor + static DLManagedTensor* ToDLPack(NDArray::Container* from) { + CHECK(from != nullptr); + DLManagedTensor* ret = new DLManagedTensor(); + ret->dl_tensor = from->dl_tensor; + 
ret->manager_ctx = from; + from->IncRef(); + ret->deleter = NDArrayDLPackDeleter; + return ret; + } }; NDArray NDArray::CreateView(std::vector shape, @@ -115,13 +125,7 @@ NDArray NDArray::CreateView(std::vector shape, } DLManagedTensor* NDArray::ToDLPack() const { - CHECK(data_ != nullptr); - DLManagedTensor* ret = new DLManagedTensor(); - ret->dl_tensor = data_->dl_tensor; - ret->manager_ctx = const_cast(this); - data_->IncRef(); - ret->deleter = NDArrayDLPackDeleter; - return ret; + return Internal::ToDLPack(data_); } NDArray NDArray::Empty(std::vector shape, @@ -213,6 +217,24 @@ int TVMArrayCopyFromTo(TVMArrayHandle from, API_END(); } +int TVMArrayFromDLPack(DLManagedTensor* from, + TVMArrayHandle* out) { + API_BEGIN(); + *out = NDArray::Internal::MoveAsDLTensor(NDArray::FromDLPack(from)); + API_END(); +} + +int TVMArrayToDLPack(TVMArrayHandle from, + DLManagedTensor** out) { + API_BEGIN(); + *out = NDArray::Internal::ToDLPack(reinterpret_cast(from)); + API_END(); +} + +void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) { + (*(dltensor->deleter))(dltensor); +} + int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py new file mode 100644 index 000000000000..9a8ea34e69d5 --- /dev/null +++ b/tests/python/contrib/test_dlpack.py @@ -0,0 +1,44 @@ +import tvm +import numpy as np +from tvm.contrib.dlpack import to_pytorch_func + +def test(): + a = np.random.randn(1337) + tvm_a = tvm.nd.array(a) + np.testing.assert_equal(tvm.nd.from_dlpack(tvm_a.to_dlpack()).asnumpy(), a) + + try: + import torch + import torch.utils.dlpack + + x = torch.rand(56, 56) + tvm_x = tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(x)) + np.testing.assert_equal(x.numpy(), tvm_x.asnumpy()) + y = tvm.nd.from_dlpack(tvm_x.to_dlpack()) + np.testing.assert_equal(y.asnumpy(), tvm_x.asnumpy()) + np.testing.assert_equal(torch.utils.dlpack.from_dlpack(y.to_dlpack()).numpy(), 
tvm_x.asnumpy()) + + n = tvm.convert(137) + xx = torch.rand(137,137) + yy = torch.rand(137,137) + zz2 = torch.empty(137,137) + zz = xx.mm(yy) + XX = tvm.placeholder((n,n), name='X') + YY = tvm.placeholder((n,n), name='Y') + + k = tvm.reduce_axis((0, n), name='k') + ZZ = tvm.compute((n,n), lambda i,j : tvm.sum(XX[i,k]*YY[k,j], axis=k)) + s = tvm.create_schedule(ZZ.op) + f = tvm.build(s, [XX, YY, ZZ], target_host='llvm', name='f') + + f_pytorch = to_pytorch_func(f) + zz2 = torch.empty(137,137) + f_pytorch(xx, yy, zz2) + np.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6) + + except ImportError: + pass + + +if __name__ == '__main__': + test() From edda6cc1106798d42eeeac0a56bab3853bba413e Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 10 Aug 2018 19:04:46 -0700 Subject: [PATCH 16/77] use phone EditText for numerical fields (#1587) --- apps/android_rpc/app/src/main/res/layout/content_main.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml index 0f2564833ecd..82be44d98451 100644 --- a/apps/android_rpc/app/src/main/res/layout/content_main.xml +++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml @@ -20,6 +20,7 @@ android:hint="@string/input_address" android:layout_width="wrap_content" android:layout_height="wrap_content" + android:inputType="phone" android:background="@android:drawable/editbox_background"/> @@ -37,6 +38,7 @@ android:minWidth="100dip" android:layout_width="wrap_content" android:layout_height="wrap_content" + android:inputType="phone" android:background="@android:drawable/editbox_background"/> From ec3f09b30ec1c2203a119b5fccfeb48166fcc7b3 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Aug 2018 09:15:05 -0700 Subject: [PATCH 17/77] [RUNTIME] Refactor to enable stackvm in runtime. 
(#1588) --- CMakeLists.txt | 12 +- Jenkinsfile | 1 + cmake/config.cmake | 3 + include/tvm/ir.h | 20 +-- include/tvm/runtime/util.h | 28 +++- python/tvm/module.py | 9 +- src/codegen/codegen.cc | 1 - src/codegen/stack_vm/stack_vm_module.cc | 71 ---------- .../codegen_stackvm.cc} | 23 +++- .../codegen_stackvm.h} | 10 +- .../stackvm/stackvm.cc} | 71 +++++++--- .../stack_vm.h => runtime/stackvm/stackvm.h} | 55 +++++--- src/runtime/stackvm/stackvm_module.cc | 128 ++++++++++++++++++ src/runtime/stackvm/stackvm_module.h | 27 ++++ tests/python/unittest/test_module_load.py | 22 ++- 15 files changed, 337 insertions(+), 144 deletions(-) delete mode 100644 src/codegen/stack_vm/stack_vm_module.cc rename src/codegen/{stack_vm/codegen_stack_vm.cc => stackvm/codegen_stackvm.cc} (95%) rename src/codegen/{stack_vm/codegen_stack_vm.h => stackvm/codegen_stackvm.h} (95%) rename src/{codegen/stack_vm/stack_vm.cc => runtime/stackvm/stackvm.cc} (90%) rename src/{codegen/stack_vm/stack_vm.h => runtime/stackvm/stackvm.h} (89%) create mode 100644 src/runtime/stackvm/stackvm_module.cc create mode 100644 src/runtime/stackvm/stackvm_module.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 39776d53d1f1..572f4aef1432 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,7 @@ tvm_option(USE_ROCM "Build with ROCM" OFF) tvm_option(ROCM_PATH "The path to rocm" /opt/rocm) tvm_option(USE_RPC "Build with RPC" ON) tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF) +tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF) tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON) tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) @@ -97,7 +98,6 @@ file(GLOB COMPILER_SRCS src/arithmetic/*.cc src/autotvm/*.cc src/codegen/*.cc - src/codegen/stack_vm/*.cc src/lang/*.cc src/pass/*.cc src/op/*.cc @@ -135,6 +135,16 @@ if(USE_RPC) list(APPEND RUNTIME_SRCS ${RUNTIME_RPC_SRCS}) 
endif(USE_RPC) +file(GLOB STACKVM_RUNTIME_SRCS src/runtime/stackvm/*.cc) +file(GLOB STACKVM_CODEGEN_SRCS src/codegen/stackvm/*.cc) +list(APPEND COMPILER_SRCS ${STACKVM_CODEGEN_SRCS}) +if(USE_STACKVM_RUNTIME) + message(STATUS "Build with stackvm support in runtime...") + list(APPEND RUNTIME_SRCS ${STACKVM_RUNTIME_SRCS}) +else() + list(APPEND COMPILER_SRCS ${STACKVM_RUNTIME_SRCS}) +endif(USE_STACKVM_RUNTIME) + if(USE_GRAPH_RUNTIME) message(STATUS "Build with Graph runtime support...") file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc) diff --git a/Jenkinsfile b/Jenkinsfile index bec0d2be5df8..2ecf3c59f8aa 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -96,6 +96,7 @@ stage('Build') { echo set\\(USE_RPC ON\\) >> config.cmake echo set\\(USE_SORT ON\\) >> config.cmake echo set\\(USE_GRAPH_RUNTIME ON\\) >> config.cmake + echo set\\(USE_STACKVM_RUNTIME ON\\) >> config.cmake echo set\\(USE_BLAS openblas\\) >> config.cmake echo set\\(CMAKE_CXX_COMPILER g++\\) >> config.cmake echo set\\(CMAKE_CXX_FLAGS -Werror\\) >> config.cmake diff --git a/cmake/config.cmake b/cmake/config.cmake index 85c5102169a9..c364a88cce11 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -65,6 +65,9 @@ set(USE_OPENGL OFF) # Whether enable RPC runtime set(USE_RPC ON) +# Whether embed stackvm into the runtime +set(USE_STACKVM_RUNTIME OFF) + # Whether enable tiny embedded graph runtime. set(USE_GRAPH_RUNTIME ON) diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 9ea16131188d..646824332902 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -12,6 +12,7 @@ #include #include "./base.h" #include "./expr.h" +#include "./runtime/util.h" namespace tvm { namespace ir { @@ -449,25 +450,6 @@ constexpr const char* tvm_global_barrier_kinit = "tvm_global_barrier_kinit"; */ constexpr const char* tvm_thread_allreduce = "tvm_thread_allreduce"; -/*! 
\brief The kind of structure field info */ -enum TVMStructFieldKind : int { - // array head address - kArrAddr, - kArrData, - kArrShape, - kArrStrides, - kArrNDim, - kArrTypeCode, - kArrTypeBits, - kArrTypeLanes, - kArrByteOffset, - kArrDeviceId, - kArrDeviceType, - kArrKindBound_, - // TVMValue field - kTVMValueContent, - kTVMValueKindBound_ -}; } // namespace intrinsic // Reuse IR node defintiion from HalideIR diff --git a/include/tvm/runtime/util.h b/include/tvm/runtime/util.h index 160642ffcc85..7fa62be912be 100644 --- a/include/tvm/runtime/util.h +++ b/include/tvm/runtime/util.h @@ -21,7 +21,33 @@ namespace runtime { inline bool TypeMatch(TVMType t, int code, int bits, int lanes = 1) { return t.code == code && t.bits == bits && t.lanes == lanes; } - } // namespace runtime } // namespace tvm +// Forward declare the intrinsic id we need +// in structure fetch to enable stackvm in runtime +namespace tvm { +namespace ir { +namespace intrinsic { +/*! \brief The kind of structure field info used in intrinsic */ +enum TVMStructFieldKind : int { + // array head address + kArrAddr, + kArrData, + kArrShape, + kArrStrides, + kArrNDim, + kArrTypeCode, + kArrTypeBits, + kArrTypeLanes, + kArrByteOffset, + kArrDeviceId, + kArrDeviceType, + kArrKindBound_, + // TVMValue field + kTVMValueContent, + kTVMValueKindBound_ +}; +} // namespace intrinsic +} // namespace ir +} // namespace tvm #endif // TVM_RUNTIME_UTIL_H_ diff --git a/python/tvm/module.py b/python/tvm/module.py index 1b83c9b26243..6cca6fb0f722 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -90,9 +90,12 @@ def export_library(self, kwargs : dict, optiona; Additional arguments passed to fcompile """ - if self.type_key == "stacktvm": - raise ValueError("Module[%s]: export_library requires llvm module," - " did you build with LLVM enabled?" % self.type_key) + if self.type_key == "stackvm": + if not file_name.endswith(".stackvm"): + raise ValueError("Module[%s]: can only be saved as stackvm format." 
+ "did you build with LLVM enabled?" % self.type_key) + self.save(file_name) + return if self.type_key != "llvm": raise ValueError("Module[%s]: Only llvm support export shared" % self.type_key) diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc index 8bc7d238a866..12570e5881a9 100644 --- a/src/codegen/codegen.cc +++ b/src/codegen/codegen.cc @@ -40,7 +40,6 @@ std::string PackImportsToC(const runtime::Module& mod, bool system_lib) { CHECK_EQ(im->imports().size(), 0U) << "Only support simply one-level hierarchy"; std::string tkey = im->type_key(); - std::string bin; stream->Write(tkey); im->SaveToBinary(stream); } diff --git a/src/codegen/stack_vm/stack_vm_module.cc b/src/codegen/stack_vm/stack_vm_module.cc deleted file mode 100644 index 731663deb448..000000000000 --- a/src/codegen/stack_vm/stack_vm_module.cc +++ /dev/null @@ -1,71 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file stack_vm_module.cc - */ -#include -#include -#include -#include "./codegen_stack_vm.h" - -namespace tvm { -namespace codegen { - -class StackVMModuleNode : public runtime::ModuleNode { - public: - const char* type_key() const { - return "stackvm"; - } - - PackedFunc GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) final { - if (name == runtime::symbol::tvm_module_main) { - return GetFunction(entry_func_, sptr_to_self); - } - auto it = fmap_.find(name); - if (it == fmap_.end()) return PackedFunc(); - const StackVM& vm = it->second; - // capture sptr_to_self to keep module node alive. 
- return PackedFunc([vm, sptr_to_self](TVMArgs args, TVMRetValue* rv) { - vm(args); - }); - } - - std::string GetSource(const std::string& format) final { - std::ostringstream os; - for (const auto& kv : fmap_) { - os << "Function: " << kv.first << '\n'; - os << kv.second; - } - return os.str(); - } - - static runtime::Module Build(const Array& funcs) { - CHECK_NE(funcs.size(), 0U); - std::shared_ptr n = - std::make_shared(); - for (LoweredFunc f : funcs) { - StackVM vm = codegen::CodeGenStackVM().Compile(f); - CHECK(!n->fmap_.count(f->name)) - << "Function name " << f->name << "already exist in list"; - vm.mod_ctx = n.get(); - n->fmap_[f->name] = std::move(vm); - } - n->entry_func_ = funcs[0]->name; - return runtime::Module(n); - } - - private: - // entry function. - std::string entry_func_; - // internal function map - std::unordered_map fmap_; -}; - -TVM_REGISTER_API("codegen.build_stackvm") -.set_body([](TVMArgs args, TVMRetValue* rv) { - *rv = StackVMModuleNode::Build(args[0]); - }); - -} // namespace codegen -} // namespace tvm diff --git a/src/codegen/stack_vm/codegen_stack_vm.cc b/src/codegen/stackvm/codegen_stackvm.cc similarity index 95% rename from src/codegen/stack_vm/codegen_stack_vm.cc rename to src/codegen/stackvm/codegen_stackvm.cc index 168e411fa6e2..517793ff14a3 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.cc +++ b/src/codegen/stackvm/codegen_stackvm.cc @@ -1,11 +1,12 @@ /*! 
* Copyright (c) 2017 by Contributors - * \file codegen_stack_vm.cc + * \file codegen_stackvm.cc */ #include #include #include -#include "./codegen_stack_vm.h" +#include "./codegen_stackvm.h" +#include "../../runtime/stackvm/stackvm_module.h" namespace tvm { namespace codegen { @@ -19,6 +20,7 @@ StackVM CodeGenStackVM::Compile(LoweredFunc f) { CHECK_EQ(static_cast(vid), i); } this->Push(f->body); + vm_.InitCache(); return std::move(vm_); } @@ -486,5 +488,22 @@ void CodeGenStackVM::VisitExpr_(const Let *op) { this->PushOp(StackVM::STORE_HEAP, static_cast(vid)); this->Push(op->body); } + +runtime::Module BuildStackVM(const Array& funcs) { + CHECK_NE(funcs.size(), 0U); + std::unordered_map fmap; + for (LoweredFunc f : funcs) { + StackVM vm = codegen::CodeGenStackVM().Compile(f); + CHECK(!fmap.count(f->name)) + << "Function name " << f->name << "already exist in list"; + fmap[f->name] = std::move(vm); + } + return runtime::StackVMModuleCreate(fmap, funcs[0]->name); +} + +TVM_REGISTER_API("codegen.build_stackvm") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = BuildStackVM(args[0]); + }); } // namespace codegen } // namespace tvm diff --git a/src/codegen/stack_vm/codegen_stack_vm.h b/src/codegen/stackvm/codegen_stackvm.h similarity index 95% rename from src/codegen/stack_vm/codegen_stack_vm.h rename to src/codegen/stackvm/codegen_stackvm.h index 089284529242..23bd61dcb4c2 100644 --- a/src/codegen/stack_vm/codegen_stack_vm.h +++ b/src/codegen/stackvm/codegen_stackvm.h @@ -3,8 +3,8 @@ * \file codegen_stack_vm.h * \brief Codegen into Simple Stack VM. */ -#ifndef TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ -#define TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ +#ifndef TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ +#define TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ #include #include @@ -14,12 +14,14 @@ #include #include -#include "./stack_vm.h" +#include "../../runtime/stackvm/stackvm.h" namespace tvm { namespace codegen { using namespace ir; +using runtime::StackVM; + /*! 
* \brief A base class to generate a stack VM. * This module is used to generate host wrapper @@ -145,4 +147,4 @@ class CodeGenStackVM } // namespace codegen } // namespace tvm -#endif // TVM_CODEGEN_STACK_VM_CODEGEN_STACK_VM_H_ +#endif // TVM_CODEGEN_STACKVM_CODEGEN_STACKVM_H_ diff --git a/src/codegen/stack_vm/stack_vm.cc b/src/runtime/stackvm/stackvm.cc similarity index 90% rename from src/codegen/stack_vm/stack_vm.cc rename to src/runtime/stackvm/stackvm.cc index 95feeae3679e..f86bfec087e4 100644 --- a/src/codegen/stack_vm/stack_vm.cc +++ b/src/runtime/stackvm/stackvm.cc @@ -1,15 +1,16 @@ /*! * Copyright (c) 2017 by Contributors * Implementation stack VM. - * \file stack_vm.cc + * \file stackvm.cc */ #include -#include +#include #include -#include "./stack_vm.h" +#include +#include "./stackvm.h" namespace tvm { -namespace codegen { +namespace runtime { typedef dmlc::ThreadLocalStore StackVMStateStore; @@ -172,28 +173,64 @@ std::ostream& operator<<(std::ostream& os, const StackVM& vm) { // NOLINT(*) return os; } -void StackVM::operator()(const runtime::TVMArgs& args) const { +void StackVM::Run(const runtime::TVMArgs& args, + runtime::ModuleNode* mod_ctx) const { StackVM::State* s = StackVM::ThreadLocalState(); + if (s->heap.size() < heap_size) { + s->heap.resize(heap_size); + } s->sp = 0; s->pc = 0; - if (s->heap.size() < this->heap_size) { - s->heap.resize(this->heap_size); - } - + s->mod_ctx = mod_ctx; s->heap[0].v_handle = (void*)args.values; // NOLINT(*) s->heap[1].v_handle = (void*)args.type_codes; // NOLINT(*) s->heap[2].v_int64 = args.num_args; this->Run(s); } +void StackVM::InitCache() { + extern_func_cache_.clear(); + extern_func_cache_.resize( + extern_func_name.size(), PackedFunc(nullptr)); +} + +void StackVM::Save(dmlc::Stream* strm) const { + // to be endian invariant. 
+ std::vector code_copy(code.size()); + std::transform(code.begin(), code.end(), code_copy.begin(), [](Code c) { + return c.v_int; + }); + strm->Write(code_copy); + strm->Write(str_data); + strm->Write(extern_func_name); + strm->Write(heap_id_name); + strm->Write(heap_size); + strm->Write(stack_size); +} + +bool StackVM::Load(dmlc::Stream* strm) { + // to be endian invariant. + std::vector code_copy; + if (!strm->Read(&code_copy)) return false; + code.resize(code_copy.size()); + std::transform(code_copy.begin(), code_copy.end(), code.begin(), [](int v) { + Code code; code.v_int = v; return code; + }); + if (!strm->Read(&str_data)) return false; + if (!strm->Read(&extern_func_name)) return false; + if (!strm->Read(&heap_id_name)) return false; + if (!strm->Read(&heap_size)) return false; + if (!strm->Read(&stack_size)) return false; + this->InitCache(); + return true; +} + void StackVM::Run(State* s) const { int64_t sp = s->sp; int64_t pc = s->pc; int64_t alloca_sp = s->sp; std::vector& stack = s->stack; std::vector& heap = s->heap; - s->extern_func.clear(); - s->extern_func.resize(extern_func_name.size()); if (stack.size() < stack_size) { stack.resize(stack_size); } @@ -488,17 +525,19 @@ void StackVM::Run(State* s) const { } const PackedFunc& StackVM::GetExtern(State* s, int fid) const { - PackedFunc& f = s->extern_func[fid]; + CHECK_LT(static_cast(fid), extern_func_cache_.size()); + // allow race write in this, since write is idempotent + PackedFunc& f = extern_func_cache_[fid]; if (f == nullptr) { - CHECK(mod_ctx != nullptr) + CHECK(s->mod_ctx != nullptr) << "No local context is set in stackvm"; - const PackedFunc* pf = mod_ctx->GetFuncFromEnv(extern_func_name[fid]); + CHECK(s->mod_ctx != nullptr); + const PackedFunc* pf = s->mod_ctx->GetFuncFromEnv(extern_func_name[fid]); CHECK(pf != nullptr); f = *pf; - CHECK(f != nullptr); } return f; } -} // namespace codegen +} // namespace runtime } // namespace tvm diff --git a/src/codegen/stack_vm/stack_vm.h 
b/src/runtime/stackvm/stackvm.h similarity index 89% rename from src/codegen/stack_vm/stack_vm.h rename to src/runtime/stackvm/stackvm.h index 54972d39a5df..b2ce975b2c73 100644 --- a/src/codegen/stack_vm/stack_vm.h +++ b/src/runtime/stackvm/stackvm.h @@ -1,36 +1,36 @@ /*! * Copyright (c) 2016 by Contributors - * \file stack_vm.h + * \file stackvm.h * \brief A simple stack-based virtual machine. * * This can be used to interepret host side code * to setup calls into device functions * when only Runtime compilation for device is available(via NVRTC or OpenCL). */ -#ifndef TVM_CODEGEN_STACK_VM_STACK_VM_H_ -#define TVM_CODEGEN_STACK_VM_STACK_VM_H_ +#ifndef TVM_RUNTIME_STACKVM_STACKVM_H_ +#define TVM_RUNTIME_STACKVM_STACKVM_H_ #include #include #include -#include #include #include namespace tvm { -namespace codegen { +namespace runtime { using runtime::operator<<; /*! - * \brief A simple stack-based virtual machine. + * \brief A simple stack-based virtual machine program. */ class StackVM { public: /*! - * \brief Invoke the StackVM as PackedFunc + * \brief Invoke the StackVM program. * \param args The arguments to the StackVM. + * \param mod_ctx The module context used in running. */ - void operator()(const TVMArgs& args) const; + void Run(const TVMArgs& args, runtime::ModuleNode* mod_ctx) const; /*! * \brief The opcode of stack vm * \note Notation @@ -276,21 +276,25 @@ class StackVM { std::vector stack; /*! \brief The global heap space */ std::vector heap; - /*! \brief extern functions */ - std::vector extern_func; /*! \brief stack pointer */ int64_t sp{0}; /*! \brief program counter */ int64_t pc{0}; + /*! \brief The current module context of stackvm */ + runtime::ModuleNode* mod_ctx{nullptr}; }; - /*! \brief The external function entries. */ - struct ExternFuncEntry { - std::string name; - runtime::PackedFunc func; - }; - - /*! \brief execute the stack vm with given state */ - void Run(State* state) const; + /*! 
\brief Initialize local cache*/ + void InitCache(); + /*! + * \brief Save stackvm program to an output stream + * \param strm The output stream + */ + void Save(dmlc::Stream* strm) const; + /*! + * \brief Load stackvm program from an input stream + * \param strm The input stream + */ + bool Load(dmlc::Stream* strm); /*! * \brief Print instruction at location pc * \param os The ostream @@ -300,12 +304,11 @@ class StackVM { int64_t PrintCode(std::ostream&os, int64_t pc) const; // NOLINT(*) /*! \brief Get thread local state of the stack VM */ static State* ThreadLocalState(); + // The code below are programs /*! \brief The instructions */ std::vector code; /*! \brief constant error messages */ std::vector str_data; - /*! \brief The current module context of stackvm */ - runtime::ModuleNode* mod_ctx{nullptr}; /*! \brief Extern functions */ std::vector extern_func_name; /*! \brief name of each heap id */ @@ -385,10 +388,18 @@ class StackVM { friend std::ostream& operator<<(std::ostream& os, const StackVM& vm); // NOLINT(*) private: + // execute the stack vm with given state + void Run(State* state) const; // get extern function. const PackedFunc& GetExtern(State* s, int fid) const; + // cached extern function + mutable std::vector extern_func_cache_; }; -} // namespace codegen +} // namespace runtime } // namespace tvm -#endif // TVM_CODEGEN_STACK_VM_STACK_VM_H_ + +namespace dmlc { +DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::StackVM, true); +} +#endif // TVM_RUNTIME_STACKVM_STACKVM_H_ diff --git a/src/runtime/stackvm/stackvm_module.cc b/src/runtime/stackvm/stackvm_module.cc new file mode 100644 index 000000000000..71ca9ba6c09a --- /dev/null +++ b/src/runtime/stackvm/stackvm_module.cc @@ -0,0 +1,128 @@ +/*!
+ * Copyright (c) 2017 by Contributors + * \file stackvm_module.cc + */ +#include +#include +#include +#include "./stackvm_module.h" +#include "../file_util.h" +#include "../module_util.h" + +namespace tvm { +namespace runtime { + +class StackVMModuleNode : public runtime::ModuleNode { + public: + const char* type_key() const { + return "stackvm"; + } + + PackedFunc GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) final { + if (name == runtime::symbol::tvm_module_main) { + return GetFunction(entry_func_, sptr_to_self); + } + auto it = fmap_.find(name); + if (it == fmap_.end()) return PackedFunc(); + const StackVM& vm = it->second; + // capture sptr_to_self to keep module node alive. + return PackedFunc([vm, sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + vm.Run(args, this); + }); + } + + std::string GetSource(const std::string& format) final { + std::ostringstream os; + for (const auto& kv : fmap_) { + os << "Function: " << kv.first << '\n'; + os << kv.second; + } + return os.str(); + } + + void SaveToFile(const std::string& file_name, + const std::string& format) final { + std::string data, mblob; + dmlc::MemoryStringStream writer(&data); + dmlc::Stream* strm = &writer; + strm->Write(fmap_); + strm->Write(entry_func_); + // also save imports + uint64_t num_imports = static_cast(imports_.size()); + strm->Write(num_imports); + + for (runtime::Module im : imports_) { + CHECK_EQ(im->imports().size(), 0U) + << "Only support simply one-level hierarchy"; + std::string tkey = im->type_key(); + strm->Write(tkey); + LOG(INFO) << "save " << tkey; + im->SaveToBinary(strm); + LOG(INFO) << "FInish save " << tkey; + } + SaveBinaryToFile(file_name, data); + } + + static Module Create(std::unordered_map fmap, + std::string entry_func) { + std::shared_ptr n = + std::make_shared(); + n->fmap_ = std::move(fmap); + n->entry_func_ = std::move(entry_func); + return Module(n); + } + + static Module Load(dmlc::Stream* strm) { + std::unordered_map fmap; 
+ std::string entry_func, data; + strm->Read(&fmap); + strm->Read(&entry_func); + std::shared_ptr n = + std::make_shared(); + n->fmap_ = std::move(fmap); + n->entry_func_ = std::move(entry_func); + uint64_t num_imports; + strm->Read(&num_imports); + for (uint64_t i = 0; i < num_imports; ++i) { + std::string tkey; + CHECK(strm->Read(&tkey)); + std::string fkey = "module.loadbinary_" + tkey; + const PackedFunc* f = Registry::Get(fkey); + CHECK(f != nullptr) + << "Loader of " << tkey << "(" + << fkey << ") is not presented."; + Module m = (*f)(static_cast(strm)); + n->imports_.emplace_back(std::move(m)); + } + return Module(n); + } + + static Module LoadFromFile(std::string file_name, + std::string format) { + std::string data; + LoadBinaryFromFile(file_name, &data); + dmlc::MemoryStringStream reader(&data); + return Load(&reader); + } + + private: + // internal function map + std::unordered_map fmap_; + // entry function. + std::string entry_func_; +}; + +Module StackVMModuleCreate(std::unordered_map fmap, + std::string entry_func) { + return StackVMModuleNode::Create(fmap, entry_func); +} + +TVM_REGISTER_GLOBAL("module.loadfile_stackvm") +.set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = StackVMModuleNode::LoadFromFile(args[0], args[1]); + }); + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/stackvm/stackvm_module.h b/src/runtime/stackvm/stackvm_module.h new file mode 100644 index 000000000000..fcd51a64f870 --- /dev/null +++ b/src/runtime/stackvm/stackvm_module.h @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file stackvm_module.h + * \brief StackVM module + */ +#ifndef TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ +#define TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ + +#include +#include +#include "./stackvm.h" + +namespace tvm { +namespace runtime { +/*! + * \brief create a stackvm module + * + * \param fmap The map from name to function + * \param entry_func The entry function name. 
+ * \return The created module + */ +Module StackVMModuleCreate(std::unordered_map fmap, + std::string entry_func); + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_STACKVM_STACKVM_MODULE_H_ diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py index 1b239a357f66..8ee3ea5e06c0 100644 --- a/tests/python/unittest/test_module_load.py +++ b/tests/python/unittest/test_module_load.py @@ -109,11 +109,25 @@ def check_device(device): f2[name](a, b) np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) - check_device("cuda") - check_device("vulkan") - check_device("opencl") - check_device("metal") + def check_stackvm(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + temp = util.tempdir() + name = "myadd_%s" % device + f = tvm.build(s, [A, B], device, "stackvm", name=name) + path_dso = temp.relpath("dev_lib.stackvm") + #f.export_library(path_dso) + #f1 = tvm.module.load(path_dso) + a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), ctx) + f(a, b) + np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1) + for device in ["cuda", "vulkan", "opencl", "metal"]: + check_device(device) + check_stackvm(device) def test_combine_module_llvm(): """Test combine multiple module into one shared lib.""" From 19cf5c66ada5ed9be027eaafae07099cfaf93164 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 11 Aug 2018 15:31:15 -0700 Subject: [PATCH 18/77] [DLPACK] Enable cython support (#1589) --- HalideIR | 2 +- include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/_ctypes/ndarray.py | 49 ++++++++++++++++++++++++++++- python/tvm/_ffi/_cython/base.pxi | 13 +++++++- python/tvm/_ffi/_cython/ndarray.pxi | 38 ++++++++++++++++++++++ python/tvm/_ffi/ndarray.py | 48 +++------------------------- tests/scripts/task_python_nnvm.sh | 4 +++ tests/scripts/task_python_topi.sh | 4 +++ 8 files changed, 113 
insertions(+), 47 deletions(-) diff --git a/HalideIR b/HalideIR index a5a80bdc8232..a0b9563f4571 160000 --- a/HalideIR +++ b/HalideIR @@ -1 +1 @@ -Subproject commit a5a80bdc8232c9dbfe508bb5c46e8f58cdf7ec20 +Subproject commit a0b9563f45719553adf4d39fe3c14db1af0e1f40 diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index dca0d5ed4a30..32d574340052 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -467,7 +467,7 @@ TVM_DLL int TVMArrayToDLPack(TVMArrayHandle from, /*! * \brief Delete (free) a DLManagedTensor's data. - * \param dltensor Pointer to the DLManagedTensor. + * \param dltensor Pointer to the DLManagedTensor. */ TVM_DLL void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor); diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index df877679fc7d..8b88e7dc98ea 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -1,11 +1,47 @@ +# pylint: disable=invalid-name """Runtime NDArray api""" from __future__ import absolute_import import ctypes -from ..base import _LIB, check_call +from ..base import _LIB, check_call, c_str from ..runtime_ctypes import TVMArrayHandle from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle + +TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) +_c_str_dltensor = c_str('dltensor') +_c_str_used_dltensor = c_str('used_dltensor') + + +# used for PyCapsule manipulation +if hasattr(ctypes, 'pythonapi'): + ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object + + +def _from_dlpack(dltensor): + dltensor = ctypes.py_object(dltensor) + if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + handle = TVMArrayHandle() + 
check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) + ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + return _make_array(handle, False) + raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once") + + +def _dlpack_deleter(pycapsule): + pycapsule = ctypes.cast(pycapsule, ctypes.py_object) + if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): + ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) + _LIB.TVMDLManagedTensorCallDeleter(ptr) + ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) + +_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + + class NDArrayBase(object): """A simple Device/CPU Array object in runtime.""" __slots__ = ["handle", "is_view"] @@ -29,6 +65,17 @@ def __del__(self): def _tvm_handle(self): return ctypes.cast(self.handle, ctypes.c_void_p).value + def to_dlpack(self): + """Produce an array from a DLPack Tensor without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + handle = ctypes.c_void_p() + check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) + return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) + def _make_array(handle, is_view): handle = ctypes.cast(handle, TVMArrayHandle) diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index 50a99245f793..00173c431bb7 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -1,6 +1,7 @@ from ..base import TVMError from libcpp.vector cimport vector from cpython.version cimport PY_MAJOR_VERSION +from cpython cimport pycapsule from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t import ctypes @@ -40,6 +41,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int64_t* strides uint64_t byte_offset + ctypedef struct DLManagedTensor: + DLTensor 
dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor* self) + ctypedef struct TVMValue: int64_t v_int64 double v_float64 @@ -49,7 +55,7 @@ cdef extern from "tvm/runtime/c_runtime_api.h": DLContext v_ctx ctypedef int64_t tvm_index_t -ctypedef void* DLTensorHandle +ctypedef DLTensor* DLTensorHandle ctypedef void* TVMStreamHandle ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle @@ -92,6 +98,11 @@ cdef extern from "tvm/runtime/c_runtime_api.h": int TVMArrayCopyFromTo(DLTensorHandle src, DLTensorHandle to, TVMStreamHandle stream) + int TVMArrayFromDLPack(DLManagedTensor* arr_from, + DLTensorHandle* out) + int TVMArrayToDLPack(DLTensorHandle arr_from, + DLManagedTensor** out) + void TVMDLManagedTensorCallDeleter(DLManagedTensor* dltensor) cdef extern from "tvm/c_dsl_api.h": int TVMNodeFree(NodeHandle handle) diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi index 44b0a544609d..0a507affec1c 100644 --- a/python/tvm/_ffi/_cython/ndarray.pxi +++ b/python/tvm/_ffi/_cython/ndarray.pxi @@ -1,5 +1,29 @@ from ..runtime_ctypes import TVMArrayHandle +cdef const char* _c_str_dltensor = "dltensor" +cdef const char* _c_str_used_dltensor = "used_dltensor" + + +cdef void _c_dlpack_deleter(object pycaps): + cdef DLManagedTensor* dltensor + if pycapsule.PyCapsule_IsValid(pycaps, _c_str_dltensor): + dltensor = pycapsule.PyCapsule_GetPointer(pycaps, _c_str_dltensor) + TVMDLManagedTensorCallDeleter(dltensor) + + +def _from_dlpack(object dltensor): + cdef DLManagedTensor* ptr + cdef DLTensorHandle chandle + if pycapsule.PyCapsule_IsValid(dltensor, _c_str_dltensor): + ptr = pycapsule.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + CALL(TVMArrayFromDLPack(ptr, &chandle)) + # set name and destructor to be empty + pycapsule.PyCapsule_SetDestructor(dltensor, NULL) + pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor) + return c_make_array(chandle, 0) + raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only 
be consumed once") + + cdef class NDArrayBase: cdef DLTensor* chandle cdef int c_is_view @@ -35,12 +59,26 @@ cdef class NDArrayBase: if self.c_is_view == 0: CALL(TVMArrayFree(self.chandle)) + def to_dlpack(self): + """Produce an array from a DLPack Tensor without copying memory + + Returns + ------- + dlpack : DLPack tensor view of the array data + """ + cdef DLManagedTensor* dltensor + if self.c_is_view != 0: + raise ValueError("to_dlpack do not work with memory views") + CALL(TVMArrayToDLPack(self.chandle, &dltensor)) + return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter) + cdef c_make_array(void* chandle, is_view): ret = _CLASS_NDARRAY(None, is_view) (ret).chandle = chandle return ret + cdef _TVM_COMPATS = () cdef _TVM_EXT_RET = {} diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index d994d7c2e4a5..e49c3b62f473 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -17,28 +17,17 @@ if _FFI_MODE == "ctypes": raise ImportError() if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array + from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._cy3.core import NDArrayBase as _NDArrayBase else: - from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array + from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._cy2.core import NDArrayBase as _NDArrayBase except IMPORT_EXCEPT: # pylint: disable=wrong-import-position - from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array + from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack from ._ctypes.ndarray import NDArrayBase as _NDArrayBase -TVMPyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p) -_c_str_dltensor = c_str('dltensor') - - -# used for PyCapsule manipulation -if hasattr(ctypes, 'pythonapi'): - ctypes.pythonapi.PyCapsule_GetName.restype = ctypes.c_char_p 
- ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p - ctypes.pythonapi.PyCapsule_New.restype = ctypes.py_object - - def context(dev_type, dev_id=0): """Construct a TVM context with given device type and id. @@ -134,30 +123,14 @@ def from_dlpack(dltensor): Parameters ---------- dltensor : DLPack tensor + Input DLManagedTensor, can only be consumed once. Returns ------- arr: tvm.nd.NDArray The array view of the tensor data. """ - dltensor = ctypes.py_object(dltensor) - name = ctypes.pythonapi.PyCapsule_GetName(dltensor) - ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, name) - handle = TVMArrayHandle() - check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) - ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, None) - return _make_array(handle, False) - - -def _dlpack_deleter(pycapsule): - pycapsule = ctypes.py_object(pycapsule) - if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): - ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) - _LIB.TVMDLManagedTensorCallDeleter(ptr) - ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) - - -_c_dlpack_deleter = TVMPyCapsuleDestructor(_dlpack_deleter) + return _from_dlpack(dltensor) class NDArrayBase(_NDArrayBase): @@ -308,17 +281,6 @@ def copyto(self, target): raise ValueError("Unsupported target type %s" % str(type(target))) return target - def to_dlpack(self): - """Produce an array from a DLPack Tensor without copying memory - - Returns - ------- - dlpack : DLPack tensor view of the array data - """ - handle = ctypes.c_void_p() - check_call(_LIB.TVMArrayToDLPack(self.handle, ctypes.byref(handle))) - return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) - def free_extension_handle(handle, type_code): """Free c++ extension type handle diff --git a/tests/scripts/task_python_nnvm.sh b/tests/scripts/task_python_nnvm.sh index 790073a2fe8b..cf6039d58416 100755 --- a/tests/scripts/task_python_nnvm.sh +++ 
b/tests/scripts/task_python_nnvm.sh @@ -4,6 +4,10 @@ export PYTHONPATH=nnvm/python:python:topi/python # to avoid openblas threading error export OMP_NUM_THREADS=1 +# Rebuild cython +make cython || exit -1 +make cython3 || exit -1 + echo "Running unittest..." python -m nose -v nnvm/tests/python/unittest || exit -1 python3 -m nose -v nnvm/tests/python/unittest || exit -1 diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh index 13a324d79b1f..6842ddaae13a 100755 --- a/tests/scripts/task_python_topi.sh +++ b/tests/scripts/task_python_topi.sh @@ -1,4 +1,8 @@ export PYTHONPATH=python:topi/python +# Rebuild cython +make cython || exit -1 +make cython3 || exit -1 + python -m nose -v topi/tests/python || exit -1 python3 -m nose -v topi/tests/python || exit -1 From 60769b77f9abe29aafabda4d5d1cd625e7c61f9f Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Mon, 13 Aug 2018 09:35:46 -0700 Subject: [PATCH 19/77] Fixed bugs for SSD sorting and multbox detection (#1578) --- topi/python/topi/cuda/nms.py | 480 ++++++++++++++++++++------ topi/python/topi/cuda/ssd/multibox.py | 225 ++++++++---- 2 files changed, 534 insertions(+), 171 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 4d4e402de5c2..361208bf1cfb 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -7,19 +7,155 @@ from topi.vision import nms -def sort_ir(data, index, output, axis, is_descend): - """Low level IR to do sorting on the GPU, same usage as tvm.contrib.sort.argsort on the CPU. +def sort_pre_ir(index, sizes_out, axis_mul_before, axis_mul_after): + """Low level IR routing subfunction 1/4 for computing segments' staring locatons. + + Parameters + ---------- + index : Buffer + Buffer of number of valid output boxes. + + sizes_out : Buffer + Output buffer of start locations of each sorting segment. + + axis_mul_before : int + The multiplication result of axis dimensions before axis. 
+ + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + p_index = ib.buffer_ptr(index) + dshape = sizes_out.shape + sizes = ib.buffer_ptr(sizes_out) + nthread_tx = max_threads + nthread_bx = dshape[0] // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + sizes[tid] = p_index[tid] + + # scan + with ib.if_scope(tid < 1): + with ib.for_range(0, axis_mul_before * axis_mul_after - 1, name="k") as k: + sizes[k + 1] += sizes[k] + body = ib.get() + return body + + +def sort_pre_ir_data(data, index, sizes_in, data_out, index_out, \ + axis, axis_mul_before, axis_mul_after): + """Low level IR routing subfunction 2/4 for flattening data and indices into segmented format. Parameters ---------- data: Buffer - 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + Buffer of output boxes with class and score. index : Buffer - Buffer of number of valid number of boxes. + Buffer of number of valid output boxes. - output : Buffer - Output buffer of indicies of sorted tensor. + sizes_in : Buffer + Buffer of start locations of each sorting segment. + + data_out : Buffer + Buffer of flattened segmented data. + + index_out : Buffer + Buffer of flattened segmented indices. + + axis : int + The axis used for sorting. + + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + Returns + ------- + stmt : Stmt + The result IR statement. 
+ """ + ib = tvm.ir_builder.create() + sizes = ib.buffer_ptr(sizes_in) + p_index = ib.buffer_ptr(index) + p_data = ib.buffer_ptr(data) + data_new = ib.buffer_ptr(data_out) + index_new = ib.buffer_ptr(index_out) + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + dshape = tvm.max(sizes_in.shape[0], p_index[0]) + nthread_tx = max_threads + nthread_bx = dshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + current_sort_num = p_index[tid] + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, current_sort_num, name="k") as k: + full_idx = base_idx + k * axis_mul_after + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + index_new[start + k] = k + data_new[start + k] = p_data[full_idx] + with ib.else_scope(): + with ib.if_scope(tid == 0): + with ib.for_range(0, p_index[0], name="k") as k: + index_new[k] = k + + body = ib.get() + return body + +def sort_oet_ir(data, index, new_data, new_index, loc, out_index, axis_mul_before, \ + axis_mul_after, axis, is_descend): + """Low level IR routing subfunction 3/4 for Odd-Even-Transposition sorting. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + index : Buffer + Buffer of number of valid output boxes. + + new_data : Buffer + Buffer of flattened segmented data. + + new_index : Buffer + Buffer of flattened segmented indices. + + loc : Buffer + Buffer of start locations of each sorting segment. + + out_index : Buffer + Output buffer of output box indexes sorted by score in a flattened segmented format. 
+ + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. axis : int The axis used for sorting. @@ -32,15 +168,197 @@ def sort_ir(data, index, output, axis, is_descend): stmt : Stmt The result IR statement. """ - max_threads = int( tvm.target.current_target(allow_none=False).max_num_threads) tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") ib = tvm.ir_builder.create() + dshape = loc.shape + fshape = data.shape[axis] * dshape[0] + temp_data = ib.allocate( + "float32", dshape, name="temp_data", scope="local") p_data = ib.buffer_ptr(data) p_index = ib.buffer_ptr(index) + data_new = ib.buffer_ptr(new_data) + index_new = ib.buffer_ptr(new_index) + index_out = ib.buffer_ptr(out_index) + sizes = ib.buffer_ptr(loc) + nthread_tx = max_threads + nthread_bx = fshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + # OddEvenTransposeSort + with ib.for_range(0, p_index[tid], name="k") as k: + with ib.for_range(0, p_index[tid] - 1, name="i") as i: + with ib.if_scope(i % 2 == k % 2): + with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) == is_descend)): + temp_data[tid] = data_new[i+start] + data_new[i+start] = data_new[i+start+1] + data_new[i+start+1] = temp_data[tid] + index_out[tid] = index_new[i+start] + index_new[i+start] = index_new[i+start+1] + index_new[i+start+1] = index_out[tid] + with ib.if_scope(tid < 1): + with ib.for_range(0, sizes[dshape[0] - 1], name="i") as i: + index_out[i] = index_new[i] + with ib.else_scope(): + with ib.for_range(0, fshape, name="k", for_type="unroll") as k: + with ib.if_scope(tvm.all(k % 2 == tid % 
2, tid < fshape)): + with ib.if_scope(k % 2 == 0): + with ib.if_scope(tvm.all(tid + 1 < fshape, (p_data[tid] < p_data[tid+1]) \ + == is_descend)): + data_new[tid] = p_data[tid+1] + index_out[tid] = index_new[tid+1] + with ib.else_scope(): + data_new[tid] = p_data[tid] + index_out[tid] = index_new[tid] + with ib.else_scope(): + with ib.if_scope(tvm.all(tid + 1 < fshape, (data_new[tid] < data_new[tid+1]) \ + == is_descend)): + p_data[tid] = data_new[tid+1] + index_new[tid] = index_out[tid+1] + with ib.else_scope(): + p_data[tid] = data_new[tid] + index_new[tid] = index_out[tid] + with ib.if_scope(tvm.all(k % 2 != tid % 2, tid < fshape)): + with ib.if_scope(k % 2 == 0): + with ib.if_scope(tvm.all(tid > 0, (p_data[tid-1] < p_data[tid]) == is_descend)): + data_new[tid] = p_data[tid-1] + index_out[tid] = index_new[tid-1] + with ib.else_scope(): + data_new[tid] = p_data[tid] + index_out[tid] = index_new[tid] + with ib.else_scope(): + with ib.if_scope(tvm.all(tid > 0, (data_new[tid-1] < data_new[tid]) \ + == is_descend)): + p_data[tid] = data_new[tid-1] + index_new[tid] = index_out[tid-1] + with ib.else_scope(): + p_data[tid] = data_new[tid] + index_new[tid] = index_out[tid] + with ib.if_scope(fshape % 2 == 1): + with ib.if_scope(tid < 1): + with ib.for_range(0, fshape, name="k") as k: + index_out[tid] = index_new[tid] + body = ib.get() + return body + + +def sort_ir_out(data, index, new_index, loc, output, axis_mul_before, axis_mul_after, axis): + """Low level IR routing subfunction 4/4 for writing sorted indices to output format. + + Parameters + ---------- + data: Buffer + Buffer of output boxes with class and score. + + index : Buffer + Buffer of number of valid output boxes. + + new_index : Buffer + Buffer of sorted indices in a flatten format. + + loc : Buffer + Buffer of start locations of each sorting segment. + + output : Buffer + Output buffer of output box indexes sorted by score. 
+ + axis_mul_before : int + The multiplication result of axis dimensions before axis. + + axis_mul_after : int + The multiplication result of axis dimensions after axis. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + ib = tvm.ir_builder.create() + dshape = tvm.max(loc.shape[0], data.shape[axis]) + p_index = ib.buffer_ptr(index) + index_new = ib.buffer_ptr(new_index) + sizes = ib.buffer_ptr(loc) p_out = ib.buffer_ptr(output) + nthread_tx = max_threads + nthread_bx = dshape // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + + with ib.if_scope(axis_mul_before * axis_mul_after > 1): + with ib.if_scope(tid < axis_mul_before * axis_mul_after): + i = tid / axis_mul_after + j = tid % axis_mul_after + base_idx = i * data.shape[axis] * axis_mul_after + j + with ib.for_range(0, data.shape[axis], name="k") as k: + with ib.if_scope(tid == 0): + start = 0 + with ib.else_scope(): + start = sizes[tid-1] + p_out[base_idx + k * axis_mul_after] = tvm.select( + k < p_index[tid], index_new[k+start], k) + with ib.else_scope(): + with ib.if_scope(tid < data.shape[axis]): + p_out[tid] = tvm.select(tid < p_index[0], index_new[tid], tid) + + body = ib.get() + return body + + +def sort_gpu(data, data_buf, index, index_buf, output_buf, axis, is_descend): + """Function to generate low level IR to do sorting on the GPU, use it by calling sort_gpu. + + Parameters + ---------- + data: tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + The last dimension should be in format of + [class_id, score, box_left, box_top, box_right, box_bottom]. 
+ + data_buf: Buffer + 2D Buffer of input boxes' score with shape [batch_size, num_anchors]. + + index : tvm.Tensor + 1-D tensor for valid number of boxes. + + index_buf : Buffer + Buffer of number of valid number of boxes. + + output_buf : Buffer + Output buffer of indicies of sorted tensor. + + axis : int + The axis used for sorting. + + is_descend : bool + If the sorted data is in descending order. + + Returns + ------- + out : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors]. + """ + ndim = len(data.shape) assert data.dtype == "float32", "Currently only supports input dtype to be float32" assert axis < ndim, "Axis out of boundary for input ndim %d" % ndim @@ -55,89 +373,60 @@ def sort_ir(data, index, output, axis, is_descend): elif i > axis: axis_mul_after *= data.shape[i] - dshape = 0 - for i in range(0, len(index.shape)): - dshape += index.shape[i] - dshape = tvm.select(dshape > axis_mul_before*axis_mul_after, dshape, - axis_mul_before*axis_mul_after) - - sizes_temp = ib.allocate( - "int32", dshape, name="sizes_temp", scope="global") - sizes = ib.allocate("int32", dshape, name="sizes", scope="global") - temp_index = ib.allocate("int32", dshape, name="temp_index", scope="local") - temp_data = ib.allocate("float32", dshape, name="temp_data", scope="local") - data_new = ib.allocate("float32", dshape, name="data_new", scope="global") - index_new = ib.allocate("int32", dshape, name="index_new", scope="global") - nthread_tx = max_threads - nthread_bx = dshape // max_threads + 1 - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = bx * max_threads + tx - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - sizes[tid] = p_index[tid] - sizes_temp[tid] = p_index[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.for_range(0, tvm.floor(tvm.sqrt((axis_mul_before * axis_mul_after) \ - .astype("float32"))) + 1, name="k") as k: - with ib.if_scope(tid - (tvm.const(1, "int32") 
<< k) >= 0): - with ib.if_scope(k % 2 == 0): - sizes[tid] += sizes_temp[tid - ( - tvm.const(1, "int32") << k)] - sizes_temp[tid] = sizes[tid] - with ib.else_scope(): - sizes_temp[tid] += sizes[tid - ( - tvm.const(1, "int32") << k)] - sizes[tid] = sizes_temp[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - i = tid / axis_mul_after - j = tid % axis_mul_after - current_sort_num = p_index[tid] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, current_sort_num, name="k") as k: - full_idx = base_idx + k * axis_mul_after - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - index_new[start + k] = k - data_new[start + k] = p_data[full_idx] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - # OddEvenTransposeSort - with ib.for_range(0, p_index[tid], name="k") as k: - with ib.for_range(0, p_index[tid] - 1, name="i") as i: - with ib.if_scope(i % 2 == (k & 1)): - with ib.if_scope(((data_new[i+start] < data_new[i+start+1]) ^ - is_descend) == False): - temp_data[tid] = data_new[i+start] - data_new[i+start] = data_new[i+start+1] - data_new[i+start+1] = temp_data[tid] - temp_index[tid] = index_new[i+start] - index_new[i+start] = index_new[i+start+1] - index_new[i+start+1] = temp_index[tid] - - with ib.if_scope(tid < axis_mul_before * axis_mul_after): - i = tid / axis_mul_after - j = tid % axis_mul_after - current_sort_num = p_index[tid] - base_idx = i * data.shape[axis] * axis_mul_after + j - with ib.for_range(0, data.shape[axis], name="k") as k: - with ib.if_scope(tid == 0): - start = 0 - with ib.else_scope(): - start = sizes[tid-1] - p_out[base_idx + k * axis_mul_after] = tvm.select( - k < current_sort_num, - index_new[k+start], k) - body = ib.get() - return body + dshape = axis_mul_before*axis_mul_after + fshape = data.shape[axis] * dshape + + loc_buf = api.decl_buffer(dshape, index.dtype, 
"sizes", data_alignment=8) + new_index_buf = api.decl_buffer( + fshape, index.dtype, "index_new", data_alignment=8) + out_index_buf = api.decl_buffer( + fshape, index.dtype, "index_out", data_alignment=8) + new_data_buf = api.decl_buffer( + dshape, data.dtype, "data_new", data_alignment=8) + + loc = \ + tvm.extern([(dshape,)], + [index], + lambda ins, outs: sort_pre_ir( + ins[0], outs[0], axis_mul_before, axis_mul_after), + dtype=[index.dtype], + in_buffers=index_buf, + out_buffers=[loc_buf], + tag="sorting_prepare") + + data_new, index_new = \ + tvm.extern([(dshape,), (fshape,)], + [data, index, loc], + lambda ins, outs: sort_pre_ir_data( + ins[0], ins[1], ins[2], outs[0], outs[1], axis, + axis_mul_before, axis_mul_after), + dtype=[data.dtype, index.dtype], + in_buffers=[data_buf, index_buf, loc_buf], + out_buffers=[new_data_buf, new_index_buf], + tag="sorting_data") + + index_out = \ + tvm.extern([(fshape,)], + [data, index, data_new, index_new, loc], + lambda ins, outs: sort_oet_ir( + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], + axis_mul_before, axis_mul_after, axis, is_descend), + dtype=[index.dtype], + in_buffers=[data_buf, index_buf, + new_data_buf, new_index_buf, loc_buf], + out_buffers=[out_index_buf], + tag="sorting_oet") + out = \ + tvm.extern([data.shape], + [data, index, index_out, loc], + lambda ins, outs: sort_ir_out( + ins[0], ins[1], ins[2], ins[3], outs[0], + axis_mul_before, axis_mul_after, axis), + dtype=[index.dtype], + in_buffers=[data_buf, index_buf, out_index_buf, loc_buf], + out_buffers=output_buf, + tag="sorting_output") + return out def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): @@ -333,15 +622,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk sort_tensor_buf = api.decl_buffer(score_shape, sort_tensor_dtype, "sort_tensor_buf", data_alignment=8) - sort_tensor = \ - tvm.extern(score_shape, - [score_tensor, valid_count], - lambda ins, outs: sort_ir( - ins[0], 
ins[1], outs[0], score_axis, True), - dtype=sort_tensor_dtype, - in_buffers=[score_tensor_buf, valid_count_buf], - out_buffers=sort_tensor_buf, - name="nms_sort") + sort_tensor = sort_gpu(score_tensor, score_tensor_buf, valid_count, + valid_count_buf, sort_tensor_buf, score_axis, True) out = \ tvm.extern(data.shape, [data, sort_tensor, valid_count], diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index c22e7a513d7d..3c013c4d1605 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ b/topi/python/topi/cuda/ssd/multibox.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, too-many-function-args """SSD multibox operators""" from __future__ import absolute_import as _abs import math @@ -13,6 +13,7 @@ from topi.vision.ssd import multibox_transform_loc from ..nms import nms + def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): """Low level IR routing for multibox_prior operator. @@ -41,7 +42,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): stmt : Stmt The result IR statement. 
""" - max_threads = int(math.sqrt(tvm.target.current_target(allow_none=False).max_num_threads)) + max_threads = int(math.sqrt( + tvm.target.current_target(allow_none=False).max_num_threads)) tx = tvm.thread_axis("threadIdx.x") ty = tvm.thread_axis("threadIdx.y") bx = tvm.thread_axis("blockIdx.x") @@ -76,7 +78,8 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): for k in range(num_sizes + num_ratios - 1): w = tvm.select(k < num_sizes, - size_ratio_concat[k] * in_height / in_width / 2.0, + size_ratio_concat[ + k] * in_height / in_width / 2.0, size_ratio_concat[0] * in_height / in_width * math.sqrt(size_ratio_concat[k + 1]) / 2.0) h = tvm.select(k < num_sizes, size_ratio_concat[k] / 2.0, @@ -93,7 +96,7 @@ def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @multibox_prior.register(["cuda", "gpu"]) -def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ +def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, 0.5), clip=False): """Generate prior(anchor) boxes from data, sizes and ratios. @@ -124,31 +127,114 @@ def multibox_prior_gpu(data, sizes=(1,), ratios=(1,), steps=(-1, -1), \ """ num_sizes = len(sizes) num_ratios = len(ratios) - oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) + oshape = ( + 1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), + multibox_prior_ir( + ins[0], outs[0], sizes, ratios, steps, offsets), tag="multibox_prior") if clip: out = topi.clip(out, 0, 1) return out -def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): - """Low level IR routing for transform location in multibox_detection operator. +def transform_loc_pre(cls_prob, valid_count, temp_flag, temp_id, temp_score_out, threshold): + """Low level IR routing for transform location data preparation. 
Parameters ---------- cls_prob : Buffer Buffer of class probabilities. + valid_count : Buffer + Buffer of number of valid output boxes. + + temp_flag : Buffer + Output intermediate result buffer + + temp_id : Buffer + Output intermediate result buffer + + temp_score_out : Buffer + Output buffer + + threshold : float + Threshold to be a positive prediction. + + Returns + ------- + stmt : Stmt + The result IR statement. + """ + batch_size = cls_prob.shape[0] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] + + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) + ib = tvm.ir_builder.create() + score = ib.buffer_ptr(temp_score_out) + cls_id = ib.buffer_ptr(temp_id) + flag = ib.buffer_ptr(temp_flag) + tx = tvm.thread_axis("threadIdx.x") + bx = tvm.thread_axis("blockIdx.x") + nthread_tx = max_threads + nthread_bx = (batch_size * num_anchors * num_classes) // max_threads + 1 + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + p_cls_prob = ib.buffer_ptr(cls_prob) + p_valid_count = ib.buffer_ptr(valid_count) + + with ib.if_scope(tid < batch_size * num_anchors): + n = tid / num_anchors # number of batches + i = tid % num_anchors # number of anchors + score[i] = -1.0 + cls_id[i] = 0 + p_valid_count[n] = 0 + with ib.for_range(0, num_classes-1, name="k") as k: + temp = p_cls_prob[n * num_anchors * num_classes + (k + 1) * num_anchors + i] + with ib.if_scope(temp > score[i]): + cls_id[i] = k + 1 + score[i] = temp + with ib.if_scope(tvm.all(cls_id[i] > 0, score[i] < threshold)): + cls_id[i] = 0 + with ib.if_scope(cls_id[i] > 0): + flag[i] = 1 + with ib.else_scope(): + flag[i] = 0 + + with ib.if_scope(tid < batch_size): + with ib.for_range(0, num_anchors, name="k") as k: + with ib.if_scope(k > 0): + flag[tid * num_anchors + + k] += flag[tid * num_anchors + k - 1] + p_valid_count[n] = flag[tid * num_anchors + num_anchors - 1] + + body = ib.get() + return 
body + + +def transform_loc_ir(loc_pred, anchor, temp_flag, temp_id, temp_score_in, \ + out, clip, variances, batch_size, num_classes, num_anchors): + """Low level IR routing for transform location in multibox_detection operator. + + Parameters + ---------- loc_pred : Buffer Buffer of location regression predictions. anchor : Buffer Buffer of prior anchor boxes. - valid_count : Buffer - Buffer of number of valid output boxes. + temp_flag : Buffer + Intermediate result buffer. + + temp_id : Buffer + Intermediate result buffer. + + temp_score_in : Buffer + Input buffer which stores intermediate results. out : Buffer Output buffer. @@ -156,12 +242,18 @@ def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, thresho clip : boolean Whether to clip out-of-boundary boxes. - threshold : float - Threshold to be a positive prediction. - variances : tuple of float Variances to be decoded from box regression output. + batch_size : int + Batch size + + num_classes : int + Number of classes + + num_anchors : int + Number of anchors + Returns ------- stmt : Stmt @@ -187,21 +279,16 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, ow = tvm.exp(pw * vw) * aw / 2.0 oh = tvm.exp(ph * vh) * ah / 2.0 return tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox - ow)), ox - ow), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ - tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) - - batch_size = cls_prob.shape[0] - num_classes = cls_prob.shape[1] - num_anchors = cls_prob.shape[2] + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy - oh)), oy - oh), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, ox + ow)), ox + ow), \ + tvm.select(clip, tvm.make.Max(0, tvm.make.Min(1, oy + oh)), oy + oh) + max_threads = int( + tvm.target.current_target(allow_none=False).max_num_threads) ib = tvm.ir_builder.create() - temp_score = 
ib.allocate('float32', (batch_size * (num_classes -1) * num_anchors, \ - ), name="temp_score", scope="global") - score = ib.allocate('float32', (batch_size * num_anchors, ), name="score", scope="local") - cls_id = ib.allocate('int32', (batch_size * num_anchors, ), name="id", scope="local") - flag = ib.allocate('int32', (batch_size * num_anchors, ), name="flag", scope="global") - max_threads = int(tvm.target.current_target(allow_none=False).max_num_threads) + score = ib.buffer_ptr(temp_score_in) + cls_id = ib.buffer_ptr(temp_id) + flag = ib.buffer_ptr(temp_flag) tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("blockIdx.x") nthread_tx = max_threads @@ -209,42 +296,13 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) tid = bx * max_threads + tx - p_cls_prob = ib.buffer_ptr(cls_prob) p_loc_pred = ib.buffer_ptr(loc_pred) p_anchor = ib.buffer_ptr(anchor) - p_valid_count = ib.buffer_ptr(valid_count) p_out = ib.buffer_ptr(out) - with ib.if_scope(tid < batch_size * num_anchors * num_classes): - n = tid / (num_anchors * num_classes) - j = (tid % (num_anchors * num_classes)) / num_anchors - i = tid % num_anchors - with ib.if_scope(j > 0): - temp_score[n * num_anchors * num_classes + i * (num_classes - 1) + j-1] = \ - p_cls_prob[tid] - p_valid_count[n] = 0 - with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors - i = tid % num_anchors - score[tid] = -1.0 - cls_id[tid] = 0 - with ib.for_range(0, num_classes-1, name="k") as k: - temp = temp_score[tid * (num_classes-1) + k] - cls_id[tid] = tvm.select(temp > score[tid], k + 1, cls_id[tid]) - score[tid] = tvm.make.Max(temp, score[tid]) - with ib.if_scope(tvm.all(cls_id[tid] > 0, score[tid] < threshold)): - cls_id[tid] = 0 - with ib.if_scope(cls_id[tid] > 0): - flag[tid] = 1 - with ib.else_scope(): - flag[tid] = 0 - with ib.if_scope(tid < batch_size): - with ib.for_range(0, 
num_anchors, name="k") as k: - with ib.if_scope(k > 0): - flag[tid * num_anchors + k] += flag[tid * num_anchors + k - 1] - p_valid_count[tid] = flag[tid * num_anchors + num_anchors - 1] + with ib.if_scope(tid < batch_size * num_anchors): - n = tid / num_anchors - i = tid % num_anchors + n = tid / num_anchors # number of batches + i = tid % num_anchors # number of anchors with ib.if_scope(cls_id[tid] > 0): with ib.if_scope(tid == 0): out_base_idx = n * num_anchors * 6 @@ -253,17 +311,17 @@ def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, p_out[out_base_idx] = cls_id[tid] - 1.0 p_out[out_base_idx + 1] = score[tid] p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, p_anchor, i*4, - clip, variances[0], variances[1], - variances[2], variances[3]) + p_out[out_base_idx + 5] = transform_loc(p_loc_pred, tid * 4, + p_anchor, i*4, clip, variances[0], + variances[1], variances[2], variances[3]) body = ib.get() return body @multibox_transform_loc.register(["cuda", "gpu"]) -def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, - variances=(0.1, 0.1, 0.2, 0.2)): +def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, \ + threshold=0.01, variances=(0.1, 0.1, 0.2, 0.2)): """Location transformation for multibox detection Parameters @@ -297,20 +355,42 @@ def multibox_transform_loc_gpu(cls_prob, loc_pred, anchor, clip=True, threshold= 1-D tensor with shape (batch_size,), number of valid anchor boxes. 
""" batch_size = cls_prob.shape[0] - num_anchors = anchor.shape[1] + num_classes = cls_prob.shape[1] + num_anchors = cls_prob.shape[2] oshape = (batch_size, num_anchors, 6) # Define data alignment for intermediate buffer valid_count_dtype = "int32" valid_count_buf = api.decl_buffer((batch_size,), valid_count_dtype, "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) - valid_count, out = \ - tvm.extern([(batch_size,), oshape], - [cls_prob, loc_pred, anchor], + out_buf = api.decl_buffer( + oshape, cls_prob.dtype, "out_buf", data_alignment=8) + size = num_anchors + temp_flag_buf = api.decl_buffer( + (size,), valid_count_dtype, "flag", data_alignment=8) + temp_id_buf = api.decl_buffer( + (size,), valid_count_dtype, "cls_id", data_alignment=8) + temp_score_buf = api.decl_buffer( + (size,), cls_prob.dtype, "score", data_alignment=8) + + valid_count, temp_flag, temp_id, temp_score = \ + tvm.extern([(batch_size,), (size,), (size,), (size,)], + [cls_prob], + lambda ins, outs: transform_loc_pre( + ins[0], outs[0], outs[1], outs[2], outs[3], threshold), + dtype=[valid_count_dtype, + valid_count_dtype, valid_count_dtype, cls_prob.dtype], + out_buffers=[valid_count_buf, + temp_flag_buf, temp_id_buf, temp_score_buf], + tag="multibox_transform_loc_first_step") + + out = \ + tvm.extern([oshape], + [loc_pred, anchor, temp_flag, temp_id, temp_score], lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), - dtype=[valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, out_buf], + ins[0], ins[1], ins[2], ins[3], ins[4], outs[0], clip, \ + variances, batch_size, num_classes, num_anchors), + dtype=[cls_prob.dtype], + out_buffers=[out_buf], tag="multibox_transform_loc") return [out, valid_count] @@ -356,5 +436,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, 
anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = nms( + inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out From 81b42bc3716b80e821651d3dcd6942a653e57e9f Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 14 Aug 2018 01:43:09 +0530 Subject: [PATCH 20/77] Split_indices negative axis added (#1595) --- topi/include/topi/transform.h | 5 +++++ topi/tests/python_cpp/test_topi_transform.py | 1 + 2 files changed, 6 insertions(+) diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 09af612b957b..245b38cfb63d 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -475,6 +475,11 @@ inline Array split_sections(const Tensor& x, int axis, std::string name = "tensor", std::string tag = kInjective) { + if (axis < 0) { + axis += static_cast(x->shape.size()); + } + CHECK_LT(axis, x->shape.size()) << "axis out of bounds"; + auto src_axis_size = static_cast(GetConstInt(x->shape[axis])); CHECK_GT(num_sections, 0) << "Slice count must be > 0"; diff --git a/topi/tests/python_cpp/test_topi_transform.py b/topi/tests/python_cpp/test_topi_transform.py index c8b7c3906caa..3f7bdbfdd499 100644 --- a/topi/tests/python_cpp/test_topi_transform.py +++ b/topi/tests/python_cpp/test_topi_transform.py @@ -340,6 +340,7 @@ def test_concatenate(): def test_split(): verify_split((2, 12, 3), 3, 1) + verify_split((2, 12, 3), 3, -1) verify_split((2, 12, 3), [2, 4], 1) verify_split((10, 12, 24), [5, 7, 9], -1) From 84296f96327d190529220596e2cd2de5ff8ed39b Mon Sep 17 00:00:00 2001 From: Albin Joy Date: Tue, 14 Aug 2018 02:44:26 +0530 Subject: [PATCH 21/77] [FRONTEND][TENSORFLOW] Optimized tensorflow testcases (#1546) * [NNVM][TENSORFLOW] Optimized tensorflow testcases * Replace Constants with Placeholder * Review comment fix --- .../frontend/tensorflow/test_forward.py | 457 ++++++------------ 1 file changed, 136 insertions(+), 321 deletions(-) 
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 495852f9e5d6..64c57c126f8d 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -83,6 +83,34 @@ def run_tf_graph(sess, input_data, input_node, output_node): output_data = sess.run(tensor, input_dict) return output_data + +def compare_tf_with_tvm(in_data, in_name, out_name, init_global_variables=False): + """Generic function to generate and compare tensorflow and TVM output""" + + out_node = out_name.split(':')[0] if ":" in out_name else out_name + + if isinstance(in_name, list): + in_node = [0]*len(in_name) + for i in range(len(in_name)): + in_node[i] = in_name[i].split(':')[0] if ":" in in_name[i] else in_name[i] + else: + in_node = in_name.split(':')[0] if ":" in in_name else in_name + + with tf.Session() as sess: + if init_global_variables: + sess.run(variables.global_variables_initializer()) + final_graph_def = tf.graph_util.convert_variables_to_constants( + sess, + sess.graph.as_graph_def(add_shapes=True), + [out_node], + ) + + tf_output = run_tf_graph(sess, in_data, in_name, out_name) + tvm_output = run_tvm_graph(final_graph_def, in_data, + in_node, tf_output.shape, tf_output.dtype) + np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) + sess.close() + ####################################################################### # Pooling # ------- @@ -93,31 +121,15 @@ def _test_pooling(input_shape, **kwargs): np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1 with tf.Graph().as_default(): - in_data = constant_op.constant(x, shape=input_shape, dtype='float32') - # pylint: disable=unused-variable - pool = nn_ops.pool(in_data, **kwargs) - # pylint: enable=unused-variable + in_data = array_ops.placeholder(shape=input_shape, dtype='float32') + nn_ops.pool(in_data, **kwargs) if kwargs['pooling_type'] == 'MAX': - out_node = 'max_pool' 
out_name = 'max_pool:0' else: - out_node = 'avg_pool' out_name = 'avg_pool:0' - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - [out_node], - ) - - tf_output = run_tf_graph(sess, x, 'Const:0', out_name) - tvm_output = run_tvm_graph(graph_def, x.astype('float32'), - "Const", tf_output.shape, 'float32') - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - - sess.close() + compare_tf_with_tvm(x, 'Placeholder:0', out_name) def test_forward_pooling(): """ Pooling """ @@ -195,35 +207,19 @@ def _test_convolution(tensor_in_sizes, filter_in_sizes, filter_array = [f * 1.0 for f in range(1, total_size_2 + 1)] with tf.Graph().as_default(): - in_data = constant_op.constant(data_array, shape=tensor_in_sizes, dtype='float32') + in_data = array_ops.placeholder(shape=tensor_in_sizes, dtype='float32') in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype='float32') strides = [1] + strides + [1] dilations = [1] + dilations + [1] - # pylint: disable=unused-variable - conv = nn_ops.conv2d(in_data, - in_filter, - strides=strides, - padding=padding, - data_format=data_format) - # pylint: enable=unused-variable + nn_ops.conv2d(in_data, + in_filter, + strides=strides, + padding=padding, + data_format=data_format) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Conv2D'], - ) - - tf_output = run_tf_graph(sess, np.reshape(data_array, tensor_in_sizes), - 'Const:0', 'Conv2D:0') - tvm_output = run_tvm_graph(graph_def, - np.reshape(data_array, tensor_in_sizes).astype('float32'), - "Const", tf_output.shape, 'float32') - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - - sess.close() + compare_tf_with_tvm(np.reshape(data_array, tensor_in_sizes).astype('float32'), + 'Placeholder:0', 'Conv2D:0') def test_forward_convolution(): 
_test_convolution([4, 8, 8, 176], [1, 1, 176, 32], [1, 1], [1, 1], 'SAME', 'NHWC') @@ -239,28 +235,10 @@ def _test_reshape(data, out_shape): """ One iteration of reshape operation with given data and out shape """ with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) - - # pylint: disable=unused-variable - reshape_out = array_ops.reshape(in_data, out_shape) - # pylint: enable=unused-variable + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + array_ops.reshape(in_data, out_shape) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Reshape'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Reshape:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm(data, 'Placeholder:0', 'Reshape:0') def test_forward_reshape(): _test_reshape(np.arange(6.0), [2, 3]) @@ -279,31 +257,14 @@ def _test_squeeze(data, squeeze_dims=None): squeeze_dims = [] with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) - # pylint: disable=unused-variable if squeeze_dims: - squeeze_out = array_ops.squeeze(in_data, squeeze_dims) + array_ops.squeeze(in_data, squeeze_dims) else: - squeeze_out = array_ops.squeeze(in_data) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Squeeze'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Squeeze:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) + array_ops.squeeze(in_data) - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + 
compare_tf_with_tvm(data, 'Placeholder:0', 'Squeeze:0') def test_forward_squeeze(): """ Squeeze """ @@ -336,28 +297,10 @@ def _test_concat_v2(data, dim): """ One iteration of ConcatV2 """ with tf.Graph().as_default(): + gen_array_ops._concat_v2(data, dim) - # pylint: disable=unused-variable - concat_out = gen_array_ops._concat_v2(data, dim) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['ConcatV2'], - ) - - tf_output = run_tf_graph(sess, data, - ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], 'ConcatV2:0') - tvm_output = run_tvm_graph(graph_def, - data, - ["ConcatV2/values_0", 'ConcatV2/values_1'], - tf_output.shape, tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm(data, ['ConcatV2/values_0:0', 'ConcatV2/values_1:0'], + 'ConcatV2:0') def _test_forward_concat_v2(): t1 = np.array([]) @@ -377,28 +320,10 @@ def _test_sigmoid(data): """ One iteration of sigmoid """ with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) - - # pylint: disable=unused-variable + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) sigmoid_out = math_ops.sigmoid(in_data) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['Sigmoid'], - ) - - tf_output = run_tf_graph(sess, data, - 'Const:0', 'Sigmoid:0') - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - - sess.close() + compare_tf_with_tvm(data, 'Placeholder:0', 'Sigmoid:0') def test_forward_sigmoid(): """ Sigmoid """ @@ -412,24 +337,10 @@ def test_forward_sigmoid(): def _test_argx(func, data, **kwargs): with tf.Graph().as_default(): - inp = 
constant_op.constant(data, shape=data.shape, dtype=data.dtype, name="c0") - - # pylint: disable=unused-variable - out = func(inp, name="argx0", **kwargs) - # pylint: enable=unused-variable - - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess=sess, - input_graph_def=sess.graph.as_graph_def(add_shapes=True), - output_node_names=["argx0"]) - - tf_output = run_tf_graph(sess, data, input_node="c0:0", output_node="argx0:0") - tvm_output = run_tvm_graph(graph_def, data, "c0", tf_output.shape, output_dtype='int32') - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) + inp = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="c0") + func(inp, name="argx0", **kwargs, output_type=tf.int32) - sess.close() + compare_tf_with_tvm(data, 'c0:0', 'argx0:0') def test_argmin_argmax(): for axis in [None,0,1,2]: @@ -442,6 +353,8 @@ def test_argmin_argmax(): # -------- def _test_variable(data): + """ One iteration of a variable """ + tf.reset_default_graph() input_op = array_ops.placeholder(shape=data.shape, dtype=data.dtype) input_tensor = array_ops.reshape(input_op, data.shape) @@ -450,84 +363,15 @@ def _test_variable(data): with variable_scope.variable_scope("linear", reuse=None): w = variable_scope.get_variable( "w", shape=[size, size], dtype=input_tensor.dtype) - # pylint: disable=unused-variable - output_op = math_ops.matmul(input_tensor, w) - # pylint: enable=unused-variable - - with tf.Session() as sess: - sess.run(variables.global_variables_initializer()) - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['MatMul'], - ) - - tf_output = run_tf_graph(sess, data, 'Placeholder:0', 'MatMul:0') - tvm_output = run_tvm_graph(final_graph_def, data, - "Placeholder", tf_output.shape, data.dtype) + math_ops.matmul(input_tensor, w) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + 
compare_tf_with_tvm(data, 'Placeholder:0', 'MatMul:0', init_global_variables=True) def test_forward_variable(): """Variable type op test""" _test_variable(np.random.uniform(size=(32, 100)).astype('float32')) -####################################################################### -# LSTM -# ---- -def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype): - tf.reset_default_graph() - input_size = num_hidden - input_data = np.full((batch_size, input_size), 1., dtype=dtype) - in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - - def _get_tensorflow_output(): - with tf.Session() as sess: - with variable_scope.variable_scope( - "root", initializer=init_ops.constant_initializer(0.5)): - m0 = array_ops.zeros([batch_size, num_hidden]) - m1 = array_ops.zeros([batch_size, num_hidden]) - x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype) - g, ((out_m0, out_m1)) = \ - tf.contrib.rnn.LSTMBlockCell(num_hidden, - forget_bias=forget_bias)(x, ((m0, m1))) - sess.run([variables.global_variables_initializer()]) - res = sess.run([g, out_m0, out_m1], { - x.name: np.array([[1., 1.]]), - m0.name: 0.1 * np.ones([batch_size, num_hidden]), - m1.name: 0.1 * np.ones([batch_size, num_hidden]), - }) - graph_def = sess.graph.as_graph_def(add_shapes=True) - final_graph_def = graph_util.convert_variables_to_constants( - sess, - graph_def, - ['root/lstm_cell/LSTMBlockCell']) - return final_graph_def, res - - graph_def, tf_out = _get_tensorflow_output() - tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h], - ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c', - 'root/lstm_cell/LSTMBlockCell_h'], - [tf_out[0].shape, (2, batch_size, num_hidden)], - [tf_out[0].dtype, tf_out[1].dtype]) - - if isinstance(tvm_output, list): - out = tvm_output[0] - out_state = tvm_output[1] - out_state_tup = np.split(out_state, indices_or_sections=2, axis=0) - 
out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden)) - out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden)) - tvm_out = [out, out_state_c, out_state_h] - np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3) - -def test_forward_lstm(): - '''test LSTM block cell''' - _test_lstm_cell(1, 2, 1, 0.0, 'float32') - - ####################################################################### # StridedSlice # ------------ @@ -535,6 +379,8 @@ def test_forward_lstm(): def _test_stridedslice(ip_shape, begin, end, stride, dtype, begin_mask=0, end_mask=0, new_axis_mask=0, shrink_axis_mask=0, ellipsis_mask=0): + """ One iteration of a Stridedslice """ + tf.reset_default_graph() in_data = tf.placeholder(dtype, ip_shape, name="in_data") tf.strided_slice(in_data, begin, end, stride, begin_mask=begin_mask, @@ -543,17 +389,7 @@ def _test_stridedslice(ip_shape, begin, end, stride, dtype, ellipsis_mask=ellipsis_mask, name="strided_slice") np_data = np.random.uniform(size=ip_shape).astype(dtype) - with tf.Session() as sess: - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['strided_slice']) - tf_output = run_tf_graph(sess, np_data, - 'in_data:0', 'strided_slice:0') - tvm_output = run_tvm_graph(final_graph_def, np_data, - "in_data", tf_output.shape, np_data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + compare_tf_with_tvm(np_data, 'in_data:0', 'strided_slice:0') def test_forward_stridedslice(): '''test StridedSlice''' @@ -586,6 +422,8 @@ def test_forward_stridedslice(): # ------ def _test_gather(ip_shape, indice_shape, indice_value, axis, dtype): + """ One iteration of a Gather """ + tf.reset_default_graph() in_data = tf.placeholder(dtype, ip_shape, name="in_data") indices = tf.placeholder("int32", indice_shape, name="indices") @@ -601,17 +439,7 @@ def _fill_indices(indice_value): return indices np_indices = 
_fill_indices(indice_value) - with tf.Session() as sess: - final_graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['GatherV2']) - tf_output = run_tf_graph(sess, [np_data, np_indices], ['in_data:0', - 'indices:0'], 'GatherV2:0') - tvm_output = run_tvm_graph(final_graph_def, [np_data, np_indices], - ['in_data', 'indices'], tf_output.shape, dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-5, rtol=1e-5) - sess.close() + compare_tf_with_tvm([np_data, np_indices], ['in_data:0', 'indices:0'], 'GatherV2:0') def test_forward_gather(): '''test gather layer''' @@ -640,28 +468,11 @@ def test_forward_multi_input(): out1 = tf.add(in1, in2, name='out1') out2 = tf.subtract(in3, in4, name='out2') - out = tf.multiply(out1, out2, name='out') + in_data = np.arange(9, dtype='int32').reshape([3, 3]) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['out'], - ) - - in_data = np.arange(9, dtype='int32').reshape([3, 3]) - - tf_output = run_tf_graph(sess, [in_data, in_data, in_data, in_data ], - ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0') - tvm_output = run_tvm_graph(graph_def, - [in_data, in_data, in_data, in_data ], - ['in1', 'in2', 'in3', 'in4'], - tf_output.shape, tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output) - - sess.close() + compare_tf_with_tvm([in_data, in_data, in_data, in_data], + ['in1:0', 'in2:0', 'in3:0', 'in4:0'], 'out:0') ####################################################################### # Resize Bilinear @@ -674,36 +485,75 @@ def _test_resize_bilinear(in_shape, to_shape, align_corners): shape_data = np.array(to_shape).astype('int32') with tf.Graph().as_default(): - in_data = constant_op.constant(data, shape=data.shape, dtype=data.dtype) + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) shape_data = constant_op.constant(shape_data, 
shape=shape_data.shape, dtype=shape_data.dtype) + tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners) - # pylint: disable=unused-variable - resize_out = tf.image.resize_bilinear(in_data, shape_data, align_corners=align_corners) - # pylint: enable=unused-variable + compare_tf_with_tvm(data, 'Placeholder:0', 'ResizeBilinear:0') - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['ResizeBilinear'], - ) +def test_forward_resize_bilinear(): + """ Resize Bilinear """ - tf_output = run_tf_graph(sess, data, - 'Const:0', 'ResizeBilinear:0') + _test_resize_bilinear((4, 16, 32, 32), [50, 50], False) + _test_resize_bilinear((6, 32, 64, 64), [20, 20], True) - tvm_output = run_tvm_graph(graph_def, - data, - "Const", tf_output.shape, data.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) +####################################################################### +# LSTM +# ---- - sess.close() +def _test_lstm_cell(batch_size, num_hidden, num_layers, forget_bias, dtype): + """ One iteration of a LSTM cell """ -def test_forward_resize_bilinear(): - """ Resize Bilinear """ + tf.reset_default_graph() + input_size = num_hidden + input_data = np.full((batch_size, input_size), 1., dtype=dtype) + in_state_c = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) + in_state_h = np.full((num_layers, batch_size, num_hidden), 0.1, dtype=dtype) - _test_resize_bilinear((4, 16, 32, 32), [50, 50], False) - _test_resize_bilinear((6, 32, 64, 64), [20, 20], True) + def _get_tensorflow_output(): + with tf.Session() as sess: + with variable_scope.variable_scope( + "root", initializer=init_ops.constant_initializer(0.5)): + m0 = array_ops.zeros([batch_size, num_hidden]) + m1 = array_ops.zeros([batch_size, num_hidden]) + x=tf.placeholder(shape=(batch_size, input_size), dtype=dtype) + g, ((out_m0, out_m1)) = \ + tf.contrib.rnn.LSTMBlockCell(num_hidden, + 
forget_bias=forget_bias)(x, ((m0, m1))) + sess.run([variables.global_variables_initializer()]) + res = sess.run([g, out_m0, out_m1], { + x.name: np.array([[1., 1.]]), + m0.name: 0.1 * np.ones([batch_size, num_hidden]), + m1.name: 0.1 * np.ones([batch_size, num_hidden]), + }) + graph_def = sess.graph.as_graph_def(add_shapes=True) + final_graph_def = graph_util.convert_variables_to_constants( + sess, + graph_def, + ['root/lstm_cell/LSTMBlockCell']) + return final_graph_def, res + + graph_def, tf_out = _get_tensorflow_output() + tvm_output = run_tvm_graph(graph_def, [input_data, in_state_c, in_state_h], + ['root/Placeholder', 'root/lstm_cell/LSTMBlockCell_c', + 'root/lstm_cell/LSTMBlockCell_h'], + [tf_out[0].shape, (2, batch_size, num_hidden)], + [tf_out[0].dtype, tf_out[1].dtype]) + assert isinstance(tvm_output, list) + + out = tvm_output[0] + out_state = tvm_output[1] + out_state_tup = np.split(out_state, indices_or_sections=2, axis=0) + out_state_c = np.reshape(out_state_tup[0], (batch_size, num_hidden)) + out_state_h = np.reshape(out_state_tup[1], (batch_size, num_hidden)) + tvm_out = [out, out_state_c, out_state_h] + np.testing.assert_allclose(tf_out, tvm_out, rtol=1e-3, atol=1e-3) + +def test_forward_lstm(): + '''test LSTM block cell''' + + _test_lstm_cell(1, 2, 1, 0.0, 'float32') ####################################################################### # Pad @@ -714,30 +564,17 @@ def _test_pad(input_shape, paddings, mode, **kwargs): x = np.arange(np.prod(input_shape), dtype=np.float32).reshape(input_shape) with tf.Graph().as_default(): - in_data = constant_op.constant(x, shape=input_shape, dtype='float32') + in_data = array_ops.placeholder(shape=input_shape, dtype='float32') pad_values = constant_op.constant(paddings) pad = tf.pad(in_data, paddings=pad_values, mode=mode, **kwargs) if mode == 'CONSTANT': if 'constant_values' in kwargs: - out_node = 'PadV2' out_name = 'PadV2:0' else: - out_node = 'Pad' out_name = 'Pad:0' - with tf.Session() as sess: - graph_def = 
tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - [out_node], - ) - - tf_output = run_tf_graph(sess, x, 'Const:0', out_name) - tvm_output = run_tvm_graph(graph_def, x.astype('float32'), - "Const", tf_output.shape, 'float32') - np.testing.assert_allclose(tf_output, tvm_output) - sess.close() + compare_tf_with_tvm(x, 'Placeholder:0', out_name) def test_forward_pad(): """ Pad """ @@ -944,17 +781,7 @@ def _test_lrn(ishape, size, axis, bias, alpha, beta): alpha=alpha, beta=beta) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['lrn'],) - tf_output = run_tf_graph(sess, inp_array, 'lrn0_data:0', 'lrn:0') - tvm_output = run_tvm_graph(graph_def, - inp_array, - "lrn0_data", tf_output.shape, tf_output.dtype) - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - sess.close() + compare_tf_with_tvm(inp_array, 'lrn0_data:0', 'lrn:0') def test_forward_lrn(): _test_lrn((1, 3, 20, 20), 3, 1, 1.0, 1.0, 0.5) @@ -962,38 +789,26 @@ def test_forward_lrn(): ####################################################################### # l2_normalize # ------------ + def _test_l2_normalize(ishape, eps, axis): """ testing l2 normalize (uses max, sum, square, sqrt frontend operators)""" inp_array = np.random.uniform(size=ishape).astype(np.float32) - inp_array.fill(1) with tf.Graph().as_default(): - in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype, name="Placeholder") + in1 = tf.placeholder(shape=inp_array.shape, dtype=inp_array.dtype) nn.l2_normalize(in1, axis=axis, epsilon=eps, name=None, dim=None) - with tf.Session() as sess: - graph_def = tf.graph_util.convert_variables_to_constants( - sess, - sess.graph.as_graph_def(add_shapes=True), - ['l2_normalize'], - ) - tf_output = run_tf_graph(sess, inp_array, 'Placeholder:0', 'Placeholder:0') - tvm_output = run_tvm_graph(graph_def, - inp_array, - "Placeholder", - 
tf_output.shape, - tf_output.dtype) - - np.testing.assert_allclose(tf_output, tvm_output, atol=1e-3, rtol=1e-3) - sess.close() + compare_tf_with_tvm(inp_array, 'Placeholder:0', 'l2_normalize:0') + def test_forward_l2_normalize(): _test_l2_normalize((1, 3, 20, 20), 0.001, (0,)) + ####################################################################### # Main # ---- @@ -1011,7 +826,7 @@ def test_forward_l2_normalize(): test_forward_mobilenet() test_forward_variable() test_forward_resize_bilinear() - test_forward_pad() + test_forward_pad() test_forward_lstm() test_forward_stridedslice() test_forward_gather() From daa2569999d1afd5fec4501288e055f683ed5312 Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 14 Aug 2018 21:28:34 +0530 Subject: [PATCH 22/77] [NNVM][DOC] Update NNVM symbol documentation to latest. Ref. 1591 (#1599) --- docs/nnvm_top.rst | 79 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst index 96a37b779e1e..927990647a69 100644 --- a/docs/nnvm_top.rst +++ b/docs/nnvm_top.rst @@ -29,6 +29,7 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.dense nnvm.symbol.relu + nnvm.symbol.prelu nnvm.symbol.tanh nnvm.symbol.sigmoid nnvm.symbol.exp @@ -39,6 +40,8 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.elemwise_mul nnvm.symbol.elemwise_div nnvm.symbol.elemwise_sum + nnvm.symbol.elemwise_mod + nnvm.symbol.elemwise_pow nnvm.symbol.flatten nnvm.symbol.concatenate nnvm.symbol.expand_dims @@ -50,6 +53,14 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.log_softmax nnvm.symbol.pad nnvm.symbol.block_grad + nnvm.symbol.matmul + nnvm.symbol.resize + nnvm.symbol.upsampling + nnvm.symbol.take + nnvm.symbol.l2_normalize + nnvm.symbol.flip + nnvm.symbol.lrn + nnvm.symbol.where **Level 2: Convolutions** @@ -92,6 +103,7 @@ This level enables typical convnet models. 
nnvm.symbol.__lshift_scalar__ nnvm.symbol.__rshift_scalar__ + **Level 4: Broadcast and Reductions** .. autosummary:: @@ -117,11 +129,43 @@ This level enables typical convnet models. nnvm.symbol.ones_like nnvm.symbol.zeros nnvm.symbol.zeros_like + nnvm.symbol.slice_like + nnvm.symbol.strided_slice + nnvm.symbol.argmax + nnvm.symbol.argmin + nnvm.symbol.collapse_sum + nnvm.symbol.broadcast_equal + nnvm.symbol.broadcast_greater_equal + nnvm.symbol.broadcast_greater_equal + nnvm.symbol.broadcast_greater + nnvm.symbol.broadcast_left_shift + nnvm.symbol.broadcast_less_equal + nnvm.symbol.broadcast_less_equal + nnvm.symbol.broadcast_less + nnvm.symbol.broadcast_max + nnvm.symbol.broadcast_min + nnvm.symbol.broadcast_mod + nnvm.symbol.broadcast_not_equal + nnvm.symbol.broadcast_pow + nnvm.symbol.broadcast_right_shift + + +**Level 5: Vision Operators** + +.. autosummary:: + :nosignatures: + + nnvm.symbol.multibox_prior + nnvm.symbol.multibox_transform_loc + nnvm.symbol.nms + nnvm.symbol.yolo_region + nnvm.symbol.yolo_reorg Detailed Definitions -------------------- .. autofunction:: nnvm.symbol.dense .. autofunction:: nnvm.symbol.relu +.. autofunction:: nnvm.symbol.prelu .. autofunction:: nnvm.symbol.tanh .. autofunction:: nnvm.symbol.sigmoid .. autofunction:: nnvm.symbol.exp @@ -132,6 +176,8 @@ Detailed Definitions .. autofunction:: nnvm.symbol.elemwise_mul .. autofunction:: nnvm.symbol.elemwise_div .. autofunction:: nnvm.symbol.elemwise_sum +.. autofunction:: nnvm.symbol.elemwise_mod +.. autofunction:: nnvm.symbol.elemwise_pow .. autofunction:: nnvm.symbol.flatten .. autofunction:: nnvm.symbol.concatenate .. autofunction:: nnvm.symbol.expand_dims @@ -143,6 +189,14 @@ Detailed Definitions .. autofunction:: nnvm.symbol.log_softmax .. autofunction:: nnvm.symbol.pad .. autofunction:: nnvm.symbol.block_grad +.. autofunction:: nnvm.symbol.matmul +.. autofunction:: nnvm.symbol.resize +.. autofunction:: nnvm.symbol.upsampling +.. autofunction:: nnvm.symbol.take +.. 
autofunction:: nnvm.symbol.l2_normalize +.. autofunction:: nnvm.symbol.flip +.. autofunction:: nnvm.symbol.lrn +.. autofunction:: nnvm.symbol.where .. autofunction:: nnvm.symbol.conv2d .. autofunction:: nnvm.symbol.conv2d_transpose @@ -191,3 +245,28 @@ Detailed Definitions .. autofunction:: nnvm.symbol.ones_like .. autofunction:: nnvm.symbol.zeros .. autofunction:: nnvm.symbol.zeros_like +.. autofunction:: nnvm.symbol.slice_like +.. autofunction:: nnvm.symbol.strided_slice +.. autofunction:: nnvm.symbol.argmax +.. autofunction:: nnvm.symbol.argmin +.. autofunction:: nnvm.symbol.collapse_sum +.. autofunction:: nnvm.symbol.broadcast_equal +.. autofunction:: nnvm.symbol.broadcast_greater_equal +.. autofunction:: nnvm.symbol.broadcast_greater_equal +.. autofunction:: nnvm.symbol.broadcast_greater +.. autofunction:: nnvm.symbol.broadcast_left_shift +.. autofunction:: nnvm.symbol.broadcast_less_equal +.. autofunction:: nnvm.symbol.broadcast_less_equal +.. autofunction:: nnvm.symbol.broadcast_less +.. autofunction:: nnvm.symbol.broadcast_max +.. autofunction:: nnvm.symbol.broadcast_min +.. autofunction:: nnvm.symbol.broadcast_mod +.. autofunction:: nnvm.symbol.broadcast_not_equal +.. autofunction:: nnvm.symbol.broadcast_pow +.. autofunction:: nnvm.symbol.broadcast_right_shift + +.. autofunction:: nnvm.symbol.multibox_prior +.. autofunction:: nnvm.symbol.multibox_transform_loc +.. autofunction:: nnvm.symbol.nms +.. autofunction:: nnvm.symbol.yolo_region +.. autofunction:: nnvm.symbol.yolo_reorg From bc7431bda67e49a6fbfb06a2acfc1899c456e2ba Mon Sep 17 00:00:00 2001 From: Siva Date: Tue, 14 Aug 2018 22:14:33 +0530 Subject: [PATCH 23/77] [NNVM][POOL] bug fix. Remove the hardcode. 
(#1600) --- nnvm/src/top/nn/pooling.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nnvm/src/top/nn/pooling.cc b/nnvm/src/top/nn/pooling.cc index cccd5b1c710b..8b9b7a64aa0d 100644 --- a/nnvm/src/top/nn/pooling.cc +++ b/nnvm/src/top/nn/pooling.cc @@ -77,7 +77,7 @@ inline bool Pool2DInferShape(const nnvm::NodeAttrs& attrs, } else { oshape[hidx] = ((dshape[hidx] + pad_h - param.pool_size[0] + param.strides[0] - 1) / param.strides[0]) + 1; - oshape[widx] = ((dshape[3] + pad_w - param.pool_size[1] + + oshape[widx] = ((dshape[widx] + pad_w - param.pool_size[1] + param.strides[1] - 1) / param.strides[1]) + 1; } NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, oshape); From 54a115ef14fb6dabbf6ea8eb9e6dd85846030c72 Mon Sep 17 00:00:00 2001 From: Siju Date: Tue, 14 Aug 2018 23:18:02 +0530 Subject: [PATCH 24/77] [FRONTEND][DARKNET]LSTM and GRU support (#1576) --- nnvm/python/nnvm/frontend/darknet.py | 140 ++++++++++++++++-- nnvm/python/nnvm/testing/darknet.py | 3 + .../python/frontend/darknet/test_forward.py | 44 +++++- 3 files changed, 175 insertions(+), 12 deletions(-) diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 3a197a416219..3aa36b7e7ef9 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -412,7 +412,12 @@ def __init__(self, net, dtype='float32'): self._sym_array = {} self._tvmparams = {} self._outs = [] - self._rnn_state_ctr = 0 + self._state_ctr = {} + self._state_ctr['rnn'] = 0 + self._state_ctr['crnn'] = 0 + self._state_ctr['lstm'] = 0 + self._state_ctr['cell_state'] = 0 + self._state_ctr['gru'] = 0 def _read_memory_buffer(self, shape, data): length = 1 @@ -623,16 +628,16 @@ def _get_opname(self, layer): """Returs the layer name.""" return layer.type - def _new_rnn_state_sym(self, state=None): + def _new_rnn_state_sym(self, state=None, name='rnn'): """Returs a symbol for state""" - name = "rnn%d_state" % (self._rnn_state_ctr) - self._rnn_state_ctr += 1 - 
return _sym.Variable(name=name, init=state) + sym_name = name + "%d_state" % self._state_ctr[name] + self._state_ctr[name] += 1 + return _sym.Variable(name=sym_name, init=state) - def _get_rnn_state_buffer(self, layer): + def _get_rnn_state_buffer(self, layer, name): """Get the state buffer for rnn.""" buffer = np.zeros((1, layer.outputs), self.dtype) - return self._new_rnn_state_sym(buffer) + return self._new_rnn_state_sym(buffer, name) def _get_darknet_rnn_attrs(self, layer, sym): """Get the rnn converted symbol from attributes.""" @@ -653,7 +658,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): attr.update({'batch' : layer.batch}) attr.update({'num_hidden' : str(layer.outputs)}) - state = self._get_rnn_state_buffer(layer) + state = self._get_rnn_state_buffer(layer, 'rnn') for _ in range(layer.steps): input_layer = layer.input_layer @@ -678,7 +683,7 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): attr.update({'batch' : layer.batch}) attr.update({'num_hidden' : str(layer.outputs)}) - state = self._get_rnn_state_buffer(layer) + state = self._get_rnn_state_buffer(layer, 'crnn') for _ in range(layer.steps): input_layer = layer.input_layer @@ -698,6 +703,123 @@ def _handle_darknet_rnn_layers(self, layer_num, sym): self._sym_array[layer_num] = sym processed = True + elif LAYERTYPE.LSTM == layer.type: + if layer.steps > 1: + raise NotImplementedError("Currently support only single step GRU") + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + h_state = self._get_rnn_state_buffer(layer, 'lstm') + c_state = self._get_rnn_state_buffer(layer, 'cell_state') + for _ in range(layer.steps): + sym_wf = self._get_darknet_rnn_attrs(layer.wf, h_state) + sym_wi = self._get_darknet_rnn_attrs(layer.wi, h_state) + sym_wg = self._get_darknet_rnn_attrs(layer.wg, h_state) + sym_wo = self._get_darknet_rnn_attrs(layer.wo, h_state) + + input_sym = sym + sym_uf = self._get_darknet_rnn_attrs(layer.uf, input_sym) + sym_ui = 
self._get_darknet_rnn_attrs(layer.ui, input_sym) + sym_ug = self._get_darknet_rnn_attrs(layer.ug, input_sym) + sym_uo = self._get_darknet_rnn_attrs(layer.uo, input_sym) + + new_inputs = _as_list([sym_wf, sym_uf]) + add_f = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wi, sym_ui]) + add_i = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wg, sym_ug]) + add_g = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_wo, sym_uo]) + add_o = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_f, _ = _darknet_activations(_as_list(add_f), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_i, _ = _darknet_activations(_as_list(add_i), act_attr) + + act_attr['activation'] = ACTIVATION.TANH + act_g, _ = _darknet_activations(_as_list(add_g), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_o, _ = _darknet_activations(_as_list(add_o), act_attr) + + new_inputs = _as_list([act_i, act_g]) + mul_t = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([act_f, c_state]) + c_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + new_inputs = _as_list([mul_t, c_state]) + c_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.TANH + h_state, _ = _darknet_activations(_as_list(c_state), act_attr) + + new_inputs = _as_list([act_o, h_state]) + h_state = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + self._outs = self._outs + [c_state, h_state] + sym = h_state + self._sym_array[layer_num] = sym + processed = True + + elif LAYERTYPE.GRU == layer.type: + if layer.steps > 1: + raise NotImplementedError("Currently support only single step GRU") + + op_name_add = 'elemwise_add' + op_name_mul = 'elemwise_mul' + attrs = {} + act_attr = {} + + state = self._get_rnn_state_buffer(layer, "gru") + for _ 
in range(layer.steps): + sym_wz = self._get_darknet_rnn_attrs(layer.wz, state) + sym_wr = self._get_darknet_rnn_attrs(layer.wr, state) + + input_sym = sym + sym_uz = self._get_darknet_rnn_attrs(layer.uz, input_sym) + sym_ur = self._get_darknet_rnn_attrs(layer.ur, input_sym) + sym_uh = self._get_darknet_rnn_attrs(layer.uh, input_sym) + + new_inputs = _as_list([sym_uz, sym_wz]) + add_z = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + new_inputs = _as_list([sym_ur, sym_wr]) + add_r = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_z, _ = _darknet_activations(_as_list(add_z), act_attr) + + act_attr['activation'] = ACTIVATION.LOGISTIC + act_r, _ = _darknet_activations(_as_list(add_r), act_attr) + + new_inputs = _as_list([act_r, state]) + forgot = _darknet_get_nnvm_op(op_name_mul)(*new_inputs, **attrs) + + sym_wh = self._get_darknet_rnn_attrs(layer.wh, forgot) + + new_inputs = _as_list([sym_uh, sym_wh]) + h_state = _darknet_get_nnvm_op(op_name_add)(*new_inputs, **attrs) + + if layer.tanh == 1: + act_attr['activation'] = ACTIVATION.TANH + else: + act_attr['activation'] = ACTIVATION.LOGISTIC + h_state, _ = _darknet_activations(_as_list(h_state), act_attr) + + sym = act_z * state + (1 - act_z) * h_state + + self._outs = self._outs + [sym] + self._sym_array[layer_num] = sym + processed = True + return processed, sym def from_darknet(self): diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py index 362fd3058954..e3d110e9605e 100644 --- a/nnvm/python/nnvm/testing/darknet.py +++ b/nnvm/python/nnvm/testing/darknet.py @@ -491,6 +491,9 @@ class ACTIVATION(object): layer make_region_layer(int batch, int w, int h, int n, int classes, int coords); layer make_softmax_layer(int batch, int inputs, int groups); layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam); +layer make_crnn_layer(int batch, int h, int w, 
int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize); +layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); +layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); void free_network(network *net); """ ) diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index e68aed085664..5fc71a86211e 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -306,7 +306,7 @@ def test_forward_softmax_temperature(): LIB.free_network(net) def test_forward_rnn(): - '''test softmax layer''' + '''test RNN layer''' net = LIB.make_network(1) batch = 1 inputs = 256 @@ -325,7 +325,7 @@ def test_forward_rnn(): LIB.free_network(net) def test_forward_crnn(): - '''test softmax layer''' + '''test CRNN layer''' net = LIB.make_network(1) batch = 1 c = 3 @@ -349,6 +349,42 @@ def test_forward_crnn(): test_forward(net) LIB.free_network(net) +def test_forward_lstm(): + '''test LSTM layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_lstm_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + test_rnn_forward(net) + LIB.free_network(net) + +def test_forward_gru(): + '''test GRU layer''' + net = LIB.make_network(1) + batch = 1 + inputs = 256 + outputs = 256 + steps = 1 + batch_normalize = 0 + adam = 0 + layer_1 = LIB.make_gru_layer(batch, inputs, outputs, steps, batch_normalize, adam) + net.layers[0] = layer_1 + net.inputs = inputs + net.outputs = outputs + net.w = net.h = 0 + LIB.resize_network(net, net.w, net.h) + test_rnn_forward(net) + LIB.free_network(net) + def test_forward_activation_logistic(): 
'''test logistic activation layer''' net = LIB.make_network(1) @@ -395,4 +431,6 @@ def test_forward_activation_logistic(): test_forward_elu() test_forward_rnn() test_forward_crnn() - test_forward_activation_logistic() \ No newline at end of file + test_forward_lstm() + test_forward_gru() + test_forward_activation_logistic() From 7751a6babc1e0bcd6d94560530a78fb0579fc9f1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 15 Aug 2018 15:34:05 -0700 Subject: [PATCH 25/77] [AUTOTVM] Fix GATuner and improve error message (#1605) --- include/tvm/operation.h | 2 ++ python/tvm/autotvm/measure/measure_methods.py | 2 ++ python/tvm/autotvm/task/nnvm_integration.py | 15 ++++++++++++++- python/tvm/autotvm/task/task.py | 2 +- python/tvm/autotvm/task/topi_integration.py | 2 +- python/tvm/autotvm/tuner/callback.py | 8 +++++++- python/tvm/autotvm/tuner/ga_tuner.py | 14 ++++++++++---- 7 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/tvm/operation.h b/include/tvm/operation.h index d13680531af9..ed8be6e4a7c0 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -366,6 +366,8 @@ class ExternOpNode : public OperationNode { v->Visit("tag", &tag); v->Visit("attrs", &attrs); v->Visit("inputs", &inputs); + v->Visit("input_placeholders", &input_placeholders); + v->Visit("output_placeholders", &output_placeholders); v->Visit("body", &body); } EXPORT static Operation make(std::string name, diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index e192ee26ee3e..d845cc1f88fd 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -394,6 +394,8 @@ def _measure_common(input_pack, build_func, build_kwargs, number, repeat, msg = str(exc) if "Stack trace returned" in msg: msg = msg[:msg.index("Stack trace returned")] + if "CUDA Source" in msg: + msg = msg[:msg.index("CUDA Source")] costs = (RuntimeError(msg),) errno = MeasureErrorNo.RUNTIME_DEVICE 
tstamp = time.time() diff --git a/python/tvm/autotvm/task/nnvm_integration.py b/python/tvm/autotvm/task/nnvm_integration.py index 338b46784a75..1b50869fc378 100644 --- a/python/tvm/autotvm/task/nnvm_integration.py +++ b/python/tvm/autotvm/task/nnvm_integration.py @@ -4,12 +4,16 @@ """ import warnings +import logging + from ... import tensor, placeholder, target as _target from ..util import get_const_tuple from .task import create, register +from .dispatcher import ApplyHistoryBest +logger = logging.getLogger('autotvm') def serialize_args(args): """serialize arguments of a topi function to a hashable tuple. @@ -176,8 +180,17 @@ def extract_from_graph(graph, shape, dtype, target, symbols, target_host=None): # run compiler to collect all TOPI calls during compilation env.reset() + + # disable logger temporarily + old_state = logger.disabled + logger.disabled = True + + # use a dummy target to do a fake compile for collecting topi calls dummy_target = _target.create("opencl -device=dummy") - nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype) + with ApplyHistoryBest([], allow_fallback=True): + nnvm.compiler.build(graph, target=dummy_target, shape=shape, dtype=dtype) + + logger.disabled = old_state tasks = [] for task_name, args in env.get_tasks(): diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 7a386f1f9e67..f8923fca56e3 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -368,7 +368,7 @@ def traverse(ops): pass else: raise FlopCalculationError("Only support tvm.compute currently. 
" - "Other ops like tvm.scan is not supported") + "Other ops like tvm.scan/tvm.extern is not supported") return ret try: diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py index 012ca4a214e9..18f45f8d6708 100644 --- a/python/tvm/autotvm/task/topi_integration.py +++ b/python/tvm/autotvm/task/topi_integration.py @@ -62,7 +62,7 @@ def _decorator(f): for target_key in targets: if target_key not in _REGISTED_DISPATHCER: _REGISTED_DISPATHCER[target_key] = {} - if topi_compute not in _REGISTED_DISPATHCER: + if topi_compute not in _REGISTED_DISPATHCER[target_key]: @topi_compute.register(target_key) @dispatcher def config_dispatcher(*args, **kwargs): diff --git a/python/tvm/autotvm/tuner/callback.py b/python/tvm/autotvm/tuner/callback.py index 15d5ac1c9689..6f66871f671c 100644 --- a/python/tvm/autotvm/tuner/callback.py +++ b/python/tvm/autotvm/tuner/callback.py @@ -101,11 +101,17 @@ def __init__(self): self.total = total def __del__(self): - sys.stdout.write(' Done.\n') + if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + sys.stdout.write(' Done.\n') ctx = _Context() tic = time.time() + if logger.level < logging.DEBUG: # only print progress bar in non-debug mode + sys.stdout.write('\r%s Current/Best: %7.2f/%7.2f GFLOPS | Progress: (%d/%d) ' + '| %.2f s' % (prefix, 0, 0, 0, total, time.time() - tic)) + sys.stdout.flush() + def _callback(tuner, inputs, results): ctx.ct += len(inputs) diff --git a/python/tvm/autotvm/tuner/ga_tuner.py b/python/tvm/autotvm/tuner/ga_tuner.py index 916bd4ee68c6..b92737ed5317 100644 --- a/python/tvm/autotvm/tuner/ga_tuner.py +++ b/python/tvm/autotvm/tuner/ga_tuner.py @@ -47,6 +47,7 @@ def __init__(self, task, pop_size, elite_num=3, mutation_prob=0.1): # random initialization self.pop_size = min(self.pop_size, len(self.space)) + self.elite_num = min(self.pop_size, self.elite_num) for _ in range(self.pop_size): tmp_gene = point2knob(np.random.randint(len(self.space)), 
self.dims) while knob2point(tmp_gene, self.dims) in self.visited: @@ -70,9 +71,9 @@ def update(self, inputs, results): y = inp.task.flop / np.mean(res.costs) self.scores.append(y) else: - self.scores.append(0) + self.scores.append(0.0) - if len(self.scores) >= len(self.genes): + if len(self.scores) >= len(self.genes) and len(self.visited) < len(self.space): genes = self.genes + self.elites scores = np.array(self.scores[:len(self.genes)] + self.elite_scores) @@ -85,8 +86,13 @@ def update(self, inputs, results): # cross over indices = np.arange(len(genes)) - scores /= np.max(scores) - probs = scores / np.sum(scores) + max_score = np.max(scores) + if max_score < 1e-8: + probs = np.empty_like(scores) + probs[:] = 1.0 / len(scores) + else: + scores /= max_score + probs = scores / np.sum(scores) tmp_genes = [] for _ in range(self.pop_size): p1, p2 = np.random.choice(indices, size=2, replace=False, p=probs) From a80356bbe0e857d674d3dbe0f0cfbec43ba5c950 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 15 Aug 2018 15:42:24 -0700 Subject: [PATCH 26/77] [NNVM] Add symbol for inception v3 (#1604) --- nnvm/python/nnvm/testing/__init__.py | 1 + nnvm/python/nnvm/testing/inception_v3.py | 255 ++++++++++++++++++ nnvm/python/nnvm/testing/squeezenet.py | 2 +- nnvm/src/compiler/graph_hash.cc | 2 +- .../frontend/mxnet/model_zoo/__init__.py | 9 +- .../frontend/mxnet/model_zoo/inception_v3.py | 170 ++++++++++++ .../tests/python/frontend/mxnet/test_graph.py | 11 +- 7 files changed, 442 insertions(+), 8 deletions(-) create mode 100644 nnvm/python/nnvm/testing/inception_v3.py create mode 100644 nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py diff --git a/nnvm/python/nnvm/testing/__init__.py b/nnvm/python/nnvm/testing/__init__.py index bff828d68280..4a879047ec7e 100644 --- a/nnvm/python/nnvm/testing/__init__.py +++ b/nnvm/python/nnvm/testing/__init__.py @@ -8,6 +8,7 @@ from . import resnet from . import vgg from . import squeezenet +from . import inception_v3 from . 
import dcgan from . import dqn from . import yolo2_detection diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py new file mode 100644 index 000000000000..f14daa1ae656 --- /dev/null +++ b/nnvm/python/nnvm/testing/inception_v3.py @@ -0,0 +1,255 @@ +""" +Inception V3, suitable for images with around 299 x 299 + +Reference: +Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." +arXiv preprint arXiv:1512.00567 (2015). + +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +# pylint: disable=invalid-name,missing-docstring,unused-argument +from .. import symbol as sym +from .utils import create_workload + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = sym.conv2d(data=data, channels=num_filter, kernel_size=kernel, + strides=stride, padding=pad, use_bias=False, + name='%s%s_conv2d' % (name, suffix)) + bn = sym.batch_norm(data=conv, name='%s%s_batchnorm' % (name, suffix), epsilon=2e-5) + act = sym.relu(data=bn, name='%s%s_relu' % (name, suffix)) + return act + +def Pooling(data, kernel, stride, pad, pool_type, name): + if pool_type == 'max': + return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name) + elif pool_type == 'avg': + return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name, + count_include_pad=True) + else: + raise ValueError("Invalid pooling type: " + pool_type) + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), + suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), 
suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), + suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = sym.concatenate(*[tower_1x1, tower_5x5, tower_3x3, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), + name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_2') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0, 0), pool_type="max", + name=('max_pool_%s_pool' % name)) + concat = sym.concatenate(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, 
num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_4') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), + name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = sym.concatenate(*[tower_1x1, tower_d7, tower_q7, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, + num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), + suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), + name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), + name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), + name=('%s_tower_1' % name), suffix='_conv_3') + pooling = Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, pad=(0, 0), + name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = 
sym.concatenate(*[tower_3x3, tower_d7_3x3, pooling], + name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), + suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), + name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, kernel=(1, 3), pad=(0, 1), + name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), + name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, + name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), + suffix='_conv') + # concat + concat = sym.concatenate( + *[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], + name='ch_concat_%s_chconcat' % name) + return concat + + +def get_symbol(num_classes=1000, **kwargs): + data = sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = Pooling(data=conv_2, kernel=(3, 3), 
stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") + pool1 = Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", pad=(0, 0), + name="pool1") + + # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", pad=(0, 0), + name="global_pool") + flatten = sym.flatten(data=pool, name="flatten") + fc1 = sym.dense(data=flatten, units=num_classes, name='fc1') + softmax = sym.softmax(data=fc1, name='softmax') + return softmax + +def get_workload(batch_size=1, num_classes=1000, + image_shape=(3, 299, 299), dtype="float32", **kwargs): + """Get benchmark workload for InceptionV3 + + Parameters + ---------- + batch_size : int + The batch size used in the model + + num_classes : int, optional + Number of classes + + image_shape : tuple, optional + The input image shape + + dtype : str, optional + The data type + + 
kwargs : dict + Extra arguments + + Returns + ------- + net : nnvm.Symbol + The computational graph + + params : dict of str to NDArray + The parameters. + """ + net = get_symbol(num_classes=num_classes, **kwargs) + return create_workload(net, batch_size, image_shape, dtype) diff --git a/nnvm/python/nnvm/testing/squeezenet.py b/nnvm/python/nnvm/testing/squeezenet.py index a445e8cfb7da..eab2cf06fee6 100644 --- a/nnvm/python/nnvm/testing/squeezenet.py +++ b/nnvm/python/nnvm/testing/squeezenet.py @@ -98,7 +98,7 @@ def get_symbol(num_classes, version, **kwargs): def get_workload(batch_size=1, num_classes=1000, version='1.0', image_shape=(3, 224, 224), dtype="float32", **kwargs): - """Get benchmark workload for resnet + """Get benchmark workload for SqueezeNet Parameters ---------- diff --git a/nnvm/src/compiler/graph_hash.cc b/nnvm/src/compiler/graph_hash.cc index d881130f72cc..ccd2e3ce433f 100644 --- a/nnvm/src/compiler/graph_hash.cc +++ b/nnvm/src/compiler/graph_hash.cc @@ -125,7 +125,7 @@ std::string GraphDeepCompare(const Graph& a, const IndexedGraph& idxb = b.indexed_graph(); std::ostringstream err; if (idxa.num_nodes() != idxb.num_nodes()) { - err << "Number of nodes mismatch"; + err << "Number of nodes mismatch (" << idxa.num_nodes() << " v.s " << idxb.num_nodes() << ")"; return err.str(); } if (idxa.num_node_entries() != idxb.num_node_entries()) { diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py index e3c9acdf23ef..66e743ad9c33 100644 --- a/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/__init__.py @@ -1,11 +1,8 @@ """MXNet and NNVM model zoo.""" from __future__ import absolute_import -from . import mlp, resnet, vgg, dqn, dcgan, squeezenet +from . 
import mlp, resnet, vgg, dqn, dcgan, squeezenet, inception_v3 import nnvm.testing -__all__ = ['mx_mlp', 'nnvm_mlp', 'mx_resnet', 'nnvm_resnet', 'mx_vgg', 'nnvm_vgg', - 'mx_squeezenet', 'nnvm_squeezenet'] - _num_class = 1000 # mlp fc @@ -35,6 +32,10 @@ mx_squeezenet[version] = squeezenet.get_symbol(version=version) nnvm_squeezenet[version] = nnvm.testing.squeezenet.get_workload(1, version=version)[0] +# inception +mx_inception_v3 = inception_v3.get_symbol() +nnvm_inception_v3 = nnvm.testing.inception_v3.get_workload(1)[0] + # dqn mx_dqn = dqn.get_symbol() nnvm_dqn = nnvm.testing.dqn.get_workload(1)[0] diff --git a/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py new file mode 100644 index 000000000000..b8585bf05037 --- /dev/null +++ b/nnvm/tests/python/frontend/mxnet/model_zoo/inception_v3.py @@ -0,0 +1,170 @@ +""" +Inception V3, suitable for images with around 299 x 299 + +Reference: +Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015). 
+ +Adopted from https://github.com/apache/incubator-mxnet/blob/ + master/example/image-classification/symbols/inception-v3.py +""" +import mxnet as mx +import numpy as np + +def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix=''): + conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) + bn = mx.sym.BatchNorm(data=conv, eps=2e-5, name='%s%s_batchnorm' % (name, suffix)) + act = mx.sym.Activation(data=bn, act_type='relu', name='%s%s_relu' %(name, suffix)) + return act + + +def Inception7A(data, + num_1x1, + num_3x3_red, num_3x3_1, num_3x3_2, + num_5x5_red, num_5x5, + pool, proj, + name): + tower_1x1 = Conv(data, num_1x1, name=('%s_conv' % name)) + tower_5x5 = Conv(data, num_5x5_red, name=('%s_tower' % name), suffix='_conv') + tower_5x5 = Conv(tower_5x5, num_5x5, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_3x3 = Conv(data, num_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(pooling, proj, name=('%s_tower_2' % name), suffix='_conv') + concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +# First Downsample +def Inception7B(data, + num_3x3, + num_d3x3_red, num_d3x3_1, num_d3x3_2, + pool, + name): + tower_3x3 = Conv(data, num_3x3, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) + tower_d3x3 = Conv(data, num_d3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel=(3, 3), pad=(1, 1), stride=(1, 1), 
name=('%s_tower' % name), suffix='_conv_1') + tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) + concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7C(data, + num_1x1, + num_d7_red, num_d7_1, num_d7_2, + num_q7_red, num_q7_1, num_q7_2, num_q7_3, num_q7_4, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d7 = Conv(data=data, num_filter=num_d7_red, name=('%s_tower' % name), suffix='_conv') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') + tower_d7 = Conv(data=tower_d7, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') + tower_q7 = Conv(data=data, num_filter=num_q7_red, name=('%s_tower_1' % name), suffix='_conv') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_1, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_2, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_3, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') + tower_q7 = Conv(data=tower_q7, num_filter=num_q7_4, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7D(data, 
+ num_3x3_red, num_3x3, + num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3, + pool, + name): + tower_3x3 = Conv(data=data, num_filter=num_3x3_red, name=('%s_tower' % name), suffix='_conv') + tower_3x3 = Conv(data=tower_3x3, num_filter=num_3x3, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=data, num_filter=num_d7_3x3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_1, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_2, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') + tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=num_d7_3x3, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + # concat + concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) + return concat + +def Inception7E(data, + num_1x1, + num_d3_red, num_d3_1, num_d3_2, + num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, + pool, proj, + name): + tower_1x1 = Conv(data=data, num_filter=num_1x1, kernel=(1, 1), name=('%s_conv' % name)) + tower_d3 = Conv(data=data, num_filter=num_d3_red, name=('%s_tower' % name), suffix='_conv') + tower_d3_a = Conv(data=tower_d3, num_filter=num_d3_1, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') + tower_d3_b = Conv(data=tower_d3, num_filter=num_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') + tower_3x3_d3 = Conv(data=data, num_filter=num_3x3_d3_red, name=('%s_tower_1' % name), suffix='_conv') + tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=num_3x3, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') + tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_1, 
kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') + tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=num_3x3_d3_2, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') + pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) + cproj = Conv(data=pooling, num_filter=proj, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') + # concat + concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) + return concat + +def get_symbol(num_classes=1000, **kwargs): + data = mx.sym.Variable(name="data") + # stage 1 + conv = Conv(data, 32, kernel=(3, 3), stride=(2, 2), name="conv") + conv_1 = Conv(conv, 32, kernel=(3, 3), name="conv_1") + conv_2 = Conv(conv_1, 64, kernel=(3, 3), pad=(1, 1), name="conv_2") + pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") + # stage 2 + conv_3 = Conv(pool, 80, kernel=(1, 1), name="conv_3") + conv_4 = Conv(conv_3, 192, kernel=(3, 3), name="conv_4") + pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1") + + # # stage 3 + in3a = Inception7A(pool1, 64, + 64, 96, 96, + 48, 64, + "avg", 32, "mixed") + in3b = Inception7A(in3a, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_1") + in3c = Inception7A(in3b, 64, + 64, 96, 96, + 48, 64, + "avg", 64, "mixed_2") + in3d = Inception7B(in3c, 384, + 64, 96, 96, + "max", "mixed_3") + # stage 4 + in4a = Inception7C(in3d, 192, + 128, 128, 192, + 128, 128, 128, 128, 192, + "avg", 192, "mixed_4") + in4b = Inception7C(in4a, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_5") + in4c = Inception7C(in4b, 192, + 160, 160, 192, + 160, 160, 160, 160, 192, + "avg", 192, "mixed_6") + in4d = Inception7C(in4c, 192, + 192, 192, 192, + 192, 192, 192, 192, 192, + "avg", 192, "mixed_7") + in4e = 
Inception7D(in4d, 192, 320, + 192, 192, 192, 192, + "max", "mixed_8") + # stage 5 + in5a = Inception7E(in4e, 320, + 384, 384, 384, + 448, 384, 384, 384, + "avg", 192, "mixed_9") + in5b = Inception7E(in5a, 320, + 384, 384, 384, + 448, 384, 384, 384, + "max", 192, "mixed_10") + # pool + pool = mx.sym.Pooling(data=in5b, kernel=(8, 8), stride=(1, 1), pool_type="avg", name="global_pool") + flatten = mx.sym.Flatten(data=pool, name="flatten") + fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1', flatten=False) + softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax') + return softmax diff --git a/nnvm/tests/python/frontend/mxnet/test_graph.py b/nnvm/tests/python/frontend/mxnet/test_graph.py index 18e124ad6ffc..e89224cd969e 100644 --- a/nnvm/tests/python/frontend/mxnet/test_graph.py +++ b/nnvm/tests/python/frontend/mxnet/test_graph.py @@ -39,17 +39,23 @@ def test_squeezenet(): nnvm_sym = model_zoo.nnvm_squeezenet[version] compare_graph(from_mx_sym, nnvm_sym) +def test_inception_v3(): + mx_sym = model_zoo.mx_inception_v3 + from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) + nnvm_sym = model_zoo.nnvm_inception_v3 + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 3, 299, 299)) + def test_dqn(): mx_sym = model_zoo.mx_dqn from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) nnvm_sym = model_zoo.nnvm_dqn - compare_graph(from_mx_sym, nnvm_sym) + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 4, 84, 84)) def test_dcgan(): mx_sym = model_zoo.mx_dcgan from_mx_sym, _ = nnvm.frontend.from_mxnet(mx_sym) nnvm_sym = model_zoo.nnvm_dcgan - compare_graph(from_mx_sym, nnvm_sym) + compare_graph(from_mx_sym, nnvm_sym, ishape=(2, 100)) def test_multi_outputs(): def compose(F, **kwargs): @@ -70,3 +76,4 @@ def compose(F, **kwargs): test_dqn() test_dcgan() test_squeezenet() + test_inception_v3() From 9bcc317371f709e0f455b8a147511a203d38c71e Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Wed, 15 Aug 2018 23:21:05 -0700 Subject: [PATCH 27/77] [TEAM] New reviewer: 
kevinthesun (#1606) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 2d571ba668ea..9db50b02b11a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -26,6 +26,7 @@ See the [community structure document](http://docs.tvm.ai/contribute/community.h - [Pariksheet Pinjari](https://github.com/PariksheetPinjari909) - [Siva](https://github.com/srkreddy1238) - [Alex Weaver](https://github.com/alex-weaver) +- [Yao Wang](https://github.com/kevinthesun) - [Eddie Yan](https://github.com/eqy) - [Joshua Z. Zhang](https://github.com/zhreshold) From 11dd933f71e0da53169b89d59abbccf0b73f4f0f Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Thu, 16 Aug 2018 14:05:08 -0700 Subject: [PATCH 28/77] [RUNTIME] Enable return NDArray in RPC (#1610) --- include/tvm/runtime/ndarray.h | 1 + src/api/api_base.cc | 8 +++ src/runtime/rpc/rpc_module.cc | 61 +++++++++++++++++++--- src/runtime/rpc/rpc_session.cc | 63 +++++++++++++++++++---- src/runtime/rpc/rpc_session.h | 1 + tests/python/unittest/test_runtime_rpc.py | 44 +++++++++++++++- 6 files changed, 160 insertions(+), 18 deletions(-) diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index d3ecce8ba9d0..c288ce5f3adb 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -246,6 +246,7 @@ struct NDArray::Container { private: friend class NDArray; + friend class RPCWrappedFunc; /*! * \brief The shape container, * can be used used for shape data. 
diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 37970e69e24f..70301993ad3a 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -37,6 +37,14 @@ TVM_REGISTER_API("_nop") .set_body([](TVMArgs args, TVMRetValue *ret) { }); +// internal function used for debug and testing purposes +TVM_REGISTER_API("_ndarray_use_count") +.set_body([](TVMArgs args, TVMRetValue *ret) { + runtime::NDArray nd = args[0]; + // subtract the current one + *ret = (nd.use_count() - 1); + }); + TVM_REGISTER_API("_TVMSetStream") .set_body([](TVMArgs args, TVMRetValue *ret) { TVMSetStream(args[0], args[1], args[2]); diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 251871bf0cc1..d6c56e1b7cf4 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -12,13 +12,13 @@ namespace tvm { namespace runtime { // Wrapped remote function to packed func. -struct RPCWrappedFunc { +class RPCWrappedFunc { public: RPCWrappedFunc(void* handle, std::shared_ptr sess) : handle_(handle), sess_(sess) { fwrap_ = PackedFunc([sess](TVMArgs args, TVMRetValue* rv) { - WrapRemote(sess, args.values[0].v_handle, args.type_codes[0], rv); + WrapRemote(sess, args, rv); }); } @@ -34,10 +34,47 @@ struct RPCWrappedFunc { } static void WrapRemote(std::shared_ptr sess, - void* handle, - int tcode, + TVMArgs args, TVMRetValue* rv); + // deleter of RPC remote array + static void RemoteNDArrayDeleter(NDArray::Container* ptr) { + RemoteSpace* space = static_cast(ptr->dl_tensor.data); + space->sess->CallRemote(RPCCode::kNDArrayFree, ptr->manager_ctx); + delete space; + delete ptr; + } + // wrap return value as remote NDArray. 
+ static NDArray WrapRemoteNDArray(std::shared_ptr sess, + DLTensor* tensor, + void* nd_handle) { + NDArray::Container* data = new NDArray::Container(); + data->manager_ctx = nd_handle; + data->deleter = RemoteNDArrayDeleter; + RemoteSpace* space = new RemoteSpace(); + space->sess = sess; + space->data = tensor->data; + data->dl_tensor.data = space; + NDArray ret(data); + // RAII now in effect + data->shape_ = std::vector( + tensor->shape, tensor->shape + tensor->ndim); + data->dl_tensor.shape = dmlc::BeginPtr(data->shape_); + data->dl_tensor.ndim = static_cast(data->shape_.size()); + // setup dtype + data->dl_tensor.dtype = tensor->dtype; + // setup ctx, encode as remote session + data->dl_tensor.ctx.device_id = tensor->ctx.device_id; + data->dl_tensor.ctx.device_type = static_cast( + static_cast(tensor->ctx.device_type) + + kRPCSessMask * (sess->table_index() + 1)); + // check strides. + CHECK(tensor->strides == nullptr); + // setup byteoffset + data->dl_tensor.byte_offset = tensor->byte_offset; + return ret; + } + private: PackedFunc fwrap_; void* handle_{nullptr}; @@ -126,20 +163,28 @@ class RPCModuleNode final : public ModuleNode { }; void RPCWrappedFunc::WrapRemote(std::shared_ptr sess, - void* handle, - int tcode, + TVMArgs args, TVMRetValue *rv) { + void* handle = args.values[0].v_handle; + int tcode = args.type_codes[0]; + if (handle == nullptr) return; if (tcode == kFuncHandle) { auto wf = std::make_shared(handle, sess); *rv = PackedFunc([wf](TVMArgs args, TVMRetValue* rv) { return wf->operator()(args, rv); }); - } else { - CHECK_EQ(tcode, kModuleHandle); + } else if (tcode == kModuleHandle) { std::shared_ptr n = std::make_shared(handle, sess); *rv = Module(n); + } else if (tcode == kArrayHandle || tcode == kNDArrayContainer) { + CHECK_EQ(args.size(), 2); + DLTensor* tensor = args[0]; + void* nd_handle = args[1]; + *rv = WrapRemoteNDArray(sess, tensor, nd_handle); + } else { + LOG(FATAL) << "Cannot wrap tcode=" << tcode; } } diff --git 
a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index 21fff7b29882..6bb01b9bd459 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -130,19 +130,22 @@ class RPCSession::EventHandler : public dmlc::Stream { break; } case kReturnReceived: { - CHECK_EQ(arg_buf_->value.size(), 1U); + CHECK_GE(arg_buf_->value.size(), 1U); + TVMArgValue argv = arg_buf_->AsTVMArgs()[0]; if (argv.type_code() == kFuncHandle || - argv.type_code() == kModuleHandle) { + argv.type_code() == kModuleHandle || + argv.type_code() == kArrayHandle) { CHECK(fwrap != nullptr) << "function/module wrapper not available"; fwrap->CallPacked(arg_buf_->AsTVMArgs(), rv); } else { + CHECK_EQ(arg_buf_->value.size(), 1U); *rv = argv; } arg_buf_.reset(); this->SwitchToState(kRecvCode); std::swap(client_mode_, client_mode); - return RPCCode::kReturn; + return RPCCode::kReturn; } case kCopyAckReceived: { std::swap(client_mode_, client_mode); @@ -172,15 +175,22 @@ class RPCSession::EventHandler : public dmlc::Stream { ctx.device_type = static_cast(dev_type % kRPCSessMask); return ctx; } - // send Packed sequence to writer. - void SendPackedSeq(const TVMValue* arg_values, const int* type_codes, int n) { + // Send Packed sequence to writer. + // return_ndarray is a special flag to handle returning of ndarray + // In this case, we return the shape, context and data of the array, + // as well as a customized PackedFunc that handles deletion of + // the array in the remote. + void SendPackedSeq(const TVMValue* arg_values, + const int* type_codes, + int n, + bool return_ndarray = false) { this->Write(n); - // only handles . for (int i = 0; i < n; ++i) { int tcode = type_codes[i]; if (tcode == kNDArrayContainer) tcode = kArrayHandle; this->Write(tcode); } + // Argument packing. 
for (int i = 0; i < n; ++i) { int tcode = type_codes[i]; @@ -215,9 +225,23 @@ class RPCSession::EventHandler : public dmlc::Stream { case kNDArrayContainer: case kArrayHandle: { DLTensor* arr = static_cast(value.v_handle); - TVMContext ctx = StripSessMask(arr->ctx); - uint64_t data = reinterpret_cast( - static_cast(arr->data)->data); + TVMContext ctx; + uint64_t data; + if (!return_ndarray) { + // in the client mode + // ctx contains the remote table index + // the space is wrapped by an RemoteSpace + // that holds reference to the session. + ctx = StripSessMask(arr->ctx); + data = reinterpret_cast( + static_cast(arr->data)->data); + } else { + // When we return NDArray, we directly return + // the space and the context + // The client will be further wrapping + ctx = arr->ctx; + data = reinterpret_cast(arr->data); + } this->Write(data); this->Write(ctx); this->Write(arr->ndim); @@ -701,6 +725,21 @@ class RPCSession::EventHandler : public dmlc::Stream { << "Only server can send function and module handle back."; rv.MoveToCHost(&ret_value, &ret_tcode); SendPackedSeq(&ret_value, &ret_tcode, 1); + } else if (rv.type_code() == kNDArrayContainer) { + // always send handle in 64 bit. + CHECK(!client_mode_) + << "Only server can send NDArray back"; + // We follow a special protocol to return NDArray to client side + // The first pack value is the NDArray handle as DLTensor + // The second pack value is a customized deleter that deletes the NDArray. 
+ TVMValue ret_value_pack[2]; + int ret_tcode_pack[2]; + rv.MoveToCHost(&ret_value_pack[0], &ret_tcode_pack[0]); + + NDArray::Container* nd = static_cast(ret_value_pack[0].v_handle); + ret_value_pack[1].v_handle = nd; + ret_tcode_pack[1] = kHandle; + SendPackedSeq(ret_value_pack, ret_tcode_pack, 2, true); } else { ret_value = rv.value(); ret_tcode = rv.type_code(); @@ -1090,6 +1129,11 @@ void RPCModuleGetSource(TVMArgs args, TVMRetValue *rv) { *rv = (*static_cast(mhandle))->GetSource(fmt); } +void RPCNDArrayFree(TVMArgs args, TVMRetValue *rv) { + void* handle = args[0]; + static_cast(handle)->DecRef(); +} + void RPCGetTimeEvaluator(TVMArgs args, TVMRetValue *rv) { PackedFunc *pf = static_cast(args[0].operator void*()); void *fhandle = new PackedFunc(WrapTimeEvaluator(*pf, args[1], args[2], args[3])); @@ -1138,6 +1182,7 @@ void RPCSession::EventHandler::HandlePackedCall() { case RPCCode::kModuleFree: CallHandler(RPCModuleFree); break; case RPCCode::kModuleGetFunc: CallHandler(RPCModuleGetFunc); break; case RPCCode::kModuleGetSource: CallHandler(RPCModuleGetSource); break; + case RPCCode::kNDArrayFree: CallHandler(RPCNDArrayFree); break; default: LOG(FATAL) << "Unknown event " << static_cast(code_); } CHECK_EQ(state_, kRecvCode); diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 68f6763ae6db..4b736de0e041 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -48,6 +48,7 @@ enum class RPCCode : int { kModuleFree, kModuleGetFunc, kModuleGetSource, + kNDArrayFree }; /*! 
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index e7c0cc1bbabd..0de788068b6b 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -175,6 +175,7 @@ def test_rpc_return_func(): @tvm.register_func("rpc.test.remote_func") def addone(x): return lambda y: x+y + server = rpc.Server("localhost", key="x1") client = rpc.connect(server.host, server.port, key="x1") f1 = client.get_function("rpc.test.remote_func") @@ -182,6 +183,46 @@ def addone(x): assert fadd(12) == 22 +def test_rpc_return_ndarray(): + # Use closure to check the ref counter correctness + nd = tvm.nd.array(np.zeros(10).astype("float32")) + @tvm.register_func("rpc.test.remote_return_nd") + def my_module(name): + if name == "get_arr": + return lambda : nd + elif name == "ref_count": + return lambda : tvm._api_internal._ndarray_use_count(nd) + elif name == "get_elem": + return lambda idx: nd.asnumpy()[idx] + elif name == "get_arr_elem": + return lambda arr, idx: arr.asnumpy()[idx] + + # start server + server = rpc.Server("localhost", key="x1") + client = rpc.connect(server.host, server.port, key="x1") + m = client.get_function("rpc.test.remote_return_nd") + get_arr = m("get_arr") + ref_count = m("ref_count") + get_elem = m("get_elem") + get_arr_elem = m("get_arr_elem") + # array test + def run_arr_test(): + arr = get_arr() + assert ref_count() == 2 + arr2 = get_arr() + assert ref_count() == 3 + assert arr.context == client.cpu(0) + arr.copyfrom(np.ones(10).astype(arr.dtype)) + assert arr2.asnumpy()[0] == 1.0 + assert get_elem(0) == 1.0 + assert get_arr_elem(arr2, 0) == 1.0 + + assert ref_count() == 1 + run_arr_test() + # check recycle correctness + assert ref_count() == 1 + + def test_local_func(): @tvm.register_func("rpc.test.remote_func2") def addone(x): @@ -199,9 +240,10 @@ def addone(x): if __name__ == "__main__": logging.basicConfig(level=logging.INFO) + test_rpc_return_ndarray() + 
test_rpc_return_func() test_bigendian_rpc() test_rpc_remote_module() - test_rpc_return_func() test_rpc_file_exchange() test_rpc_array() test_rpc_simple() From 093dc741366df633e841a38f3e015bd8464eb1c0 Mon Sep 17 00:00:00 2001 From: Keren Zhou Date: Thu, 16 Aug 2018 16:35:23 -0700 Subject: [PATCH 29/77] [NNVM] Add ONNX upsample converter (#1591) --- nnvm/python/nnvm/frontend/onnx.py | 19 ++++++++ .../python/frontend/onnx/test_forward.py | 47 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/nnvm/python/nnvm/frontend/onnx.py b/nnvm/python/nnvm/frontend/onnx.py index fa26648b293a..f62202a37dff 100644 --- a/nnvm/python/nnvm/frontend/onnx.py +++ b/nnvm/python/nnvm/frontend/onnx.py @@ -406,6 +406,24 @@ def _impl(inputs, attr, params): return _impl +class Upsample(OnnxOpConverter): + """ Operator converter for Upsample (nearest mode). + """ + + @classmethod + def _impl_v7(cls, inputs, attr, params): + scales = attr.get('scales') + assert len(scales) == 4 and scales[0] == 1.0 and scales[1] == 1.0 and scales[2] == scales[3] + mode = attr.get('mode') + if mode == b'nearest': + method = "NEAREST_NEIGHBOR" + elif mode == b'linear': + method = "BILINEAR" + else: + raise ValueError("Invalid ONNX upsample mode: {}".format(mode)) + return _sym.upsampling(inputs[0], scale=int(scales[-1]), method=method, layout='NCHW') + + class Shape(OnnxOpConverter): """ Operator converter for Shape. 
""" @@ -540,6 +558,7 @@ def _get_convert_map(opset): # 'Crop' # 'Embedding' # 'Upsample' + 'Upsample' : Upsample.get_converter(opset), 'SpatialBN': BatchNorm.get_converter(opset), # defs/generator diff --git a/nnvm/tests/python/frontend/onnx/test_forward.py b/nnvm/tests/python/frontend/onnx/test_forward.py index 9fb3aed2da10..3f2fbb144289 100644 --- a/nnvm/tests/python/frontend/onnx/test_forward.py +++ b/nnvm/tests/python/frontend/onnx/test_forward.py @@ -1,6 +1,8 @@ import numpy as np import math import nnvm +import topi +import topi.testing import tvm from tvm.contrib import graph_runtime from nnvm.testing.config import ctx_list @@ -380,6 +382,50 @@ def test_lrn(): verify_lrn((5, 5, 5, 5), 3, 'float32') verify_lrn((5, 5, 5, 5), 3, 'float32', alpha=0.0002, beta=0.5, bias=2.0) +def _test_upsample_nearest(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='nearest', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.upsampling_python(in_array, scale, "NCHW") + + graph = helper.make_graph([y], + 'upsample_nearest_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_nearest_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + np.testing.assert_allclose(out_array, tvm_out) + +def _test_upsample_bilinear(): + scale = 2 + in_shape = (1, 1, 3, 3) + out_shape = (1, 1, 3*scale, 3*scale) + y = helper.make_node("Upsample", ['in'], ['out'], mode='linear', scales=[1.0, 1.0, 2.0, 2.0]) + + in_array = np.random.uniform(size=in_shape).astype(np.float32) + out_array = topi.testing.bilinear_resize_python(in_array, (3*scale, 3*scale), "NCHW") + + graph = 
helper.make_graph([y], + 'upsample_bilinear_test', + inputs = [helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], + outputs = [helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))]) + + model = helper.make_model(graph, producer_name='upsample_bilinear_test') + + for target, ctx in ctx_list(): + tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, 'float32') + np.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) + +def test_upsample(): + _test_upsample_nearest() + _test_upsample_bilinear() + if __name__ == '__main__': # verify_super_resolution_example() @@ -398,3 +444,4 @@ def test_lrn(): test_matmul() test_gather() test_lrn() + test_upsample() From e282915a9c76964551d600bc01602dc53d7a4da1 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Thu, 16 Aug 2018 16:37:03 -0700 Subject: [PATCH 30/77] add -mattr=+neon for all arm cpu target (#1612) --- apps/benchmark/README.md | 11 +++++++---- python/tvm/target.py | 16 ++++++++-------- tutorials/nnvm/deploy_model_on_rasp.py | 2 +- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/apps/benchmark/README.md b/apps/benchmark/README.md index e83e47c46eb7..ee22f90dc435 100644 --- a/apps/benchmark/README.md +++ b/apps/benchmark/README.md @@ -63,8 +63,11 @@ python3 -m tvm.exec.rpc_tracker python3 arm_cpu_imagenet_bench.py --device mate10pro --rpc-key mate10pro ``` - If your device has a same SoC of the above device, you can reuse these parameters - (e.g. use `llvm -device=arm_cpu -mode=rk3399 -target=aarch64-linux-gnu` as target). - Otherwise, you need to tune for your own device, please follow this - [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html). + If your device has a same or similar SoC of the above devices, you can reuse these parameters. 
+ For example, if your SoC is similar to rasp3b, use + ```bash + python3 arm_cpu_imagenet_bench.py --device rasp3b --rpc-key your_custom_key + ``` + For other devices, to get the best performance, it is recommended that you tune your network by yourself. + Please follow this [tutorial](https://docs.tvm.ai/tutorials/autotvm/tune_nnvm_arm.html). diff --git a/python/tvm/target.py b/python/tvm/target.py index fed20c3914c6..e2d780f75264 100644 --- a/python/tvm/target.py +++ b/python/tvm/target.py @@ -427,14 +427,14 @@ def arm_cpu(model='unknown', options=None): from . import autotvm trans_table = { - "pixel2": ["-model=snapdragon835", "-target=arm64-linux-android"], - "mate10": ["-model=kirin970", "-target=arm64-linux-android"], - "mate10pro": ["-model=kirin970", "-target=arm64-linux-android"], - "p20": ["-model=kirin970", "-target=arm64-linux-android"], - "p20pro": ["-model=kirin970", "-target=arm64-linux-android"], - "rasp3b": ["-model=bcm2837", "-target=armv7l-linux-gnueabihf"], - "rk3399": ["-model=rk3399", "-target=aarch64-linux-gnu"], - "pynq": ["-model=pynq", "-target=armv7a-linux-eabi"], + "pixel2": ["-model=snapdragon835", "-target=arm64-linux-android -mattr=+neon"], + "mate10": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "mate10pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "p20": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "p20pro": ["-model=kirin970", "-target=arm64-linux-android -mattr=+neon"], + "rasp3b": ["-model=bcm2837", "-target=armv7l-linux-gnueabihf -mattr=+neon"], + "rk3399": ["-model=rk3399", "-target=aarch64-linux-gnu -mattr=+neon"], + "pynq": ["-model=pynq", "-target=armv7a-linux-eabi -mattr=+neon"], } pre_defined_opt = trans_table.get(model, ["-model=%s" % model]) diff --git a/tutorials/nnvm/deploy_model_on_rasp.py b/tutorials/nnvm/deploy_model_on_rasp.py index c11f202c1251..40dbdaeb00ee 100644 --- a/tutorials/nnvm/deploy_model_on_rasp.py +++ 
b/tutorials/nnvm/deploy_model_on_rasp.py @@ -154,7 +154,7 @@ def transform_image(image): else: target = tvm.target.arm_cpu('rasp3b') # The above line is a simple form of - # target = tvm.target.create('llvm -devcie=arm_cpu -target=armv7l-linux-gnueabihf') + # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon') with nnvm.compiler.build_config(opt_level=2, add_pass=['AlterOpLayout']): graph, lib, params = nnvm.compiler.build( From acc2151c6992cefccc1538819f726854bcc6a3c7 Mon Sep 17 00:00:00 2001 From: Ashok Emani Date: Thu, 16 Aug 2018 19:19:43 -0700 Subject: [PATCH 31/77] fix output_shape in conv2d_nchw (#1613) --- topi/include/topi/nn.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 53b899796e37..4a537a646425 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -265,7 +265,7 @@ inline tvm::Tensor conv2d_nchw(const tvm::Tensor& I, auto pW = I->shape[3]; tvm::Array output_shape{ I->shape[0], // B - W->shape[1], // O + W->shape[0], // O (I->shape[2] - W->shape[2] + 2 * pad_h) / stride_h + 1, // H (I->shape[3] - W->shape[3] + 2 * pad_w) / stride_w + 1 // W }; From 6cd5a8f991eb8ba3cdd27a063067af733aecc9a2 Mon Sep 17 00:00:00 2001 From: masahi Date: Fri, 17 Aug 2018 11:37:56 +0900 Subject: [PATCH 32/77] [NNVM] Bug fix Prevent fusing convolution with injective op (#1608) --- nnvm/src/compiler/graph_fuse.cc | 31 +++++++++++++++++- nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index 52a8ae44f8ee..f65312be1a29 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -63,12 +63,16 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { // Check if we can fuse to the master. 
int chosen_master = -1; bool ewise = inode.source->num_outputs() == 1; + bool mark_as_injective = false; for (const auto& e : inode.inputs) { if (fuse_vec[e.node_id] == FuseRule::kUknown) { TOpPattern ipt = pattern_vec[e.node_id]; if (ipt != kElemWise) ewise = false; - if (ipt <= kInjective) { + if (ipt <= kBroadcast) { + fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + } else if (ipt == kInjective) { fuse_vec[e.node_id] = FuseRule::kFuseToMaster; + mark_as_injective = true; } else if (ipt == kOutEWiseFusable && chosen_master == -1 && shape_vec[idx.entry_id(nid, 0)] == shape_vec[idx.entry_id(e)]) { @@ -87,6 +91,8 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { master_vec[nid] = chosen_master; if (chosen_master != -1) { pt = kOutEWiseFusable; + } else if (mark_as_injective) { + pt = kInjective; } else { pt = ewise ? kElemWise : kBroadcast; } @@ -135,8 +141,31 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { if (group_vec[nid] == -1) { group_vec[nid] = nid; } + + // Check if injective op and out_ewise_fusable op (e.g. conv2d) are in the same group. + bool parent_out_ewise = false; + bool parent_injective = false; + for (const auto& e : inode.inputs) { + TOpPattern pt = pattern_vec[e.node_id]; + if (pt == kOutEWiseFusable) { + parent_out_ewise = true; + } else if (pt == kInjective) { + parent_injective = true; + } + } + // Change the master node from out_ewise_fusable op to itself + if (parent_injective && parent_out_ewise) master_vec[nid] = nid; + // Propagate the group id. 
for (const auto& e : inode.inputs) { + TOpPattern pt = pattern_vec[e.node_id]; + if (parent_out_ewise && parent_injective) { + if (pt == kOutEWiseFusable) { + continue; // Do not fuse out_ewise_fusable op + } else if (pt == kInjective) { + master_vec[e.node_id] = nid; + } + } if (fuse_vec[e.node_id] == FuseRule::kFuseToMaster) { CHECK(group_vec[e.node_id] == -1|| group_vec[e.node_id] == group_vec[nid]); diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py index 8d05ae02c579..5f4da3865a45 100644 --- a/nnvm/tests/python/compiler/test_op_fusion.py +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -77,6 +77,39 @@ def test_injective_reduce_injective(): np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) +def test_injective_conv2d(): + channels = 16 + data = sym.Variable(name="data") + pool = sym.global_avg_pool2d(data=data) + weight = sym.reshape(pool, shape=[1, channels, 1, 1]) + residual = sym.conv2d(data=data, kernel_size=(3,3), channels=channels, padding=(1, 1), + layout="NCHW", kernel_layout="OIHW", use_bias=False, name="conv") + net = weight * data + residual + size = 56 + dtype="float32" + dshape = (1, channels, size, size) + kshape = (channels, channels, 3, 3) + oshape = dshape + shape_dict = {"data": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, global_avg_pool, conv weight, conv op, fused elemwise add + assert graph.index.num_nodes == 5 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + residual = topi.testing.conv2d_nchw_python( + data.asnumpy(), kernel.asnumpy(), (1,1), 'SAME') + weight = np.mean(data.asnumpy(), axis=(2, 3)) + c_np = weight[:, :, np.newaxis, np.newaxis] * data.asnumpy() + 
residual + np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) + + def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) @@ -123,3 +156,4 @@ def get_sym(out_channel): test_ewise_injective() test_conv_ewise_injective() test_fuse_conv2d_elu() + test_injective_conv2d() From 9b0e499054f204cee411afc1965c95a87d6dab1c Mon Sep 17 00:00:00 2001 From: Sergey Mironov Date: Sat, 18 Aug 2018 07:40:52 +0300 Subject: [PATCH 33/77] [NNVM] TF: Add Pack operation (#1570) --- nnvm/include/nnvm/top/tensor.h | 2 +- nnvm/python/nnvm/frontend/tensorflow.py | 9 ++++++ nnvm/src/top/tensor/transform.cc | 19 ++++++------ .../frontend/tensorflow/test_forward.py | 29 ++++++++++++++++++- 4 files changed, 48 insertions(+), 11 deletions(-) diff --git a/nnvm/include/nnvm/top/tensor.h b/nnvm/include/nnvm/top/tensor.h index 22ee9d7118e6..53ed5b3b0a22 100644 --- a/nnvm/include/nnvm/top/tensor.h +++ b/nnvm/include/nnvm/top/tensor.h @@ -16,7 +16,7 @@ namespace top { struct ConcatenateParam : public dmlc::Parameter { int axis; DMLC_DECLARE_PARAMETER(ConcatenateParam) { - DMLC_DECLARE_FIELD(axis).set_lower_bound(0).set_default(1) + DMLC_DECLARE_FIELD(axis).set_default(1) .describe("the axis to be concated."); } }; diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index d761e34c7c59..092b8fa20219 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -339,6 +339,14 @@ def _impl(inputs, attr, params): extras={'axis': axis.asnumpy()[0]})(inputs, attr) return _impl +def _pack(): + def _impl(inputs, attr, params): + axis = int(attr["axis"]) + inputs_reshaped = [_sym.expand_dims(i, axis=axis, num_newaxis=1) for i in inputs] + return _sym.concatenate(*inputs_reshaped, axis=axis) + + return _impl + def _reshape(): def _impl(inputs, attr, params): 
try: @@ -673,6 +681,7 @@ def _impl(inputs, attr, params): 'Minimum' : _elemwise('min'), 'Sum' : _sum(), 'Square' : _square(), + 'Pack' : _pack(), 'Relu' : AttrCvt('relu'), 'Reshape' : _reshape(), 'ResizeBilinear' : _resize_bilinear(), diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 78255d20f040..52dca5654838 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -93,23 +93,24 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs, TShape dshape; dim_t size = 0; bool has_zero = false; + int axis = param.axis >= 0 ? param.axis : in_shape->at(0).ndim() + param.axis; for (size_t i = 0; i < in_shape->size(); ++i) { TShape tmp = (*in_shape)[i]; if (tmp.ndim()) { - CHECK_LT(static_cast(param.axis), tmp.ndim()) - << "concat dim " << param.axis << " out of range of input shape " << tmp; - has_zero = tmp[param.axis] == 0 || has_zero; - size += tmp[param.axis]; - tmp[param.axis] = 0; + CHECK_LT(static_cast(axis), tmp.ndim()) + << "concat dim " << axis << " out of range of input shape " << tmp; + has_zero = tmp[axis] == 0 || has_zero; + size += tmp[axis]; + tmp[axis] = 0; shape_assign(&dshape, tmp); } } TShape tmp = (*out_shape)[0]; if (tmp.ndim()) { - CHECK_LT(static_cast(param.axis), tmp.ndim()) - << "concat dim " << param.axis << " out of range of input shape " << tmp; - tmp[param.axis] = 0; + CHECK_LT(static_cast(axis), tmp.ndim()) + << "concat dim " << axis << " out of range of input shape " << tmp; + tmp[axis] = 0; shape_assign(&dshape, tmp); } @@ -119,7 +120,7 @@ inline bool ConcatenateInferShape(const NodeAttrs& attrs, NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, i, dshape); } - if (!has_zero) dshape[param.axis] = size; + if (!has_zero) dshape[axis] = size; NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, dshape); return dshape.Size() != 0; } diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index 64c57c126f8d..6fa020a03444 
100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -342,7 +342,7 @@ def _test_argx(func, data, **kwargs): compare_tf_with_tvm(data, 'c0:0', 'argx0:0') -def test_argmin_argmax(): +def test_forward_argminmax(): for axis in [None,0,1,2]: data = np.random.uniform(size=(8,4,9)).astype('float32') _test_argx(tf.argmax, data=data, axis=axis) @@ -555,6 +555,31 @@ def test_forward_lstm(): _test_lstm_cell(1, 2, 1, 0.0, 'float32') + + +####################################################################### +# Pack +# --- +def _test_pack(axis, shape, **kwargs): + + a = np.arange(np.prod(shape), dtype=np.float32).reshape(shape) + b = np.arange(np.prod(shape), dtype=np.float32).reshape(shape) + + with tf.Graph().as_default(): + tf_a = array_ops.placeholder(shape=shape, dtype='float32', name='pl_a') + tf_b = array_ops.placeholder(shape=shape, dtype='float32', name='pl_b') + tf_c = tf.stack([tf_a,tf_b], axis=axis, **kwargs) + assert tf_c.op.op_def.name == 'Pack', "tf.stack() is expected to produce 'Pack' operation" + + compare_tf_with_tvm([a,b], ['pl_a:0','pl_b:0'], 'stack:0') + +def test_forward_pack(): + for axis in range(-3,3): + _test_pack(axis, [3,2,1]) + for axis in range(-1,1): + _test_pack(axis, [3]) + _test_pack(0, []) + ####################################################################### # Pad # --- @@ -818,9 +843,11 @@ def test_forward_l2_normalize(): test_forward_reshape() test_forward_squeeze() test_forward_sigmoid() + test_forward_argminmax() if tf.__version__ == '1.4.1': _test_forward_concat_v2() test_forward_multi_input() + test_forward_pack() test_forward_inception_v3() test_forward_inception_v1() test_forward_mobilenet() From 38d0835728f8b11c4467ec6a55a44ad9de24de7b Mon Sep 17 00:00:00 2001 From: xqdan Date: Sun, 19 Aug 2018 02:18:29 +0800 Subject: [PATCH 34/77] #1592 [PASS] Fix missing mem CHECK in storage_rewrite (#1616) --- src/pass/storage_rewrite.cc | 6 ++ 
.../unittest/test_pass_storage_rewrite.py | 63 ++++++++++++------- 2 files changed, 48 insertions(+), 21 deletions(-) diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 0170499e1491..877216ed7656 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -584,6 +584,12 @@ class StoragePlanRewriter : public IRMutator { e->new_alloc = Allocate::make( e->alloc_var, alloc_type, {combo_size}, const_true(), Evaluate::make(0)); + if (e->scope.tag.length() != 0) { + MemoryInfo info = GetMemoryInfo(e->scope.to_string()); + uint64_t total_elem = e->const_nbits / e->elem_type.bits(); + CHECK_LE(total_elem * e->elem_type.bits(), info->max_num_bits) + << "Allocation exceed bound of memory tag " << e->scope.to_string(); + } } } } diff --git a/tests/python/unittest/test_pass_storage_rewrite.py b/tests/python/unittest/test_pass_storage_rewrite.py index 2bb02998982f..3c07a1f26aff 100644 --- a/tests/python/unittest/test_pass_storage_rewrite.py +++ b/tests/python/unittest/test_pass_storage_rewrite.py @@ -28,15 +28,30 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 1 +def register_mem(scope_tb, max_bits): + #Register mem + @tvm.register_func("tvm.info.mem.%s" % scope_tb) + def mem_info_inp_buffer(): + return tvm.make.node("MemoryInfo", + unit_bits= 16, + max_simd_bits=32, + max_num_bits=max_bits, + head_address=None) + def test_alloc_seq(): + scope_tb = "local.L0A" + max_bits = 1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) + ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="B", scope="local.L0A") + A = ib.allocate("float32", 200, name="B", scope=scope_tb) A[j] = 1.3 body = ib.get() @@ -233,16 +248,9 @@ def 
test_parallel_alloc(): assert(isinstance(body.body.body.body.body, tvm.stmt.Allocate)) -def test_inplace_rule2(): +def test_inplace_rule2(scope_tb = "local_TB2", max_bits = 1024 * 1024 * 1024): #Test Buffer - scope_tb = "local_TB2" - @tvm.register_func("tvm.info.mem.%s" % scope_tb) - def mem_info_inp_buffer(): - return tvm.make.node("MemoryInfo", - unit_bits= 16, - max_simd_bits=32, - max_num_bits=1024*1024*1024, - head_address=None) + register_mem(scope_tb, max_bits) m = 10 A = tvm.placeholder((m,), name='A') C = tvm.placeholder((m,), name='C') @@ -275,16 +283,23 @@ def verify(n): tvm.ir_pass.PostOrderVisit(stmt, verify) assert num_alloc[0] == 2 +def test_exceed_mem(): + max_bits = 639 + # The critical max_num_bits is between 639 and 640 + loc = -1 + try: + test_inplace_rule2("local_TEM", max_bits) + except Exception as e: + estr = str(e) + loc = estr.find('Allocation exceed bound of memory') + assert loc != -1 + def test_inplace_rule3(): #Test Buffer scope_tb = "local_TB3" - @tvm.register_func("tvm.info.mem.%s" % scope_tb) - def mem_info_inp_buffer(): - return tvm.make.node("MemoryInfo", - unit_bits= 16, - max_simd_bits=32, - max_num_bits=1024*1024*1024, - head_address=None) + max_bits=1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) m = 10 B0 = tvm.placeholder((m,), name='B0') B1 = tvm.placeholder((m,), name='B1') @@ -388,17 +403,22 @@ def verify(n): assert num_alloc[0] == 1 def test_alloc_seq_type2(): + scope_tb = "local.L0A2" + max_bits=1024 * 1024 * 1024 + + register_mem(scope_tb, max_bits) + ib = tvm.ir_builder.create() n = tvm.var("n") with ib.for_range(0, n, name="i") as i: with ib.for_range(0, 10, name="j") as j: - A = ib.allocate("float32", 200, name="A", scope="local.L0A") + A = ib.allocate("float32", 200, name="A", scope=scope_tb) A[j] = 1.2 with ib.for_range(0, 20, name="j") as j: - B = ib.allocate("int16", 400, name="B", scope="local.L0A") + B = ib.allocate("int16", 400, name="B", scope=scope_tb) B[j] = tvm.const(1, "int16") with 
ib.for_range(0, 10, name="j") as j: - C = ib.allocate("float32", 200, name="C", scope="local.L0A") + C = ib.allocate("float32", 200, name="C", scope=scope_tb) C[j] = 1.2 body = ib.get() @@ -465,6 +485,7 @@ def test_replace_dataflow(): test_storage_combine() test_storage_share_gpu() test_inplace_rule2() + test_exceed_mem() test_inplace_rule3() test_alloc_seq_type() test_alloc_seq_type2() From a5f2759ec5ac79592f860919349a85bf63ed548a Mon Sep 17 00:00:00 2001 From: Siju Date: Sat, 18 Aug 2018 23:50:58 +0530 Subject: [PATCH 35/77] =?UTF-8?q?[FRONTEND][COREML]MultiplyLayerParams=20L?= =?UTF-8?q?2NormalizeLayerParams=20and=20UpsampleLayerParams=20support=20?= =?UTF-8?q?=E2=80=A6=20(#1511)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- nnvm/python/nnvm/frontend/coreml.py | 22 ++ .../python/frontend/coreml/test_forward.py | 190 +++++++++++++++++- 2 files changed, 211 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py index e80cfe23f220..3ca76bb0b20e 100644 --- a/nnvm/python/nnvm/frontend/coreml.py +++ b/nnvm/python/nnvm/frontend/coreml.py @@ -217,6 +217,16 @@ def AddLayerParams(op, insyms, symtab): ret = _sym.__add_scalar__(ret, scalar=op.alpha) return ret +def MultiplyLayerParams(op, insyms, symtab): + if not isinstance(insyms, list): + insyms = [insyms] + ret = insyms[0] + for i in range(1, len(insyms)): + ret = _sym.elemwise_mul(ret, insyms[i]) + if op.alpha != 1: + ret = _sym.__mul_scalar__(ret, scalar=op.alpha) + return ret + def ConcatLayerParams(op, insyms, symtab): if not isinstance(insyms, list): insyms = [insyms] @@ -249,6 +259,15 @@ def PermuteLayerParams(op, insym, symtab): axes = tuple(op.axis) return _sym.transpose(insym, axes=axes) +def UpsampleLayerParams(op, insym, symtab): + if op.scalingFactor[0] != op.scalingFactor[1]: + raise NotImplementedError("Upsampling only supported with same \ + height and width scaling factor.") + interpolationMode 
= 'NEAREST_NEIGHBOR' if op.mode == 0 else 'BILINEAR' + return _sym.upsampling(insym, scale=op.scalingFactor[0], method=interpolationMode) + +def L2NormalizeLayerParams(op, insym, symtab): + return _sym.l2_normalize(insym, eps=op.epsilon, axis=1) _convert_map = { 'NeuralNetworkMeanImage': NeuralNetworkMeanImage, @@ -261,10 +280,13 @@ def PermuteLayerParams(op, insym, symtab): 'SoftmaxLayerParams':SoftmaxLayerParams, 'InnerProductLayerParams':InnerProductLayerParams, 'AddLayerParams':AddLayerParams, + 'MultiplyLayerParams':MultiplyLayerParams, 'FlattenLayerParams':FlattenLayerParams, 'ConcatLayerParams':ConcatLayerParams, 'PaddingLayerParams':PaddingLayerParams, 'PermuteLayerParams':PermuteLayerParams, + 'UpsampleLayerParams':UpsampleLayerParams, + 'L2NormalizeLayerParams':L2NormalizeLayerParams } def coreml_op_to_nnvm(op, inname, outname, symtab): diff --git a/nnvm/tests/python/frontend/coreml/test_forward.py b/nnvm/tests/python/frontend/coreml/test_forward.py index d5c460e56987..27ae28c20ab9 100644 --- a/nnvm/tests/python/frontend/coreml/test_forward.py +++ b/nnvm/tests/python/frontend/coreml/test_forward.py @@ -1,8 +1,12 @@ import numpy as np -import topi +from coremltools.models.neural_network import NeuralNetworkBuilder +from coremltools.models import datatypes + import tvm from tvm.contrib import graph_runtime +import topi +import topi.testing import nnvm.symbol as sym import nnvm.compiler from nnvm.testing.config import ctx_list @@ -40,6 +44,190 @@ def test_resnet50_checkonly(): model_file = model_zoo.get_resnet50() test_model_checkonly(model_file, 'resnet50') +def run_tvm_graph(graph_def, input_data, input_name, output_shape, output_dtype='float32'): + """ Generic function to compile on nnvm and execute on tvm """ + + sym, params = nnvm.frontend.from_coreml(graph_def) + target = 'llvm' + if isinstance(input_data, list): + shape_dict = {} + dtype_dict = {} + for i, e in enumerate(input_name): + shape_dict[e] = input_data[i].shape + dtype_dict[e] = 
input_data[i].dtype + else: + shape_dict = {input_name: input_data.shape} + dtype_dict = {input_name: input_data.dtype} + + graph, lib, params = nnvm.compiler.build(sym, target, shape_dict, + dtype=dtype_dict, params=params) + + ctx = tvm.cpu(0) + from tvm.contrib import graph_runtime + m = graph_runtime.create(graph, lib, ctx) + # set inputs + if isinstance(input_data, list): + for i, e in enumerate(input_name): + m.set_input(e, tvm.nd.array(input_data[i].astype(input_data[i].dtype))) + else: + m.set_input(input_name, tvm.nd.array(input_data.astype(input_data.dtype))) + + m.set_input(**params) + # execute + m.run() + # get outputs + if isinstance(output_shape, list) and isinstance(output_dtype, list): + tvm_output_list = [] + for i, s in enumerate(output_shape): + tvm_output = m.get_output(i, tvm.nd.empty((s), output_dtype[i])) + tvm_output_list.append(tvm_output.asnumpy()) + return tvm_output_list + else: + tvm_output = m.get_output(0, tvm.nd.empty((output_shape), output_dtype)) + return tvm_output.asnumpy() + +def verify_AddLayerParams(input_dim, alpha=2): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.add(a_np1, a_np2) + alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Add', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='ADD') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_AddLayerParams(): + verify_AddLayerParams((1, 2, 2), 0) + verify_AddLayerParams((1, 2, 2), 1) + verify_AddLayerParams((1, 3, 3), 2) + +def verify_MultiplyLayerParams(input_dim, 
alpha): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input_dim).astype(dtype) + a_np2 = np.random.uniform(size=input_dim).astype(dtype) + + b_np = np.multiply(a_np1, a_np2) * alpha + inputs = [('input1', datatypes.Array(*input_dim)), + ('input2', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Mul', + alpha=alpha, + input_names=['input1', 'input2'], + output_name='output', + mode='MULTIPLY') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_MultiplyLayerParams(): + verify_MultiplyLayerParams((1, 2, 2), 0) + verify_MultiplyLayerParams((1, 2, 2), 1) + verify_MultiplyLayerParams((1, 3, 3), 2) + +def verify_ConcatLayerParams(input1_dim, input2_dim): + dtype = 'float32' + + a_np1 = np.random.uniform(size=input1_dim).astype(dtype) + a_np2 = np.random.uniform(size=input2_dim).astype(dtype) + + b_np = np.concatenate((a_np1, a_np2), axis=1) + inputs = [('input1', datatypes.Array(*input1_dim)), + ('input2', datatypes.Array(*input2_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(inputs, output) + builder.add_elementwise(name='Concate', + input_names=['input1', 'input2'], + output_name='output', + mode='CONCAT') + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, + [a_np1, a_np2], + ['input1', 'input2'], + b_np.shape, + dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_ConcatLayerParams(): + verify_ConcatLayerParams((1, 1, 2, 2), (1, 2, 2, 2)) + verify_ConcatLayerParams((1, 2, 4, 4), (1, 3, 4, 4)) + +def verify_UpsampleLayerParams(input_dim, scale, mode): + dtype = "float32" + + a_np = np.full(input_dim, 1, dtype=dtype) + if mode == 'NN': + 
b_np = topi.testing.upsampling_python(a_np, scale) + else: + new_h = input_dim[2] * scale + new_w = input_dim[3] * scale + b_np = topi.testing.bilinear_resize_python(a_np, (new_h, new_w), 'NCHW') + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_upsample(name='Upsample', + scaling_factor_h=scale, + scaling_factor_w=scale, + mode=mode, + input_name='input', + output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_UpsampleLayerParams(): + verify_UpsampleLayerParams((1, 16, 32, 32), 2, 'NN') + verify_UpsampleLayerParams((1, 4, 6, 6), 3, 'BILINEAR') + +def verify_l2_normalize(input_dim, eps): + dtype = "float32" + + a_np = np.random.uniform(size=input_dim).astype(dtype) + b_np = topi.testing.l2_normalize_python(a_np, eps, 1) + + input = [('input', datatypes.Array(*input_dim))] + output = [('output', datatypes.Array(*b_np.shape))] + builder = NeuralNetworkBuilder(input, output) + builder.add_l2_normalize(name='L2', epsilon=eps, input_name='input', output_name='output') + + model = cm.models.MLModel(builder.spec) + for target, ctx in ctx_list(): + out = run_tvm_graph(model, a_np, 'input', b_np.shape, dtype) + np.testing.assert_allclose(out, b_np, rtol=1e-5) + +def test_forward_l2_normalize(): + verify_l2_normalize((1, 3, 20, 20), 0.001) + if __name__ == '__main__': test_mobilenet_checkonly() test_resnet50_checkonly() + test_forward_AddLayerParams() + test_forward_ConcatLayerParams() + test_forward_MultiplyLayerParams() + test_forward_UpsampleLayerParams() + test_forward_l2_normalize() From 1c66012ac9ccca6c9e8a3dcf24f737c1a0f734c7 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 19 Aug 2018 19:29:35 -0700 Subject: [PATCH 36/77] fix import (#1621) --- 
nnvm/python/nnvm/testing/yolo2_detection.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/nnvm/python/nnvm/testing/yolo2_detection.py b/nnvm/python/nnvm/testing/yolo2_detection.py index b7744c45cff4..0b229149b8ea 100644 --- a/nnvm/python/nnvm/testing/yolo2_detection.py +++ b/nnvm/python/nnvm/testing/yolo2_detection.py @@ -10,9 +10,6 @@ import math from collections import namedtuple import numpy as np -from PIL import Image -from PIL import ImageDraw -from PIL import ImageFont def _entry_index(batch, w, h, outputs, classes, coords, location, entry): n = int(location/(w*h)) @@ -186,6 +183,10 @@ def _draw_label(im, r, c, label, rgb): _set_pixel(im, i+c, j+r, k, val)#rgb[k] * val) def _get_label(labelstr, rgb): + from PIL import Image + from PIL import ImageDraw + from PIL import ImageFont + text = labelstr colorText = "black" testDraw = ImageDraw.Draw(Image.new('RGB', (1, 1))) From b00aabc5ca9108a3dcb68e19af5ae19d723ed746 Mon Sep 17 00:00:00 2001 From: masahi Date: Mon, 20 Aug 2018 11:30:01 +0900 Subject: [PATCH 37/77] Add missing check when deciding conv op and injective op are in the same group (#1622) --- nnvm/src/compiler/graph_fuse.cc | 1 + nnvm/tests/python/compiler/test_op_fusion.py | 34 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/nnvm/src/compiler/graph_fuse.cc b/nnvm/src/compiler/graph_fuse.cc index f65312be1a29..4999d93d1861 100644 --- a/nnvm/src/compiler/graph_fuse.cc +++ b/nnvm/src/compiler/graph_fuse.cc @@ -146,6 +146,7 @@ nnvm::Graph GraphFindFusibleGroups(nnvm::Graph g) { bool parent_out_ewise = false; bool parent_injective = false; for (const auto& e : inode.inputs) { + if (fuse_vec[e.node_id] != FuseRule::kFuseToMaster) continue; TOpPattern pt = pattern_vec[e.node_id]; if (pt == kOutEWiseFusable) { parent_out_ewise = true; diff --git a/nnvm/tests/python/compiler/test_op_fusion.py b/nnvm/tests/python/compiler/test_op_fusion.py index 5f4da3865a45..0c81ac890d55 100644 --- 
a/nnvm/tests/python/compiler/test_op_fusion.py +++ b/nnvm/tests/python/compiler/test_op_fusion.py @@ -110,6 +110,39 @@ def test_injective_conv2d(): np.testing.assert_allclose(out.asnumpy(), c_np, rtol=1e-5) +def test_concatenate_conv2d(): + ch = 3 + size = 8 + data = sym.Variable(name="data") + concat = sym.concatenate(data, data, axis=1) + conv = sym.conv2d(data=concat, kernel_size=(1,1), channels=ch*2, use_bias=False, name="conv") + net = sym.elemwise_add(concat, conv) + + dtype="float32" + dshape = (1, ch, size, size) + kshape = (ch*2, ch*2, 1, 1) + oshape = (1, ch*2, size, size) + shape_dict = {"data": dshape} + + for target, ctx in ctx_list(): + graph, lib, _ = nnvm.compiler.build(net, target, shape_dict) + # data, conv weight, conv op, concat + assert graph.index.num_nodes == 4 + + data = tvm.nd.array(np.random.uniform(size=dshape).astype(dtype)) + kernel = tvm.nd.array(np.random.uniform(size=kshape).astype(dtype)) + m = graph_runtime.create(graph, lib, ctx) + m.run(data=data, conv_weight=kernel) + # get output + out = m.get_output(0, tvm.nd.empty(oshape, dtype)) + + concat = np.concatenate((data.asnumpy(), data.asnumpy()), axis=1) + conv = topi.testing.conv2d_nchw_python( + concat, kernel.asnumpy(), (1,1), 'SAME') + ref = concat + conv + np.testing.assert_allclose(out.asnumpy(), ref, rtol=1e-5) + + def build_and_run(sym, params, data, out_shape, target, ctx, opt_level=2): with nnvm.compiler.build_config(opt_level=opt_level): graph, lib, params = nnvm.compiler.build(sym, target, shape={"data":data.shape}, params=params) @@ -157,3 +190,4 @@ def get_sym(out_channel): test_conv_ewise_injective() test_fuse_conv2d_elu() test_injective_conv2d() + test_concatenate_conv2d() From 20c495e95bb3f0cd3542e029e8c54701d827cc7d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 20 Aug 2018 09:20:19 -0700 Subject: [PATCH 38/77] [NODEREF] Introduce named attribute system. 
(#1618) --- include/tvm/attrs.h | 593 ++++++++++++++++++ include/tvm/runtime/packed_func.h | 8 + python/tvm/make.py | 11 + src/api/api_base.cc | 12 - src/api/api_test.cc | 46 ++ src/api/dsl_api.cc | 37 +- src/lang/attrs.cc | 45 ++ src/lang/reflection.cc | 42 +- tests/cpp/attrs_test.cc | 76 +++ tests/python/unittest/test_lang_reflection.py | 26 + 10 files changed, 861 insertions(+), 35 deletions(-) create mode 100644 include/tvm/attrs.h create mode 100644 src/api/api_test.cc create mode 100644 src/lang/attrs.cc create mode 100644 tests/cpp/attrs_test.cc diff --git a/include/tvm/attrs.h b/include/tvm/attrs.h new file mode 100644 index 000000000000..aed6b1ff722f --- /dev/null +++ b/include/tvm/attrs.h @@ -0,0 +1,593 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file tvm/attrs.h + * \brief TVM attribute module + * + * This module enables declaration of named attributes + * which support default value setup and bound checking. + * + * \code + * struct MyAttrs : public tvm::AttrsNode { + * float learning_rate; + * int num_hidden; + * std::string name; + * // declare attribute fields in header file + * TVM_DECLARE_ATTRS(MyAttrs, "attrs.MyAttrs") { + * TVM_ATTR_FIELD(num_hidden).set_lower_bound(1); + * TVM_ATTR_FIELD(learning_rate).set_default(0.01f); + * TVM_ATTR_FIELD(name).set_default("hello"); + * } + * }; + * // register it in cc file + * TVM_REGISTER_NODE_TYPE(MyAttrs); + * \endcode + * + * \sa AttrsNode, TVM_DECLARE_ATTRS, TVM_ATTR_FIELD + */ +#ifndef TVM_ATTRS_H_ +#define TVM_ATTRS_H_ + +#include +#include +#include +#include +#include "./ir.h" +#include "./base.h" +#include "./packed_func_ext.h" + +namespace tvm { +/*! + * \brief Declare an attribute function. + * \param ClassName The name of the class. + * \param TypeKey The type key to be used by the TVM node system. 
+ */ +#define TVM_DECLARE_ATTRS(ClassName, TypeKey) \ + static constexpr const char* _type_key = TypeKey; \ + TVM_DECLARE_NODE_TYPE_INFO(ClassName, ::tvm::BaseAttrsNode); \ + template \ + void __VisitAttrs__(FVisit& __fvisit__) // NOLINT(*) + + +/*! + * \brief Declare an attribute field. + * \param FieldName The field name. + */ +#define TVM_ATTR_FIELD(FieldName) \ + __fvisit__(#FieldName, &FieldName) + + +/*! \brief Error thrown during attribute checking. */ +struct AttrError : public dmlc::Error { + /*! + * \brief constructor + * \param msg error message + */ + explicit AttrError(const std::string &msg) + : dmlc::Error(msg) {} +}; + +/*! + * \brief Information about attribute fields in string representations. + */ +struct AttrFieldInfo { + /*! \brief name of the field */ + std::string name; + /*! \brief type docstring information in str. */ + std::string type_info; + /*! \brief detailed description of the type */ + std::string description; +}; + +/*! + * \brief Base class of all attribute classes + * \note Do not subclass BaseAttrsNode directly, + * subclass AttrsNode instead. + * \sa AttrsNode + */ +class BaseAttrsNode : public Node { + public: + using TVMArgs = runtime::TVMArgs; + using TVMRetValue = runtime::TVMRetValue; + /*! + * \brief Initialize the attributes by sequence of arguments + * \param args The positional arguments in the form + * [key0, value0, key1, value1, ..., key_n, value_n] + */ + template + inline void InitBySeq(Args&& ...args); + /*! + * \brief Print readable docstring to ostream, add newline. + * \param os the stream to print the docstring to. + */ + inline void PrintDocString(std::ostream &os) const; // NOLINT(*) + /*! + * \brief Get the information about the attribute fields. + * \note This function throws when a required field is not present. + */ + TVM_DLL virtual std::vector ListFieldInfo() const = 0; + /*! + * \brief Initialize the attributes by arguments. + * \param kwargs The key value pairs for initialization. 
+ * [key0, value0, key1, value1, ..., key_n, value_n] + * \param allow_unknown Whether to allow additional unknown fields. + * \note This function throws when a required field is not present. + */ + TVM_DLL virtual void InitByPackedArgs(const TVMArgs& kwargs, bool allow_unknown = false) = 0; + + static constexpr const char* _type_key = "Attrs"; + TVM_DECLARE_BASE_NODE_INFO(BaseAttrsNode, Node); +}; + +/*! \brief Base attribute container for all attributes */ +class Attrs : public NodeRef { + public: + // normal constructor + Attrs() {} + // construct from shared ptr. + explicit Attrs(std::shared_ptr n) : NodeRef(n) {} + + /*! \return The attribute node */ + const BaseAttrsNode* operator->() const { + return ptr(); + } + /*! \brief specify container node */ + using ContainerType = BaseAttrsNode; + + private: + /*! \return the internal attribute node */ + const BaseAttrsNode* ptr() const { + return static_cast(node_.get()); + } +}; + +/*! + * \brief Specialized attribute type that is backed by a map. + * The DictAttrsNode implements the Attrs behavior, + * its fields are directly accessible via object.field_name + * like other normal nodes. + */ +class DictAttrsNode : public BaseAttrsNode { + public: + /*! \brief internal attrs map */ + Map dict; + /*! + * \brief Construct an Attrs backed by DictAttrsNode. + * \param dict The attributes. + * \return The dict attributes. + */ + TVM_DLL static Attrs make(Map dict); + // implementations + void VisitAttrs(AttrVisitor* v) final; + void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final; + std::vector ListFieldInfo() const final; + // type info + static constexpr const char* _type_key = "DictAttrs"; + TVM_DECLARE_NODE_TYPE_INFO(DictAttrsNode, BaseAttrsNode); +}; + +// Namespace containing detail implementations +namespace detail { +using runtime::TVMArgValue; + +// helper entry that does nothing in set_default/bound/describe calls. 
+struct AttrNopEntry { + using TSelf = AttrNopEntry; + + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + return *this; + } + template + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) { + return *this; + } + template + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) { + return *this; + } + template + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) { + return *this; + } +}; + +// Wrapper for normal visitor. +class AttrNormalVisitor { + public: + explicit AttrNormalVisitor(AttrVisitor* visitor) + : visitor_(visitor) { + } + template + AttrNopEntry operator()(const char* key, T* value) { + visitor_->Visit(key, value); + return AttrNopEntry(); + } + + private: + AttrVisitor* visitor_; +}; + +// helper entry that does initialization, set default. +template +struct AttrInitEntry { + // The attributes + using TSelf = AttrInitEntry; + // The type key + const char* type_key_; + // field name + const char* key_; + // internal value. + T* value_; + // whether the value is missing. + bool value_missing_{true}; + // If the value is still missing in destruction time throw an error. + ~AttrInitEntry() DMLC_THROW_EXCEPTION { + if (value_missing_) { + std::ostringstream os; + os << type_key_ << ": Cannot find required field \'" << key_ + << "\' during initialization"; + throw AttrError(os.str()); + } + } + // override fields. + // This function sets the lower bound of the attribute + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED const T& begin) { + if (this->value_missing_) return *this; + const T& val = *value_; + if (begin > val) { + std::ostringstream os; + os << type_key_ << "." 
<< key_ << ": " + << "value " << val + << " is smaller than the lower bound " << begin; + throw AttrError(os.str()); + } + return *this; + } + // This function sets the upper bound of the attribute + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED const T& end) { + if (this->value_missing_) return *this; + const T& val = *value_; + if (val > end) { + std::ostringstream os; + os << type_key_ << "." << key_ << ": " + << "value " << val + << " is bigger than the upper bound " << end; + throw AttrError(os.str()); + } + return *this; + } + // set default when + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED const T& value) { + if (!value_missing_) return *this; + *value_ = value; + value_missing_ = false; + return *this; + } + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + return *this; + } +}; + +// Template function to allow smart conversion +// from Expr types into the constants. +template +inline void SetValue(T* ptr, const TVMArgValue& val) { + *ptr = val.operator T(); +} +template +inline void SetIntValue(T* ptr, const TVMArgValue& val) { + if (val.type_code() == kDLInt) { + *ptr = static_cast(val.value().v_int64); + } else { + Expr expr = val; + CHECK(expr.defined()); + if (const ir::IntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else if (const ir::UIntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else { + LOG(FATAL) << "Expect int value, but get " << expr->type_key(); + } + } +} +template<> +inline void SetValue(std::string* ptr, const TVMArgValue& val) { + if (val.type_code() == kStr) { + *ptr = val.operator std::string(); + } else { + Expr expr = val; + const ir::StringImm* op = expr.as(); + CHECK(op != nullptr); + *ptr = op->value; + } +} +template<> +inline void SetValue(double* ptr, const TVMArgValue& val) { + if (val.type_code() == kDLFloat || val.type_code() == kDLInt) { + *ptr = val.operator double(); + } else { + Expr expr = val; + CHECK(expr.defined()); + if (const ir::IntImm* op = expr.as()) { + *ptr = 
static_cast(op->value); + } else if (const ir::IntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else if (const ir::UIntImm* op = expr.as()) { + *ptr = static_cast(op->value); + } else { + LOG(FATAL) << "Expect float value, but get " << expr->type_key(); + } + } +} +template<> +inline void SetValue(int* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(int64_t* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(uint64_t* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} +template<> +inline void SetValue(bool* ptr, const TVMArgValue& val) { + SetIntValue(ptr, val); +} + +// Visitor for value initialization +template +class AttrInitVisitor { + public: + // Counter of number of matched attributes during visit. + // This is used to decide if there is additional unmatched attributes. + size_t hit_count_{0}; + // constructor + AttrInitVisitor(const char* type_key, FFind ffind) + : type_key_(type_key), ffind_(ffind) { + } + + template + AttrInitEntry operator()(const char* key, T* value) { + TVMArgValue val; + AttrInitEntry opt; + opt.type_key_ = type_key_; + opt.key_ = key; + opt.value_ = value; + if (ffind_(key, &val)) { + SetValue(value, val); + opt.value_missing_ = false; + ++hit_count_; + } else { + opt.value_missing_ = true; + } + return opt; + } + + private: + // the type key + const char* type_key_; + FFind ffind_; +}; + +template +inline AttrInitVisitor CreateInitVisitor( + const char* type_key, + FFind ffind) { + return AttrInitVisitor(type_key, ffind); +} + +/*! + * \brief Helper struct to get the type name known to tvm. + * \tparam T the type we are interested in. 
+ */ +template +struct TypeName { + static constexpr const char* value = T::ContainerType::_type_key; +}; + +template<> +struct TypeName { + static constexpr const char* value = "int"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "int64"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "uint64_t"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "Type"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "str"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "bool"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "handle"; +}; + +template<> +struct TypeName { + static constexpr const char* value = "double"; +}; + +class AttrDocEntry { + public: + using TSelf = AttrDocEntry; + + explicit AttrDocEntry(AttrFieldInfo* info) + : info_(info) { + } + TSelf& describe(DMLC_ATTRIBUTE_UNUSED const char* str) { + info_->description = str; + return *this; + } + template + TSelf& set_default(DMLC_ATTRIBUTE_UNUSED T value) { + std::ostringstream os; + os << info_->type_info << ", default=" << value; + info_->type_info = os.str(); + return *this; + } + template + TSelf& set_lower_bound(DMLC_ATTRIBUTE_UNUSED T begin) { + return *this; + } + template + TSelf& set_upper_bound(DMLC_ATTRIBUTE_UNUSED T end) { + return *this; + } + + private: + AttrFieldInfo* info_; +}; + +class AttrDocVisitor { + public: + template + AttrDocEntry operator()(const char* key, T* v) { + AttrFieldInfo info; + info.name = key; + info.type_info = TypeName::value; + fields_.emplace_back(std::move(info)); + return AttrDocEntry(&(fields_.back())); + } + + std::vector fields_; +}; + +class AttrExistVisitor { + public: + std::string key_; + bool exist_{false}; + + template + AttrNopEntry operator()(const char* key, T* v) { + if (exist_) return AttrNopEntry(); + if (key == key_) exist_ = true; + return AttrNopEntry(); + } +}; +} // namespace detail + 
+/*! + * \brief The base class of the all the + * Use "curiously recurring template pattern". + * + * \tparam DerivedType The final attribute type. + */ +template +class AttrsNode : public BaseAttrsNode { + public: + void VisitAttrs(AttrVisitor* v) final { + detail::AttrNormalVisitor vis(v); + self()->__VisitAttrs__(vis); + } + + void InitByPackedArgs(const runtime::TVMArgs& args, bool allow_unknown) final { + CHECK_EQ(args.size() % 2, 0); + const int kLinearSearchBound = 16; + int hit_count = 0; + // applies two stratgies to lookup + if (args.size() < kLinearSearchBound) { + // linear search. + auto ffind = [&args](const char* key, runtime::TVMArgValue* val) { + for (int i = 0; i < args.size(); i += 2) { + CHECK_EQ(args.type_codes[i], kStr); + if (!std::strcmp(key, args.values[i].v_str)) { + *val = args[i + 1]; + return true; + } + } + return false; + }; + auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind); + self()->__VisitAttrs__(vis); + hit_count = vis.hit_count_; + } else { + // construct a map then do lookup. 
+ std::unordered_map kwargs; + for (int i = 0; i < args.size(); i += 2) { + CHECK_EQ(args.type_codes[i], kStr); + kwargs[args[i].operator std::string()] = args[i + 1]; + } + auto ffind = [&kwargs](const char *key, runtime::TVMArgValue* val) { + auto it = kwargs.find(key); + if (it != kwargs.end()) { + *val = it->second; + return true; + } + return false; + }; + auto vis = detail::CreateInitVisitor(DerivedType::_type_key, ffind); + self()->__VisitAttrs__(vis); + hit_count = vis.hit_count_; + } + // error handling, slow path + if (hit_count * 2 != args.size() && !allow_unknown) { + for (int i = 0; i < args.size(); i += 2) { + detail::AttrExistVisitor visitor; + visitor.key_ = args[i].operator std::string(); + self()->__VisitAttrs__(visitor); + if (!visitor.exist_) { + std::ostringstream os; + os << DerivedType::_type_key + << ": does not have field \'" << visitor.key_ + << "\', Possible fields:\n"; + os << "----------------\n"; + this->PrintDocString(os); + throw AttrError(os.str()); + } + } + } + } + + std::vector ListFieldInfo() const final { + detail::AttrDocVisitor visitor; + self()->__VisitAttrs__(visitor); + return visitor.fields_; + } + + private: + DerivedType* self() const { + return const_cast( + static_cast(this)); + } +}; + + +template +inline void BaseAttrsNode::InitBySeq(Args&& ...args) { + runtime::PackedFunc pf([this](const TVMArgs& args, TVMRetValue *rv) { + this->InitByPackedArgs(args); + }); + pf(std::forward(args)...); +} + +inline void BaseAttrsNode::PrintDocString(std::ostream &os) const { // NOLINT(*) + std::vector entry = this->ListFieldInfo(); + for (AttrFieldInfo info : entry) { + os << info.name << " : " << info.type_info << '\n'; + if (info.description.length() != 0) { + os << " " << info.description << '\n'; + } + } +} + +} // namespace tvm +#endif // TVM_ATTRS_H_ diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 6d8df4a5e3d6..63e8ca7cd16b 100644 --- a/include/tvm/runtime/packed_func.h +++ 
b/include/tvm/runtime/packed_func.h @@ -223,6 +223,12 @@ class ExtTypeVTable { class TVMPODValue_ { public: operator double() const { + // Allow automatic conversion from int to float + // This avoids errors when user pass in int from + // the frontend while the API expects a float. + if (type_code_ == kDLInt) { + return static_cast(value_.v_int64); + } TVM_CHECK_TYPE_CODE(type_code_, kDLFloat); return value_.v_float64; } @@ -310,6 +316,8 @@ class TVMPODValue_ { */ class TVMArgValue : public TVMPODValue_ { public: + /*! \brief default constructor */ + TVMArgValue() {} /*! * \brief constructor * \param value of the function diff --git a/python/tvm/make.py b/python/tvm/make.py index 49f698f4f663..19949509778b 100644 --- a/python/tvm/make.py +++ b/python/tvm/make.py @@ -71,6 +71,17 @@ def node(type_key, **kwargs): **kwargs : dict The fields of the node. + Returns + ------- + node : Node + The corresponding DSL Node + + Note + ---- + If the created node is instance of AttrsNode, then + the creator function will also run bound checks and + default value setup as supported by Attrs. 
+ Example ------- The following code constructs a IntImm object diff --git a/src/api/api_base.cc b/src/api/api_base.cc index 70301993ad3a..3583f42a00c9 100644 --- a/src/api/api_base.cc +++ b/src/api/api_base.cc @@ -33,18 +33,6 @@ TVM_REGISTER_API("_load_json") *ret = LoadJSON(args[0]); }); -TVM_REGISTER_API("_nop") -.set_body([](TVMArgs args, TVMRetValue *ret) { - }); - -// internal fucntion used for debug and testing purposes -TVM_REGISTER_API("_ndarray_use_count") -.set_body([](TVMArgs args, TVMRetValue *ret) { - runtime::NDArray nd = args[0]; - // substract the current one - *ret = (nd.use_count() - 1); - }); - TVM_REGISTER_API("_TVMSetStream") .set_body([](TVMArgs args, TVMRetValue *ret) { TVMSetStream(args[0], args[1], args[2]); diff --git a/src/api/api_test.cc b/src/api/api_test.cc new file mode 100644 index 000000000000..1744267fdcd7 --- /dev/null +++ b/src/api/api_test.cc @@ -0,0 +1,46 @@ + /*! + * Copyright (c) 2018 by Contributors + * Code mainly used for test purposes. + * \file api_test.cc + */ +#include +#include +#include +#include + +namespace tvm { +// Attrs used to python API +struct TestAttrs : public AttrsNode { + int axis; + std::string name; + Array padding; + + TVM_DECLARE_ATTRS(TestAttrs, "attrs.TestAttrs") { + TVM_ATTR_FIELD(axis) + .set_default(10) + .set_lower_bound(1) + .set_upper_bound(10) + .describe("axis field"); + TVM_ATTR_FIELD(name) + .describe("name"); + TVM_ATTR_FIELD(padding) + .describe("padding of input") + .set_default(Array({0, 0})); + } +}; + +TVM_REGISTER_NODE_TYPE(TestAttrs); + +TVM_REGISTER_API("_nop") +.set_body([](TVMArgs args, TVMRetValue *ret) { + }); + +// internal fucntion used for debug and testing purposes +TVM_REGISTER_API("_ndarray_use_count") +.set_body([](TVMArgs args, TVMRetValue *ret) { + runtime::NDArray nd = args[0]; + // substract the current one + *ret = (nd.use_count() - 1); + }); + +} // namespace tvm diff --git a/src/api/dsl_api.cc b/src/api/dsl_api.cc index 80d7c3163e10..9157e62fda8a 100644 --- 
a/src/api/dsl_api.cc +++ b/src/api/dsl_api.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -124,22 +125,35 @@ class DSLAPIImpl : public DSLAPI { (*static_cast(handle))->type_index()); } void NodeGetAttr(NodeHandle handle, - const char* key, - TVMValue* ret_val, - int* ret_type_code, - int* ret_success) const final { + const char* key, + TVMValue* ret_val, + int* ret_type_code, + int* ret_success) const final { TVMRetValue rv; APIAttrGetter getter; + TVMAPINode* tnode = static_cast(handle); getter.skey = key; getter.ret = &rv; - TVMAPINode* tnode = static_cast(handle); if (getter.skey == "type_key") { ret_val->v_str = (*tnode)->type_key(); *ret_type_code = kStr; *ret_success = 1; - } else { + return; + } else if (!(*tnode)->is_type()) { (*tnode)->VisitAttrs(&getter); *ret_success = getter.found_ref_object || rv.type_code() != kNull; + } else { + // specially handle dict attr + DictAttrsNode* dnode = static_cast(tnode->get()); + auto it = dnode->dict.find(key); + if (it != dnode->dict.end()) { + *ret_success = 1; + rv = (*it).second; + } else { + *ret_success = 0; + } + } + if (*ret_success) { if (rv.type_code() == kStr || rv.type_code() == kTVMType) { TVMAPIThreadLocalEntry *e = TVMAPIThreadLocalStore::Get(); @@ -159,7 +173,16 @@ class DSLAPIImpl : public DSLAPI { TVMAPINode* tnode = static_cast(handle); APIAttrDir dir; dir.names = &(ret->ret_vec_str); - (*tnode)->VisitAttrs(&dir); + + if (!(*tnode)->is_type()) { + (*tnode)->VisitAttrs(&dir); + } else { + // specially handle dict attr + DictAttrsNode* dnode = static_cast(tnode->get()); + for (const auto& kv : dnode->dict) { + ret->ret_vec_str.push_back(kv.first); + } + } ret->ret_vec_charp.clear(); for (size_t i = 0; i < ret->ret_vec_str.size(); ++i) { ret->ret_vec_charp.push_back(ret->ret_vec_str[i].c_str()); diff --git a/src/lang/attrs.cc b/src/lang/attrs.cc new file mode 100644 index 000000000000..49a91983e79d --- /dev/null +++ b/src/lang/attrs.cc @@ -0,0 +1,45 @@ +/*! 
+ * Copyright (c) 2018 by Contributors + * \file attrs.cc + */ +#include + +namespace tvm { + +void DictAttrsNode::VisitAttrs(AttrVisitor* v) { + v->Visit("__dict__", &dict); +} + +void DictAttrsNode::InitByPackedArgs( + const runtime::TVMArgs& args, bool allow_unknown) { + for (int i = 0; i < args.size(); i += 2) { + std::string key = args[i]; + runtime::TVMArgValue val = args[i + 1]; + if (val.type_code() == kNodeHandle) { + dict.Set(key, val.operator NodeRef()); + } else if (val.type_code() == kStr) { + dict.Set(key, Expr(val.operator std::string())); + } else { + dict.Set(key, val.operator Expr()); + } + } +} + +std::vector DictAttrsNode::ListFieldInfo() const { + return {}; +} + +Attrs DictAttrsNode::make(Map dict) { + std::shared_ptr n = std::make_shared(); + n->dict = std::move(dict); + return Attrs(n); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const DictAttrsNode *op, IRPrinter *p) { + p->stream << op->dict; +}); + +TVM_REGISTER_NODE_TYPE(DictAttrsNode); + +} // namespace tvm diff --git a/src/lang/reflection.cc b/src/lang/reflection.cc index 7c4e862f0abb..9fb9143aa7f4 100644 --- a/src/lang/reflection.cc +++ b/src/lang/reflection.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -467,22 +468,15 @@ class NodeAttrSetter : public AttrVisitor { } }; -// API function to make node. 
-// args format: -// type_key, key1, value1, ..., key_n, value_n -void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) { + +void InitNodeByPackedArgs(Node* n, const TVMArgs& args) { NodeAttrSetter setter; - setter.type_key = args[0].operator std::string(); - CHECK_EQ(args.size() % 2, 1); - for (int i = 1; i < args.size(); i += 2) { - setter.attrs.emplace( - args[i].operator std::string(), - runtime::TVMArgValue(args.values[i + 1], args.type_codes[i + 1])); - } - auto* f = dmlc::Registry::Find(setter.type_key); - CHECK(f != nullptr) - << "Node type \'" << setter.type_key << "\' is not registered in TVM"; - std::shared_ptr n = f->body(); + setter.type_key = n->type_key(); + CHECK_EQ(args.size() % 2, 0); + for (int i = 0; i < args.size(); i += 2) { + setter.attrs.emplace(args[i].operator std::string(), + args[i + 1]); + } n->VisitAttrs(&setter); if (setter.attrs.size() != 0) { std::ostringstream os; @@ -492,10 +486,26 @@ void MakeNode(runtime::TVMArgs args, runtime::TVMRetValue* rv) { } LOG(FATAL) << os.str(); } +} + +// API function to make node. 
+// args format: +// key1, value1, ..., key_n, value_n +void MakeNode(const TVMArgs& args, TVMRetValue* rv) { + std::string type_key = args[0]; + auto* f = dmlc::Registry::Find(type_key); + CHECK(f != nullptr) + << "Node type \'" << type_key << "\' is not registered in TVM"; + TVMArgs kwargs(args.values + 1, args.type_codes + 1, args.size() - 1); + std::shared_ptr n = f->body(); + if (n->derived_from()) { + static_cast(n.get())->InitByPackedArgs(kwargs); + } else { + InitNodeByPackedArgs(n.get(), kwargs); + } *rv = NodeRef(n); } TVM_REGISTER_GLOBAL("make._Node") .set_body(MakeNode); - } // namespace tvm diff --git a/tests/cpp/attrs_test.cc b/tests/cpp/attrs_test.cc new file mode 100644 index 000000000000..138e0b242e02 --- /dev/null +++ b/tests/cpp/attrs_test.cc @@ -0,0 +1,76 @@ +#include +#include +#include +#include + +namespace tvm { +namespace test { +// test example usage docs +struct TestAttrs : public AttrsNode { + int axis; + std::string name; + Expr expr; + double learning_rate; + + TVM_DECLARE_ATTRS(TestAttrs, "attrs.cpptest.TestAttrs") { + TVM_ATTR_FIELD(axis) + .set_default(10) + .set_lower_bound(1) + .set_upper_bound(10) + .describe("axis field"); + TVM_ATTR_FIELD(name) + .describe("name of the field"); + TVM_ATTR_FIELD(expr) + .describe("expression field") + .set_default(make_const(Int(32), 1)); + TVM_ATTR_FIELD(learning_rate) + .describe("learning_rate") + .set_default(0.1); + } +}; +} +} + +TEST(Attrs, Basic) { + using namespace tvm; + using namespace tvm::test; + std::shared_ptr n = std::make_shared(); + try { + n->InitBySeq("axis", 10); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + } + try { + n->InitBySeq("axis", 12, "name", "111"); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + } + + try { + n->InitBySeq("axisx", 12, "name", "111"); + LOG(FATAL) << "bad"; + } catch (const tvm::AttrError& e) { + std::string what = e.what(); + CHECK(what.find("expr : Expr, default=1") != std::string::npos); + 
CHECK(what.find("axisx") != std::string::npos); + } + n->InitBySeq("learning_rate", Expr(1), "expr", 128, "name", "xx"); + CHECK_EQ(n->learning_rate, 1.0); + + n->InitBySeq("name", "xxx", "expr", 128); + CHECK_EQ(n->name, "xxx"); + CHECK_EQ(n->axis, 10); + CHECK_EQ(n->expr.as()->value, 128); + // Check docstring + std::ostringstream os; + n->PrintDocString(os); + LOG(INFO) << "docstring\n"<< os.str(); + CHECK(os.str().find("expr : Expr, default=1") != std::string::npos); +} + + +int main(int argc, char ** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/python/unittest/test_lang_reflection.py b/tests/python/unittest/test_lang_reflection.py index fefb8771a812..2ba67b8d9c86 100644 --- a/tests/python/unittest/test_lang_reflection.py +++ b/tests/python/unittest/test_lang_reflection.py @@ -36,6 +36,31 @@ def test_make_node(): assert AA.op == A.op assert AA.value_index == A.value_index + +def test_make_attrs(): + try: + x = tvm.make.node("attrs.TestAttrs", unknown_key=1, name="xx") + assert False + except tvm.TVMError as e: + assert str(e).find("unknown_key") != -1 + + try: + x = tvm.make.node("attrs.TestAttrs", axis=100, name="xx") + assert False + except tvm.TVMError as e: + assert str(e).find("upper bound") != -1 + + x = tvm.make.node("attrs.TestAttrs", name="xx", padding=(3,4)) + assert x.name == "xx" + assert x.padding[0].value == 3 + assert x.padding[1].value == 4 + assert x.axis == 10 + + dattr = tvm.make.node("DictAttrs", x=1, y=10, name="xyz", padding=(0,0)) + assert dattr.x.value == 1 + + + def test_make_sum(): A = tvm.placeholder((2, 10), name='A') k = tvm.reduce_axis((0,10), "k") @@ -46,6 +71,7 @@ def test_make_sum(): assert BB.op.body[0].combiner is not None if __name__ == "__main__": + test_make_attrs() test_make_node() test_make_smap() test_const_saveload_json() From 4dc21bdb29fdaecd0a45982c31f4dc4cae99238a Mon Sep 17 00:00:00 2001 From: Pariksheet 
Pinjari Date: Mon, 20 Aug 2018 22:43:35 +0530 Subject: [PATCH 39/77] [NNVM][DARKNET]Yolo and Upsample frontend support (#1501) * Yolo and Upsample frontend support * Lint fix * Mac support added * Code clean and trigger CI --- nnvm/python/nnvm/frontend/darknet.py | 32 +++++++- nnvm/python/nnvm/testing/darknet.py | 25 +++++- nnvm/python/nnvm/top/vision.py | 15 ++++ nnvm/src/top/vision/yolo/yolo.cc | 33 ++++++++ .../python/frontend/darknet/test_forward.py | 30 +++++++- tutorials/nnvm/from_darknet.py | 77 +++++++++---------- 6 files changed, 165 insertions(+), 47 deletions(-) create mode 100644 nnvm/src/top/vision/yolo/yolo.cc diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 3aa36b7e7ef9..7fb3e34750c8 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -32,8 +32,12 @@ class LAYERTYPE(object): NETWORK = 20 XNOR = 21 REGION = 22 - REORG = 23 - BLANK = 24 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 class ACTIVATION(object): """Darknet ACTIVATION Class constant.""" @@ -257,6 +261,12 @@ def _darknet_reshape(inputs, attrs): new_attrs['shape'] = _darknet_required_attr(attrs, 'shape') return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None +def _darknet_upsampling(inputs, attrs): + """Process the upsampling operation.""" + op_name, new_attrs = 'upsampling', {} + new_attrs['scale'] = attrs.get('scale', 1) + return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + def _darknet_softmax_output(inputs, attrs): """Process the softmax operation.""" temperature = attrs.get('temperature', 1) @@ -298,6 +308,15 @@ def _darknet_region(inputs, attrs): new_attrs['softmax'] = attrs.get('softmax', 0) return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None +def _darknet_yolo(inputs, attrs): + """Process the yolo operation.""" + op_name, new_attrs = 'yolov3_yolo', {} + if 'n' in attrs: + new_attrs['n'] = attrs.get('n', 1) + if 'classes' in 
attrs: + new_attrs['classes'] = attrs.get('classes', 1) + return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + def _darknet_activations(inputs, attrs): """Process the activation function.""" act = _darknet_required_attr(attrs, 'activation') @@ -350,6 +369,8 @@ def _darknet_op_not_support(inputs, attrs): LAYERTYPE.REORG : _darknet_reorg, LAYERTYPE.REGION : _darknet_region, LAYERTYPE.SHORTCUT : _darknet_shortcut, + LAYERTYPE.UPSAMPLE : _darknet_upsampling, + LAYERTYPE.YOLO : _darknet_yolo, LAYERTYPE.DETECTION : _darknet_op_not_support, LAYERTYPE.CROP : _darknet_op_not_support, LAYERTYPE.COST : _darknet_op_not_support, @@ -575,6 +596,13 @@ def _get_darknet_attrs(self, layer, layer_num): attr.update({'coords' : layer.coords}) attr.update({'background' : layer.background}) attr.update({'softmax' : layer.softmax}) + + elif LAYERTYPE.YOLO == layer.type: + attr.update({'n' : layer.n}) + attr.update({'classes' : layer.classes}) + + elif LAYERTYPE.UPSAMPLE == layer.type: + attr.update({'scale' : layer.stride}) else: err = "Darknet layer type {} is not supported in nnvm.".format(layer.type) raise NotImplementedError(err) diff --git a/nnvm/python/nnvm/testing/darknet.py b/nnvm/python/nnvm/testing/darknet.py index e3d110e9605e..9a346e01b50b 100644 --- a/nnvm/python/nnvm/testing/darknet.py +++ b/nnvm/python/nnvm/testing/darknet.py @@ -115,8 +115,12 @@ class LAYERTYPE(object): NETWORK = 20 XNOR = 21 REGION = 22 - REORG = 23 - BLANK = 24 + YOLO = 23 + REORG = 24 + UPSAMPLE = 25 + LOGXENT = 26 + L2NORM = 27 + BLANK = 28 class ACTIVATION(object): """Darknet ACTIVATION Class constant.""" @@ -182,12 +186,16 @@ class ACTIVATION(object): NETWORK, XNOR, REGION, + YOLO, REORG, + UPSAMPLE, + LOGXENT, + L2NORM, BLANK } LAYERTYPE; typedef enum{ - SSE, MASKED, LONE, SEG, SMOOTH + SSE, MASKED, L1, SEG, SMOOTH, WGAN } COSTTYPE; @@ -241,18 +249,20 @@ class ACTIVATION(object): float shift; float ratio; float learning_rate_scale; + float clip; int softmax; int classes; int coords; 
int background; int rescore; int objectness; - int does_cost; int joint; int noadjust; int reorg; int log; int tanh; + int *mask; + int total; float alpha; float beta; @@ -265,13 +275,17 @@ class ACTIVATION(object): float class_scale; int bias_match; int random; + float ignore_thresh; + float truth_thresh; float thresh; + float focus; int classfix; int absolute; int onlyforward; int stopbackward; int dontload; + int dontsave; int dontloadscales; float temperature; @@ -309,6 +323,7 @@ class ACTIVATION(object): float * delta; float * output; + float * loss; float * squared; float * norms; @@ -462,6 +477,7 @@ class ACTIVATION(object): int train; int index; float *cost; + float clip; } network; @@ -491,6 +507,7 @@ class ACTIVATION(object): layer make_region_layer(int batch, int w, int h, int n, int classes, int coords); layer make_softmax_layer(int batch, int inputs, int groups); layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam); +layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize); layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index f2e12c0f367a..e59b2bdfe6d9 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -38,6 +38,21 @@ def schedule_region(attrs, outs, target): reg.register_pattern("yolo_region", OpPattern.OPAQUE) +@reg.register_compute("yolov3_yolo") +def compute_yolo(attrs, inputs, _): + """Compute definition of yolo""" + n = attrs.get_int("n") + classes = attrs.get_int("classes") + return topi.vision.yolo.yolo(inputs[0], n, classes) + 
+@reg.register_schedule("yolov3_yolo") +def schedule_yolo(attrs, outs, target): + """Schedule definition of yolo""" + with tvm.target.create(target): + return topi.generic.schedule_injective(outs) + +reg.register_pattern("yolov3_yolo", OpPattern.OPAQUE) + # multibox_prior @reg.register_schedule("multibox_prior") def schedule_multibox_prior(_, outs, target): diff --git a/nnvm/src/top/vision/yolo/yolo.cc b/nnvm/src/top/vision/yolo/yolo.cc new file mode 100644 index 000000000000..4800f4371f9d --- /dev/null +++ b/nnvm/src/top/vision/yolo/yolo.cc @@ -0,0 +1,33 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file yolo.cc + * \brief Property def of yolo operators. + */ +#include +#include +#include +#include +#include "../../elemwise_op_common.h" + +namespace nnvm { +namespace top { + +NNVM_REGISTER_OP(yolov3_yolo) +.describe(R"code(Yolo layer +)code" NNVM_ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_support_level(5) +.add_argument("data", "Tensor", "Input data") +.set_attr("FInferShape", ElemwiseShape<1, 1>) +.set_attr( + "FInplaceOption", + [](const NodeAttrs &attrs) { + return std::vector>{{0, 0}, {1, 0}}; + }) +.set_attr("FGradient", [](const NodePtr &n, + const std::vector &ograds) { + return std::vector{ograds[0], ograds[0]}; +}); +} // namespace top +} // namespace nnvm diff --git a/nnvm/tests/python/frontend/darknet/test_forward.py b/nnvm/tests/python/frontend/darknet/test_forward.py index 5fc71a86211e..3d7d06b48483 100644 --- a/nnvm/tests/python/frontend/darknet/test_forward.py +++ b/nnvm/tests/python/frontend/darknet/test_forward.py @@ -44,7 +44,7 @@ def _download(url, path, overwrite=False, sizecompare=False): except: urllib.urlretrieve(url, path) -DARKNET_LIB = 'libdarknet.so' +DARKNET_LIB = 'libdarknet2.0.so' DARKNETLIB_URL = 'https://github.com/siju-samuel/darknet/blob/master/lib/' \ + DARKNET_LIB + '?raw=true' _download(DARKNETLIB_URL, DARKNET_LIB) @@ -239,6 +239,8 @@ def test_forward_shortcut(): layer_2 = LIB.make_convolutional_layer(1, 
111, 111, 32, 32, 1, 1, 1, 0, 1, 0, 0, 0, 0) layer_3 = LIB.make_shortcut_layer(1, 0, 111, 111, 32, 111, 111, 32) layer_3.activation = 1 + layer_3.alpha = 1 + layer_3.beta = 1 net.layers[0] = layer_1 net.layers[1] = layer_2 net.layers[2] = layer_3 @@ -272,6 +274,30 @@ def test_forward_region(): test_forward(net) LIB.free_network(net) +def test_forward_yolo_op(): + '''test yolo layer''' + net = LIB.make_network(2) + layer_1 = LIB.make_convolutional_layer(1, 224, 224, 3, 14, 1, 3, 2, 0, 1, 0, 0, 0, 0) + a = [] + layer_2 = LIB.make_yolo_layer(1, 111, 111, 2, 0, a, 2) + net.layers[0] = layer_1 + net.layers[1] = layer_2 + net.w = net.h = 224 + LIB.resize_network(net, 224, 224) + test_forward(net) + LIB.free_network(net) + +def test_forward_upsample(): + '''test upsample layer''' + net = LIB.make_network(1) + layer = LIB.make_upsample_layer(1, 19, 19, 3, 3) + layer.scale = 1 + net.layers[0] = layer + net.w = net.h = 19 + LIB.resize_network(net, 19, 19) + test_forward(net) + LIB.free_network(net) + def test_forward_elu(): '''test elu activation layer''' net = LIB.make_network(1) @@ -428,6 +454,8 @@ def test_forward_activation_logistic(): test_forward_rnn() test_forward_reorg() test_forward_region() + test_forward_yolo_op() + test_forward_upsample() test_forward_elu() test_forward_rnn() test_forward_crnn() diff --git a/tutorials/nnvm/from_darknet.py b/tutorials/nnvm/from_darknet.py index 883026f2af98..c6b70cf59413 100644 --- a/tutorials/nnvm/from_darknet.py +++ b/tutorials/nnvm/from_darknet.py @@ -22,54 +22,48 @@ import numpy as np import tvm import os +import sys from ctypes import * from tvm.contrib.download import download from nnvm.testing.darknet import __darknetffi__ -###################################################################### -# Set the parameters here. 
-# Supported models alexnet, resnet50, resnet152, extraction, yolo -# -model_name = 'yolo' -test_image = 'dog.jpg' -target = 'llvm' -ctx = tvm.cpu(0) +#Model name +MODEL_NAME = 'yolo' ###################################################################### -# Prepare cfg and weights file -# ---------------------------- -# Pretrained model available https://pjreddie.com/darknet/imagenet/ -# Download cfg and weights file first time. +# Download required files +# ----------------------- +# Download cfg and weights file if first time. +CFG_NAME = MODEL_NAME + '.cfg' +WEIGHTS_NAME = MODEL_NAME + '.weights' +REPO_URL = 'https://github.com/siju-samuel/darknet/blob/master/' +CFG_URL = REPO_URL + 'cfg/' + CFG_NAME + '?raw=true' +WEIGHTS_URL = REPO_URL + 'weights/' + WEIGHTS_NAME + '?raw=true' + +download(CFG_URL, CFG_NAME) +download(WEIGHTS_URL, WEIGHTS_NAME) -cfg_name = model_name + '.cfg' -weights_name = model_name + '.weights' -cfg_url = 'https://github.com/siju-samuel/darknet/blob/master/cfg/' + \ - cfg_name + '?raw=true' -weights_url = 'http://pjreddie.com/media/files/' + weights_name + '?raw=true' - -download(cfg_url, cfg_name) -download(weights_url, weights_name) - -###################################################################### # Download and Load darknet library -# --------------------------------- - -darknet_lib = 'libdarknet.so' -darknetlib_url = 'https://github.com/siju-samuel/darknet/blob/master/lib/' + \ - darknet_lib + '?raw=true' -download(darknetlib_url, darknet_lib) - -#if the file doesnt exist, then exit normally. 
-if os.path.isfile('./' + darknet_lib) is False: - exit(0) - -darknet_lib = __darknetffi__.dlopen('./' + darknet_lib) -cfg = "./" + str(cfg_name) -weights = "./" + str(weights_name) -net = darknet_lib.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) +if sys.platform in ['linux', 'linux2']: + DARKNET_LIB = 'libdarknet2.0.so' + DARKNET_URL = REPO_URL + 'lib/' + DARKNET_LIB + '?raw=true' +elif sys.platform == 'darwin': + DARKNET_LIB = 'libdarknet_mac2.0.so' + DARKNET_URL = REPO_URL + 'lib_osx/' + DARKNET_LIB + '?raw=true' +else: + err = "Darknet lib is not supported on {} platform".format(sys.platform) + raise NotImplementedError(err) + +download(DARKNET_URL, DARKNET_LIB) + +DARKNET_LIB = __darknetffi__.dlopen('./' + DARKNET_LIB) +cfg = "./" + str(CFG_NAME) +weights = "./" + str(WEIGHTS_NAME) +net = DARKNET_LIB.load_network(cfg.encode('utf-8'), weights.encode('utf-8'), 0) dtype = 'float32' batch_size = 1 + print("Converting darknet to nnvm symbols...") sym, params = nnvm.frontend.darknet.from_darknet(net, dtype) @@ -77,7 +71,9 @@ # Compile the model on NNVM # ------------------------- # compile the model -data = np.empty([batch_size, net.c ,net.h, net.w], dtype); +target = 'llvm' +ctx = tvm.cpu(0) +data = np.empty([batch_size, net.c, net.h, net.w], dtype) shape = {'data': data.shape} print("Compiling the model...") with nnvm.compiler.build_config(opt_level=2): @@ -103,6 +99,7 @@ def save_lib(): ###################################################################### # Load a test image # -------------------------------------------------------------------- +test_image = 'dog.jpg' print("Loading the test image...") img_url = 'https://github.com/siju-samuel/darknet/blob/master/data/' + \ test_image +'?raw=true' @@ -134,7 +131,7 @@ def save_lib(): hier_thresh = 0.5 img = nnvm.testing.darknet.load_image_color(test_image) _, im_h, im_w = img.shape -probs= [] +probs = [] boxes = [] region_layer = net.layers[net.n - 1] boxes, probs = 
nnvm.testing.yolo2_detection.get_region_boxes(region_layer, im_w, im_h, net.w, net.h, @@ -157,5 +154,5 @@ def save_lib(): nnvm.testing.yolo2_detection.draw_detections(img, region_layer.w*region_layer.h*region_layer.n, thresh, boxes, probs, names, region_layer.classes) -plt.imshow(img.transpose(1,2,0)) +plt.imshow(img.transpose(1, 2, 0)) plt.show() From 5d533ec99b2b6b53f9abfa2f8844a82f180043de Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 20 Aug 2018 13:27:31 -0700 Subject: [PATCH 40/77] Improve x86 Inception (#1506) * Improve x86 pooling and concat * Fix * Fix test concatenate correct layout * Add conditional vectorize * Fix lint * Modify schedule for global pooling * Fix * Fix warning * Fix alter layout test * Remove vectorization for pooling when using 4D layout * Remove vectorization for 4D concat * Fix concatenate layout * Fix concatenate schedule * Fix concat * Fix lint * Fix concat * Simplify pooling logic * Update docstring * Fix test topi pooling * Small changes --- nnvm/python/nnvm/top/nn.py | 10 ++-- nnvm/python/nnvm/top/transform.py | 8 +++- nnvm/src/top/tensor/transform.cc | 24 ++++++++-- .../python/unittest/test_correct_layout.py | 21 +++++++-- topi/include/topi/nn/pooling.h | 12 ++--- topi/python/topi/cuda/pooling.py | 7 ++- topi/python/topi/generic/injective.py | 17 +++++++ topi/python/topi/generic/nn.py | 5 +- topi/python/topi/opengl/pooling.py | 7 ++- topi/python/topi/x86/injective.py | 46 +++++++++++++++++++ topi/python/topi/x86/pooling.py | 44 +++++++++++++++--- topi/tests/python/test_topi_pooling.py | 6 ++- 12 files changed, 174 insertions(+), 33 deletions(-) diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py index f9a2c2813a04..b452738123c3 100644 --- a/nnvm/python/nnvm/top/nn.py +++ b/nnvm/python/nnvm/top/nn.py @@ -280,20 +280,22 @@ def schedule_conv2d_transpose(attrs, outs, target): # max_pool2d @reg.register_schedule("max_pool2d") -def schedule_max_pool2d(_, outs, target): +def schedule_max_pool2d(attrs, outs, target): 
"""Schedule definition of max_pool2d""" + layout = attrs["layout"] with tvm.target.create(target): - return topi.generic.schedule_pool(outs) + return topi.generic.schedule_pool(outs, layout) reg.register_pattern("max_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) # avg_pool2d @reg.register_schedule("avg_pool2d") -def schedule_avg_pool2d(_, outs, target): +def schedule_avg_pool2d(attrs, outs, target): """Schedule definition of avg_pool2d""" + layout = attrs["layout"] with tvm.target.create(target): - return topi.generic.schedule_pool(outs) + return topi.generic.schedule_pool(outs, layout) reg.register_pattern("avg_pool2d", OpPattern.OUT_ELEMWISE_FUSABLE) diff --git a/nnvm/python/nnvm/top/transform.py b/nnvm/python/nnvm/top/transform.py index facb345c1abe..594007239d4a 100644 --- a/nnvm/python/nnvm/top/transform.py +++ b/nnvm/python/nnvm/top/transform.py @@ -2,6 +2,7 @@ """Tensor transformation ops""" from __future__ import absolute_import +import tvm import topi from .tensor import _fschedule_broadcast, _fschedule_injective from . 
import registry as reg @@ -58,8 +59,13 @@ def compute_reshape_like(attrs, inputs, out_info): reg.register_schedule("squeeze", _fschedule_injective) # concatenate +@reg.register_schedule("concatenate") +def schedule_concatenate(_, outs, target): + """Schedule definition of concatenate""" + with tvm.target.create(target): + return topi.generic.schedule_concatenate(outs) + reg.register_pattern("concatenate", OpPattern.INJECTIVE) -reg.register_schedule("concatenate", _fschedule_injective) # split reg.register_pattern("split", OpPattern.INJECTIVE) diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc index 52dca5654838..b1485438ca50 100644 --- a/nnvm/src/top/tensor/transform.cc +++ b/nnvm/src/top/tensor/transform.cc @@ -129,15 +129,31 @@ inline bool ConcatenateCorrectLayout(const NodeAttrs& attrs, std::vector *ilayouts, const std::vector *last_ilayouts, std::vector *olayouts) { + const ConcatenateParam& param = nnvm::get(attrs.parsed); CHECK_EQ(ilayouts->size(), last_ilayouts->size()); CHECK_EQ(olayouts->size(), 1U); - for (size_t i = 0; i < ilayouts->size(); ++i) { - const Layout& input = last_ilayouts->at(i).defined() ? 
- last_ilayouts->at(i) : ilayouts->at(i); - NNVM_ASSIGN_LAYOUT(*ilayouts, i, input); + Layout layout; + if (!ilayouts->at(0).defined()) { + layout = last_ilayouts->at(0); + } else if (param.axis >= static_cast(ilayouts->at(0).ndim())) { + CHECK(last_ilayouts->at(0).defined()) + << "Current input layout " << ilayouts->at(0) + << " is invalid but last input layout is not " + "defined for the first input."; + layout = last_ilayouts->at(0); + } else if (last_ilayouts->at(0).defined() + && ilayouts->at(0)[param.axis] + != last_ilayouts->at(0)[param.axis]) { + layout = last_ilayouts->at(0); + } else { + layout = ilayouts->at(0); } + for (size_t i = 0; i < ilayouts->size(); ++i) { + NNVM_ASSIGN_LAYOUT(*ilayouts, i, layout); + } + NNVM_ASSIGN_LAYOUT(*olayouts, 0, layout); return true; } diff --git a/nnvm/tests/python/unittest/test_correct_layout.py b/nnvm/tests/python/unittest/test_correct_layout.py index 6176586284a7..8961498a579e 100644 --- a/nnvm/tests/python/unittest/test_correct_layout.py +++ b/nnvm/tests/python/unittest/test_correct_layout.py @@ -77,14 +77,25 @@ def test_concatenate(): g, ldict = correct_layout(z, {"x": "HW", "y": "HW"}) assert(ldict["x"][0] == "HW") assert(ldict["y"][0] == "HW") - assert(ldict["concat"][0] == "__undef__") + assert(ldict["concat"][0] == "HW") # second pass will insert layout transform _, ldict = correct_layout(g, {"x": "HW16w", "y": "HW16w"}) assert(ldict["x"][0] == "HW16w") assert(ldict["y"][0] == "HW16w") - assert(ldict["x_HW"][0] == "HW") - assert(ldict["y_HW"][0] == "HW") - assert(ldict["concat"][0] == "__undef__") + assert(ldict["concat"][0] == "HW16w") + + x1 = sym.Variable("x", shape=(10, 20, 60)) + x2 = sym.Variable("y", shape=(10, 20, 40)) + z = sym.concatenate(x1, x2, axis=2, name="concat") + g, ldict = correct_layout(z, {"x": "H20wW", "y": "H20wW"}) + assert(ldict["x"][0] == "H20wW") + assert(ldict["y"][0] == "H20wW") + assert(ldict["concat"][0] == "H20wW") + # second pass will insert layout transform + _, ldict = 
correct_layout(g, {"x": "HW", "y": "HW"}) + assert(ldict["x_H20wW"][0] == "H20wW") + assert(ldict["x_H20wW"][0] == "H20wW") + assert(ldict["concat"][0] == "H20wW") def test_expand_dims(): @@ -349,4 +360,4 @@ def test_reduce(): test_transpose() test_broadcast_to() test_broadcast_binary() - test_reduce() \ No newline at end of file + test_reduce() diff --git a/topi/include/topi/nn/pooling.h b/topi/include/topi/nn/pooling.h index 26d61d42991d..ca318adfe6cb 100644 --- a/topi/include/topi/nn/pooling.h +++ b/topi/include/topi/nn/pooling.h @@ -112,18 +112,18 @@ inline Tensor pool_impl(const Tensor& x, }, "tensor", "pool_max"); } else if (pool_type == kAvgPool) { auto temp = do_pad ? pad(x, pad_before, pad_after, 0, "pad_temp") : x; - auto tsum = tvm::compute(out_shape, [&](const Array& output) { + auto tavg = [&](const Array& output, Expr divide_factor) { Array indices; for (const Var& var : output) indices.push_back(var); indices.Set(height_axis, output[height_axis] * stride_height + dheight); indices.Set(width_axis, output[width_axis] * stride_width + dwidth); - return tvm::sum(temp(indices), { dheight, dwidth }); - }, "tensor", "pool_avg"); + return tvm::sum(temp(indices) / divide_factor, { dheight, dwidth }); + }; return tvm::compute(out_shape, [&](const Array& output) { if (count_include_pad) { - return tsum(output) / (kernel_height * kernel_width); + return tavg(output, kernel_height * kernel_width); } else { Expr h_start = output[height_axis] * stride_height - pad_top; Expr w_start = output[width_axis] * stride_width - pad_left; @@ -133,9 +133,9 @@ inline Tensor pool_impl(const Tensor& x, w_start = ir::Max::make(w_start, make_const(Int(32), 0)); Expr divide_factor = ir::Max::make((h_end - h_start) * (w_end - w_start), make_const(Int(32), 1)); - return tsum(output) / divide_factor; + return tavg(output, divide_factor); } - }, "tensor", kElementWise); + }, "tensor", "pool_avg"); } else { LOG(ERROR) << "Unrecognized pool_type: " << pool_type; return x; diff --git 
a/topi/python/topi/cuda/pooling.py b/topi/python/topi/cuda/pooling.py index 637f664fbd36..6b36e9a8743f 100644 --- a/topi/python/topi/cuda/pooling.py +++ b/topi/python/topi/cuda/pooling.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" import tvm from .. import tag @@ -70,7 +70,7 @@ def traverse(OP): @generic.schedule_pool.register(["cuda", "gpu"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool. Parameters @@ -79,6 +79,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. + Returns ------- s: Schedule diff --git a/topi/python/topi/generic/injective.py b/topi/python/topi/generic/injective.py index 0a9e394661af..975e4c11ea41 100644 --- a/topi/python/topi/generic/injective.py +++ b/topi/python/topi/generic/injective.py @@ -29,5 +29,22 @@ def schedule_injective(outs): s[x].fuse(s[x].op.axis) return s +@tvm.target.generic_func +def schedule_concatenate(outs): + """Schedule for concatenate op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of reduce in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + return schedule_injective(outs) + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 1e01adb899b7..874decc792ec 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -282,7 +282,7 @@ def schedule_dense(outs): @tvm.target.override_native_generic_func("schedule_pool") -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool Parameters @@ -291,6 +291,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. 
+ layout: str + Data layout. + Returns ------- sch: Schedule diff --git a/topi/python/topi/opengl/pooling.py b/topi/python/topi/opengl/pooling.py index 8195ea91d8a6..d6dbf0eac5c2 100644 --- a/topi/python/topi/opengl/pooling.py +++ b/topi/python/topi/opengl/pooling.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, unused-variable +# pylint: disable=invalid-name, unused-variable, unused-argument """Schedule for pooling operators""" import tvm from .. import tag @@ -54,7 +54,7 @@ def traverse(OP): @generic.schedule_pool.register(["opengl"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool. Parameters @@ -63,6 +63,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. + Returns ------- s: Schedule diff --git a/topi/python/topi/x86/injective.py b/topi/python/topi/x86/injective.py index b43ebb98b82f..ac552903ad7f 100644 --- a/topi/python/topi/x86/injective.py +++ b/topi/python/topi/x86/injective.py @@ -33,5 +33,51 @@ def schedule_injective(outs): s[x].parallel(s[x].op.axis[0]) return s +@generic.schedule_concatenate.register(["cpu"]) +def schedule_concatenate(outs): + """X86 schedule for concatenate op. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of injective in the format + of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + def vectorize(sch, tensor, vectorize_limit): + """Internal vectorization function for concatenate.""" + inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1] + inner_length = tensor.shape[len(tensor.shape) - 1].value + if inner_length <= vectorize_limit: + sch[tensor].vectorize(inner_axis) + else: + split_factor = 1 + for i in range(vectorize_limit, 1, -1): + if inner_length % i == 0: + split_factor = i + break + if split_factor > 1: + _, inner_i = sch[tensor].split(inner_axis, split_factor) + sch[tensor].vectorize(inner_i) + + outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + x = outs[0] + s = tvm.create_schedule([x.op for x in outs]) + tvm.schedule.AutoInlineInjective(s) + if len(s[x].op.axis) >= 5: + fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1], s[x].op.axis[2]) + vectorize(s, x, 64) + s[x].parallel(fused) + elif len(s[x].op.axis) >= 3: + fused = s[x].fuse(s[x].op.axis[0], s[x].op.axis[1]) + s[x].parallel(fused) + else: + s[x].parallel(s[x].op.axis[0]) + return s + schedule_elemwise = schedule_injective schedule_broadcast = schedule_injective diff --git a/topi/python/topi/x86/pooling.py b/topi/python/topi/x86/pooling.py index 998edf7a0e16..5fce5f32afb6 100644 --- a/topi/python/topi/x86/pooling.py +++ b/topi/python/topi/x86/pooling.py @@ -4,19 +4,47 @@ from .. import generic from .. 
import tag -def _parallel_sch(sch): +def _parallel_sch(sch, oshape, do_vectorize=False): + def vectorize(fused_axis, num_parallel_axis, vectorize_limit=64): + """Internal vectorization utility function.""" + reorder_axis = [fused_axis] + for i in range(num_parallel_axis, len(sch.op.axis) - 1): + reorder_axis.append(sch.op.axis[i]) + kw, kh = sch.op.reduce_axis + fuse_k = sch.fuse(kw, kh) + c = sch.op.axis[len(sch.op.axis) - 1] + reorder_axis += [fuse_k, c] + sch.reorder(*reorder_axis) + inner_length = oshape[len(oshape) - 1].value + if inner_length <= vectorize_limit: + sch.vectorize(c) + else: + split_factor = 1 + for i in range(vectorize_limit, 1, -1): + if inner_length % i == 0: + split_factor = i + break + if split_factor > 1: + _, c_i = sch.split(c, split_factor) + sch.vectorize(c_i) + if len(sch.op.axis) >= 5: fused = sch.fuse(sch.op.axis[0], sch.op.axis[1], sch.op.axis[2]) - sch.parallel(fused) + if do_vectorize: + vectorize(fused, 3) + elif len(sch.op.axis) >= 3: fused = sch.fuse(sch.op.axis[0], sch.op.axis[1]) - sch.parallel(fused) + if do_vectorize: + vectorize(fused, 2) else: sch.parallel(sch.op.axis[0]) + return + sch.parallel(fused) @generic.schedule_pool.register(["cpu"]) -def schedule_pool(outs): +def schedule_pool(outs, layout): """Schedule for pool Parameters @@ -25,6 +53,9 @@ def schedule_pool(outs): The computation graph description of pool in the format of an array of tensors. + layout: str + Data layout. 
+ Returns ------- sch: Schedule @@ -37,7 +68,8 @@ def schedule_pool(outs): def _schedule(PaddedInput, Pool): if isinstance(PaddedInput.op, tvm.tensor.ComputeOp): s[PaddedInput].compute_inline() - _parallel_sch(s[Pool]) + do_vectorize = layout[-1] not in "HWhw" + _parallel_sch(s[Pool], outs[0].shape, do_vectorize) def traverse(OP): """Internal travserse function""" @@ -93,7 +125,7 @@ def traverse(OP): # schedule pool elif OP.tag.startswith('global_pool'): Pool = OP.output(0) - _parallel_sch(s[Pool]) + _parallel_sch(s[Pool], outs[0].shape) else: raise RuntimeError("Unsupported operator: %s" % OP.tag) diff --git a/topi/tests/python/test_topi_pooling.py b/topi/tests/python/test_topi_pooling.py index c9f790146b4a..b87795743c4c 100644 --- a/topi/tests/python/test_topi_pooling.py +++ b/topi/tests/python/test_topi_pooling.py @@ -10,9 +10,11 @@ def verify_pool(n, ic, ih, kh, sh, padding, pool_type, ceil_mode, count_include_ kw = kh sw = sh pt, pl, pb, pr = padding + layout = "NCHW" A = tvm.placeholder((n, ic, ih, iw), name='A') B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding, - pool_type=pool_type, ceil_mode=ceil_mode, count_include_pad=count_include_pad) + pool_type=pool_type, ceil_mode=ceil_mode, + layout="NCHW", count_include_pad=count_include_pad) B = topi.nn.relu(B) dtype = A.dtype @@ -54,7 +56,7 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - s = topi.generic.schedule_pool(B) + s = topi.generic.schedule_pool(B, layout) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), ctx) From 7cb85d81968cd69576d923852d812590b93cc26d Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 20 Aug 2018 16:28:28 -0700 Subject: [PATCH 41/77] [VERSION] Update to 0.5.dev (#1623) * [VERSION] Update to 0.5.dev * Update the docs to include all intrins --- NEWS.md | 63 +++++++++++++++++++++++++++ conda/nnvm/meta.yaml | 2 +- conda/topi/meta.yaml | 2 +- 
conda/tvm-libs/meta.yaml | 2 +- conda/tvm/meta.yaml | 2 +- docs/api/python/intrin.rst | 6 +++ include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/libinfo.py | 6 ++- python/update_version.py | 66 +++++++++++++++++++++++++++++ web/tvm_runtime.js | 2 +- 10 files changed, 145 insertions(+), 8 deletions(-) create mode 100644 python/update_version.py diff --git a/NEWS.md b/NEWS.md index 567aabf3fcbd..2c2f616cb2f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,69 @@ Refer to the Roadmap issue for complete list on on-going version features. If you check in something that is not reflected in Roadmap issue, please reply to that issue so it can get added. +## 0.4 + +This release features several major improvements. The high-level graph optimizer is now part of TVM repo. Some of the highlights are: Initial support of AutoTVM for automated optimization; customized accelerator backend VTA. + +- Tensor operator primitives + - Introduce attrs field to operator primitives(e.g. compute) to store additional metadata, the attrs can be used as hint for scheduling +- Enable embedding of asm micro-kernels +- Hybrid python programming model + - python AST based IR builder interface + - support GPU programs +- AutoTVM, Automated tuning, and scheduling + - basic autotvm infra + - GPU IR verifier + - basic autotuning tutorial + - topi integration +- ARM support + - winograd support + - initial support of ARM autotuning records +- TOPI Vision + - Generic GPU sort support(useful for vision) + - SSD operator support +- TOPI numpy consistency + - Rename all binary operators for numpy consistency: broadcast_add-> add, broadcast_sub -> subtract, broadcast_mul -> multiply, broadcast_div->divide + - New operators: slice, LRN, equal, not_equal, less, greater + - tutorials on topi +- Initial low-bit operator support + - Optimized popcount generation on ARM + - general bit-serial convolution and GEMM + - optimized low bit kernels + - parallel optimization +- New topi backend optimization
for intel graphics +- Adapt AVX schedules for SSE target +- VTA: customized accelerator backend + - custom hardware backend example + - tutorials on how to use customized accelerator +- Initial experimental support for HLS backend +- Bugfix in SPIRV code generator for vulkan +- libdevice support, enable NVPTX backend +- Introduce NDArrayContainer for managed NDarray +- RPC and Device API + - Support communication between big/small endian machines. + - RPC and device API protocol upgrade to support big-small endian communication. This is a non-backward compatible change: the latest version of the TVM runtime must be used with the RPC + - graduate rpc from contrib, tvm.contrib.rpc->tvm.rpc + - Support tracker in Android RPC, add fault tolerance for AutoTVM +- BIG.LITTLE aware threadpool +- tvm4j graph runtime that runs end to end workload in java +- DLPack support + - Support from_dlpack and to_dlpack + - Enables bridges to pytorch +- Enable link of stackvm in runtime +- Tensorflow graphdef frontend +- Keras frontend + - improved to support reuse layers, add activations +- ONNX + - gather, LRN +- CoreML frontend + - Support C-RNN and activation functions +- Fix grads for sum and expand_like +- Enhanced operator fusion for multiple elemwise branches +- Separate nnvm fusion and compilation pass +- Unified build system to cmake, customizable cmake path for vulkan, rocm, cuda + + ## 0.3 This release features numerous improvements in TOPI and backends. We make the first step toward object detection support in TOPI, featuring operators necessary for YOLO and SSDs. The topi now supports numpy-style API and operator overloading. RPC is significantly improved to support resource allocation and using a pool of devices. We are adding two new backends: WebGL for running GPUs on the browser, and Vulkan for running on next-generation graphics API.
diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index a8b47d0de118..9c045c177ff6 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: nnvm diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index af2fb4fd4228..4002f577863b 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: topi diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index dbdfd4a7701f..d6902c45a693 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: tvm-libs diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index 478e095322eb..fe53b7dd49d9 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.4.dev" %} +{% set version = "0.5.dev" %} package: name: tvm diff --git a/docs/api/python/intrin.rst b/docs/api/python/intrin.rst index 3942c57f1a04..59f695196ce8 100644 --- a/docs/api/python/intrin.rst +++ b/docs/api/python/intrin.rst @@ -6,7 +6,10 @@ tvm.intrin tvm.call_packed tvm.call_pure_intrin + tvm.call_intrin tvm.call_pure_extern + tvm.call_extern + tvm.call_llvm_intrin tvm.register_intrin_rule tvm.exp tvm.log @@ -18,7 +21,10 @@ tvm.intrin .. autofunction:: tvm.call_packed .. autofunction:: tvm.call_pure_intrin +.. autofunction:: tvm.call_intrin .. autofunction:: tvm.call_pure_extern +.. autofunction:: tvm.call_extern +.. autofunction:: tvm.call_llvm_intrin .. autofunction:: tvm.register_intrin_rule .. autofunction:: tvm.exp .. 
autofunction:: tvm.log diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 32d574340052..52499fb9186f 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -43,7 +43,7 @@ #endif // TVM version -#define TVM_VERSION "0.4.0" +#define TVM_VERSION "0.5.dev" // TVM Runtime is DLPack compatible. diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index 390849f8536d..f911829d38b1 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -100,5 +100,7 @@ def find_lib_path(name=None, search_path=None, optional=False): # current version -# We use the version of the incoming release for code that is under development -__version__ = "0.4.0" +# We use the version of the incoming release for code +# that is under development. +# The following line is set by tvm/python/update_version.py +__version__ = "0.5.dev" diff --git a/python/update_version.py b/python/update_version.py new file mode 100644 index 000000000000..9e958f109479 --- /dev/null +++ b/python/update_version.py @@ -0,0 +1,66 @@ +""" +This is the global script that set the version information of TVM. 
+This script runs and update all the locations that related to versions + +List of affected files: +- tvm-root/python/tvm/_ffi/libinfo.py +- tvm-root/include/tvm/runtime/c_runtime_api.h +- tvm-root/web/tvm_runtime.js +- tvm-root/conda/tvm/meta.yaml +- tvm-root/conda/topi/meta.yaml +- tvm-root/conda/nnvm/meta.yaml +- tvm-root/conda/tvm-libs/meta.yaml +""" +import os +import re +# current version +# We use the version of the incoming release for code +# that is under development +__version__ = "0.5.dev" + +# Implementations +def update(file_name, pattern, repl): + update = [] + hit_counter = 0 + need_update = False + for l in open(file_name): + result = re.findall(pattern, l) + if result: + assert len(result) == 1 + hit_counter += 1 + if result[0] != repl: + l = re.sub(pattern, repl, l) + need_update = True + print("%s: %s->%s" % (file_name, result[0], repl)) + else: + print("%s: version is already %s" % (file_name, repl)) + + update.append(l) + if hit_counter != 1: + raise RuntimeError("Cannot find version in %s" % file_name) + + if need_update: + with open(file_name, "w") as output_file: + for l in update: + output_file.write(l) + + +def main(): + curr_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) + proj_root = os.path.abspath(os.path.join(curr_dir, "..")) + # python path + update(os.path.join(proj_root, "python", "tvm", "_ffi", "libinfo.py"), + r"(?<=__version__ = \")[.0-9a-z]+", __version__) + # C++ header + update(os.path.join(proj_root, "include", "tvm", "runtime", "c_runtime_api.h"), + "(?<=TVM_VERSION \")[.0-9a-z]+", __version__) + # conda + for path in ["tvm", "topi", "nnvm", "tvm-libs"]: + update(os.path.join(proj_root, "conda", path, "meta.yaml"), + "(?<=version = \")[.0-9a-z]+", __version__) + # web + update(os.path.join(proj_root, "web", "tvm_runtime.js"), + "(?<=@version )[.0-9a-z]+", __version__) + +if __name__ == "__main__": + main() diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index 786745d3ce88..2eab15093b72 100644 --- 
a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -2,7 +2,7 @@ * TVM Javascript web runtime library. * * @projectname tvm - * @version 0.1 + * @version 0.5.dev */ /* eslint no-unused-vars: "off" */ /* eslint no-unexpected-multiline: "off" */ From 21e1301086f74996af2d54e81ff342aa65b6cd35 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Tue, 21 Aug 2018 12:40:23 -0500 Subject: [PATCH 42/77] Add int8 gemm recipe (#1614) --- topi/recipe/gemm/gemm_int8.py | 185 ++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 topi/recipe/gemm/gemm_int8.py diff --git a/topi/recipe/gemm/gemm_int8.py b/topi/recipe/gemm/gemm_int8.py new file mode 100644 index 000000000000..61ef97d0a2bf --- /dev/null +++ b/topi/recipe/gemm/gemm_int8.py @@ -0,0 +1,185 @@ +"Example code to perform int8 GEMM" +import logging +import sys +import numpy as np +import tvm +from tvm import autotvm + +DO_TUNING = True +PRETUNED_INDEX = 75333 + +def intrin_dot(): + n = 4 # dp4a requires operands packed by 4 + x = tvm.placeholder((n,), name='x', dtype='int8') + y = tvm.placeholder((n,), name='y', dtype='int8') + k = tvm.reduce_axis((0, n), name='k') + + z = tvm.compute( + (1,), lambda _: tvm.sum( + x[k].astype('int32') * y[k].astype('int32'), axis=k)) + + def intrin_func(ins, outs): + xx, yy = ins + zz = outs[0] + ib = tvm.ir_builder.create() + + dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a', + xx.vload(0, dtype='int8x4'), + yy.vload(0, dtype='int8x4'), + zz.vload(0))) + ib.emit(dp4a) + + body = ib.get() + return body, zz.vstore(0, 0), body + + with tvm.build_config(data_alignment=4, offset_factor=1) as cfg: + binds = {t: tvm.decl_buffer(t.shape, t.dtype, t.op.name, + data_alignment=cfg.data_alignment, + offset_factor=cfg.offset_factor, + scope='local') for t in [x, y, z]} + return tvm.decl_tensor_intrin(z.op, intrin_func, binds=binds) + + +dot = intrin_dot() + + +@autotvm.template +def gemm_int8(n, m, l): + A = tvm.placeholder((n, l), name='A', dtype='int8') + B = 
tvm.placeholder((m, l), name='B', dtype='int8') + + k = tvm.reduce_axis((0, l), name='k') + C = tvm.compute((n, m), lambda i, j: tvm.sum(A[i, k].astype('int32') * B[j, k].astype( + 'int32'), axis=k), name='C') + + cfg = autotvm.get_config() + s = tvm.create_schedule(C.op) + y, x = C.op.axis + + AA = s.cache_read(A, 'shared', [C]) + BB = s.cache_read(B, 'shared', [C]) + AL = s.cache_read(AA, 'local', [C]) + BL = s.cache_read(BB, 'local', [C]) + CC = s.cache_write(C, 'local') + + k = CC.op.reduce_axis[0] + + cfg.define_split('tile_k', cfg.axis(k), num_outputs=3, + filter=lambda entity: entity.size[2] == 4 and \ + entity.size[0] * 2 >= entity.size[1]) + + ko, kt, ki = cfg['tile_k'].apply(s, CC, k) + + s[CC].tensorize(ki, dot) + + block_x = tvm.thread_axis('blockIdx.x') + block_y = tvm.thread_axis('blockIdx.y') + thread_x = tvm.thread_axis('threadIdx.x') + thread_y = tvm.thread_axis('threadIdx.y') + + def block_size_filter(entity): + return entity.size[0] * 2 >= entity.size[1] * 2 and \ + entity.size[1] <= 16 and entity.size[3] <= 4 + cfg.define_split('tile_y', cfg.axis(y), num_outputs=4, filter=block_size_filter) + cfg.define_split('tile_x', cfg.axis(x), num_outputs=4, filter=block_size_filter) + by, tyz, ty, yi = cfg['tile_y'].apply(s, C, y) + bx, txz, tx, xi = cfg['tile_x'].apply(s, C, x) + + s[C].bind(by, block_y) + s[C].bind(bx, block_x) + s[C].bind(tyz, tvm.thread_axis('vthread')) + s[C].bind(txz, tvm.thread_axis('vthread')) + s[C].bind(ty, thread_y) + s[C].bind(tx, thread_x) + s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi) + + s[CC].compute_at(s[C], tx) + + yo, xo = CC.op.axis + s[CC].reorder(ko, kt, yo, xo, ki) + s[CC].unroll(kt) + + for stage in [AL, BL]: + s[stage].compute_at(s[CC], kt) + _, xi = s[stage].split(stage.op.axis[1], factor=4) + s[stage].vectorize(xi) + s[stage].double_buffer() + + cfg.define_knob('storage_align', [16, 48]) + for stage in [AA, BB]: + s[stage].storage_align(s[stage].op.axis[0], + cfg['storage_align'].val, 0) + 
s[stage].compute_at(s[CC], ko) + + fused = s[stage].fuse(*s[stage].op.axis) + ty, tx = s[stage].split(fused, nparts=cfg['tile_y'].size[2]) + tx, xi = s[stage].split(tx, nparts=cfg['tile_x'].size[2]) + _, xi = s[stage].split(xi, factor=16) + + s[stage].bind(ty, thread_y) + s[stage].bind(tx, thread_x) + s[stage].vectorize(xi) + + cfg.define_knob('auto_unroll_max_step', [512, 1500]) + s[C].pragma(by, 'auto_unroll_max_step', cfg['auto_unroll_max_step'].val) + s[C].pragma(by, 'unroll_explicit', False) + + cfg.add_flop(n*m*l*2) + return s, [A, B, C] + + +if __name__ == '__main__': + N = 2048 + n = m = l = N + + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + task = autotvm.task.create(gemm_int8, args=(n, m, l), target='cuda') + print(task.config_space) + + measure_option = autotvm.measure_option( + measure_func='local', number=10, n_parallel=8, timeout=20) + log_name = 'gemm_int8.log' + if DO_TUNING: + tuner = autotvm.tuner.XGBTuner(task) + tuner.tune(n_trial=1000, measure_option=measure_option, + callbacks=[autotvm.callback.log_to_file(log_name)]) + + dispatch_context = autotvm.apply_history_best(log_name) + best_config = dispatch_context.query(task.target, task.workload) + print('\nBest config:') + print(best_config) + else: + config = task.config_space.get(PRETUNED_INDEX) + dispatch_context = autotvm.task.ApplyConfig(config) + print("Using pretuned config:") + print(config) + + with dispatch_context: + with tvm.target.create('cuda'): + s, arg_bufs = gemm_int8(n, m, l) + f = tvm.build(s, arg_bufs, 'cuda', name='gemm_int8') + + ctx = tvm.context('cuda', 0) + + a_np = np.random.randint(size=(n, l), low=-128, high=127, dtype='int8') + b_np = np.random.randint(size=(m, l), low=-128, high=127, dtype='int8') + + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros((n, m), dtype='int32'), ctx) + f(a, b, c) + + np.testing.assert_allclose( + c.asnumpy(), + np.dot( + a_np.astype('int32'), + b_np.T.astype('int32')), + rtol=1e-5) + + 
num_ops = 2 * l * m * n + num_runs = 1000 + timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs) + t = timer_f(a, b, c).mean + GOPS = num_ops / (t * 1e3) / 1e6 + print("average time cost of %d runs = %g ms, %g GOPS." % + (num_runs, t * 1e3, GOPS)) From 16d3c1f6af2b1a564c89b37c6c057d007f78bf41 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 21 Aug 2018 16:35:59 -0700 Subject: [PATCH 43/77] [RUNTIME] Add TypedPackedFunc (#1626) --- include/tvm/runtime/packed_func.h | 251 ++++++++++++++++++++++++++++++ tests/cpp/packed_func_test.cc | 23 +++ 2 files changed, 274 insertions(+) diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 63e8ca7cd16b..758d03b5b18b 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -118,6 +118,163 @@ class PackedFunc { FType body_; }; +/*! + * \brief Please refer to \ref TypedPackedFuncAnchor "TypedPackedFunc" + */ +template +class TypedPackedFunc; + +/*! + * \anchor TypedPackedFuncAnchor + * \brief A PackedFunc wrapper to provide typed function signature. + * It is backed by a PackedFunc internally. + * + * TypedPackedFunc enables compile time type checking. + * TypedPackedFunc works with the runtime system: + * - It can be passed as an argument of PackedFunc. + * - It can be assigned to TVMRetValue. + * - It can be directly converted to a type-erased PackedFunc. + * + * Developers should prefer TypedPackedFunc over PackedFunc in C++ code + * as it enables compile time checking. + * We can construct a TypedPackedFunc from a lambda function + * with the same signature. + * + * \code + * // user defined lambda function. + * auto addone = [](int x)->int { + * return x + 1; + * }; + * // We can directly convert + * // lambda function to TypedPackedFunc + * TypedPackedFunc ftyped(addone); + * // invoke the function. 
+ * int y = ftyped(1); + * // Can be directly converted to PackedFunc + * PackedFunc packed = ftyped; + * \endcode + * \tparam R The return value of the function. + * \tparam Args The argument signature of the function. + */ +template +class TypedPackedFunc { + public: + /*! \brief short hand for this function type */ + using TSelf = TypedPackedFunc; + /*! \brief default constructor */ + TypedPackedFunc() {} + /*! + * \brief construct by wrapping a PackedFunc + * + * Example usage: + * \code + * PackedFunc packed([](TVMArgs args, TVMRetValue *rv) { + * int x = args[0]; + * *rv = x + 1; + * }); + * // construct from packed function + * TypedPackedFunc ftyped(packed); + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param packed The packed function + */ + explicit TypedPackedFunc(PackedFunc packed) + : packed_(packed) { + } + /*! + * \brief construct from a lambda function with the same signature. + * + * Example usage: + * \code + * auto typed_lambda = [](int x)->int { return x + 1; } + * // construct from lambda function + * TypedPackedFunc ftyped(typed_lambda); + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param typed_lambda typed lambda function. + * \tparam FLambda the type of the lambda function. + */ + template + >::value>::type> + explicit TypedPackedFunc(const FLambda& typed_lambda) { + this->AssignTypedLambda(typed_lambda); + } + /*! + * \brief copy assignment operator from typed lambda + * + * Example usage: + * \code + * // default construct, then assign from a lambda function + * TypedPackedFunc ftyped; + * ftyped = [](int x) { return x + 1; } + * // call the typed version. + * CHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param typed_lambda typed lambda function. + * \tparam FLambda the type of the lambda function. + * \returns reference to self. + */ + template + >::value>::type> + TSelf& operator=(FLambda typed_lambda) { // NOLINT(*) + this->AssignTypedLambda(typed_lambda); + return *this; + } + /*! 
+ * \brief copy assignment operator from PackedFunc. + * \param packed The packed function. + * \returns reference to self. + */ + TSelf& operator=(PackedFunc packed) { + packed_ = packed; + return *this; + } + /*! + * \brief Invoke the operator. + * \param args The arguments + * \returns The return value. + */ + inline R operator()(Args ...args) const; + /*! + * \brief convert to PackedFunc + * \return the internal PackedFunc + */ + operator PackedFunc() const { + return packed(); + } + /*! + * \return reference the internal PackedFunc + */ + const PackedFunc& packed() const { + return packed_; + } + + private: + friend class TVMRetValue; + /*! \brief The internal packed function */ + PackedFunc packed_; + /*! + * \brief Assign the packed field using a typed lambda function. + * + * \param flambda The lambda function. + * \tparam FLambda The lambda function type. + * \note We capture the lambda when possible for maximum efficiency. + */ + template + inline void AssignTypedLambda(FLambda flambda); +}; + /*! \brief Arguments into TVM functions. 
*/ class TVMArgs { public: @@ -361,6 +518,10 @@ class TVMArgValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); return *ptr(); } + template + operator TypedPackedFunc() const { + return TypedPackedFunc(operator PackedFunc()); + } operator Module() const { TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle); return *ptr(); @@ -446,6 +607,10 @@ class TVMRetValue : public TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kFuncHandle); return *ptr(); } + template + operator TypedPackedFunc() const { + return TypedPackedFunc(operator PackedFunc()); + } operator Module() const { TVM_CHECK_TYPE_CODE(type_code_, kModuleHandle); return *ptr(); @@ -512,6 +677,10 @@ class TVMRetValue : public TVMPODValue_ { this->SwitchToClass(kFuncHandle, f); return *this; } + template + TVMRetValue& operator=(const TypedPackedFunc& f) { + return operator=(f.packed()); + } TVMRetValue& operator=(Module m) { this->SwitchToClass(kModuleHandle, m); return *this; @@ -847,6 +1016,10 @@ class TVMArgsSetter { values_[i].v_handle = const_cast(&value); type_codes_[i] = kFuncHandle; } + template + void operator()(size_t i, const TypedPackedFunc& value) const { // NOLINT(*) + operator()(i, value.packed()); + } void operator()(size_t i, const Module& value) const { // NOLINT(*) values_[i].v_handle = const_cast(&value); type_codes_[i] = kModuleHandle; @@ -894,6 +1067,84 @@ inline TVMRetValue PackedFunc::operator()(Args&& ...args) const { return rv; } +namespace detail { +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... unpacked_args) { + unpack_call_dispatcher + ::run(f, args_pack, rv, + std::forward(unpacked_args)..., + args_pack[index]); + } +}; + +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... 
unpacked_args) { + *rv = R(f(std::forward(unpacked_args)...)); + } +}; + +template +struct unpack_call_dispatcher { + template + static void run(const F& f, + const TVMArgs& args_pack, + TVMRetValue* rv, + Args&&... unpacked_args) { + f(std::forward(unpacked_args)...); + } +}; + +template +inline void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) { + unpack_call_dispatcher::run(f, args, rv); +} + +template +inline R call_packed(const PackedFunc& pf, Args&& ...args) { + return R(pf(std::forward(args)...)); +} + +template +struct typed_packed_call_dispatcher { + template + static inline R run(const PackedFunc& pf, Args&& ...args) { + return pf(std::forward(args)...); + } +}; + +template<> +struct typed_packed_call_dispatcher { + template + static inline void run(const PackedFunc& pf, Args&& ...args) { + pf(std::forward(args)...); + } +}; +} // namespace detail + +template +template +inline void TypedPackedFunc::AssignTypedLambda(FType flambda) { + packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) { + detail::unpack_call(flambda, args, rv); + }); +} + +template +inline R TypedPackedFunc::operator()(Args... 
args) const { + return detail::typed_packed_call_dispatcher + ::run(packed_, std::forward(args)...); +} + // extension and node type handling namespace detail { template diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index 9b2f1df73731..abe26fabe9ea 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -135,6 +135,29 @@ TEST(PackedFunc, Type) { CHECK(get_type2("float32x2").operator Type() == Float(32, 2)); } +TEST(TypedPackedFunc, HighOrder) { + using namespace tvm; + using namespace tvm::runtime; + using Int1Func = TypedPackedFunc; + using Int2Func = TypedPackedFunc; + using BindFunc = TypedPackedFunc; + BindFunc ftyped; + ftyped = [](Int2Func f1, int value) -> Int1Func { + auto binded = [f1, value](int x) { + return f1(value, x); + }; + Int1Func x(binded); + return x; + }; + auto add = [](int x, int y) { return x + y; }; + CHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); + PackedFunc f = ftyped(Int2Func(add), 1); + CHECK_EQ(f(3).operator int(), 4); + // call the type erased version. 
+ Int1Func f1 = ftyped.packed()(Int2Func(add), 1); + CHECK_EQ(f1(3), 4); +} + // new namespoace namespace test { // register int vector as extension type From 729224b17f5ff1b64c5ee086fabc3dac61742725 Mon Sep 17 00:00:00 2001 From: eqy Date: Tue, 21 Aug 2018 18:35:03 -0700 Subject: [PATCH 44/77] check in (#1629) --- .../java/ml/dmlc/tvm/tvmrpc/MainActivity.java | 24 ++++--------------- .../app/src/main/res/layout/content_main.xml | 11 --------- .../app/src/main/res/values/strings.xml | 3 +-- 3 files changed, 5 insertions(+), 33 deletions(-) diff --git a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java index d80008bbe258..2ea4e4cb7528 100644 --- a/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java +++ b/apps/android_rpc/app/src/main/java/ml/dmlc/tvm/tvmrpc/MainActivity.java @@ -39,11 +39,9 @@ public class MainActivity extends AppCompatActivity { - private boolean skipRelaunch = true; // wait time before automatic restart of RPC Activity public static final int HANDLER_RESTART_DELAY = 5000; - private void showDialog(String title, String msg) { AlertDialog.Builder builder = new AlertDialog.Builder(this); builder.setTitle(title); @@ -91,7 +89,7 @@ private void setupRelaunch() { final Runnable rPCStarter = new Runnable() { public void run() { if (switchPersistent.isChecked()) { - System.err.println("relaunching RPC activity in 5s..."); + System.err.println("relaunching RPC activity..."); Intent intent = ((MainActivity) context).updateRPCPrefs(); startActivity(intent); } @@ -116,6 +114,7 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) { if (isChecked) { System.err.println("automatic RPC restart enabled..."); updateRPCPrefs(); + setupRelaunch(); } else { System.err.println("automatic RPC restart disabled..."); updateRPCPrefs(); @@ -123,29 +122,14 @@ public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) { } 
}); - Button startRPC = findViewById(R.id.button_start_rpc); - startRPC.setOnClickListener(new View.OnClickListener() { - public void onClick(View v) { - Intent intent = ((MainActivity) context).updateRPCPrefs(); - startActivity(intent); - } - }); - enableInputView(true); } @Override protected void onResume() { System.err.println("MainActivity onResume..."); - System.err.println("skipRelaunch: " + skipRelaunch); - // if this is the first time onResume is called, do nothing, otherwise we - // may double launch - if (!skipRelaunch) { - enableInputView(true); - setupRelaunch(); - } else { - skipRelaunch = false; - } + enableInputView(true); + setupRelaunch(); super.onResume(); } diff --git a/apps/android_rpc/app/src/main/res/layout/content_main.xml b/apps/android_rpc/app/src/main/res/layout/content_main.xml index 82be44d98451..69c1f76030df 100644 --- a/apps/android_rpc/app/src/main/res/layout/content_main.xml +++ b/apps/android_rpc/app/src/main/res/layout/content_main.xml @@ -78,15 +78,4 @@ android:textOn="@string/switch_on" /> - -