From a433b0e6736a9ae62185018f0698ddde04f4daba Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 29 Jan 2019 10:37:54 -0800 Subject: [PATCH 01/14] Add batch_dot and cpu schedule --- docs/langref/relay_op.rst | 2 + python/tvm/relay/frontend/mxnet.py | 20 ++++++++- python/tvm/relay/op/nn/_nn.py | 15 +++++++ python/tvm/relay/op/nn/nn.py | 24 +++++++++++ src/relay/op/nn/nn.cc | 63 ++++++++++++++++++++++++++++ tests/python/relay/test_op_level1.py | 44 +++++++++++++++++++ topi/python/topi/generic/nn.py | 7 ++++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/x86/nn.py | 52 +++++++++++++++++++++-- topi/python/topi/x86/util.py | 9 ++++ topi/src/topi.cc | 10 +++++ 11 files changed, 242 insertions(+), 5 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e1f38c61eb1f..4be6a01758b4 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -39,6 +39,7 @@ This level enables fully connected multi-layer perceptron. tvm.relay.nn.relu tvm.relay.nn.dropout tvm.relay.nn.batch_norm + tvm.relay.nn.batch_dot tvm.relay.nn.bias_add @@ -171,6 +172,7 @@ Level 1 Definitions .. autofunction:: tvm.relay.nn.relu .. autofunction:: tvm.relay.nn.dropout .. autofunction:: tvm.relay.nn.batch_norm +.. autofunction:: tvm.relay.nn.batch_dot .. autofunction:: tvm.relay.nn.bias_add diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index ea49a6642796..fb81d3eef990 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -268,6 +268,23 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) +def _mx_div_sqrt_dim(inputs, attrs): + ty = ir_pass.infer_type(inputs[0])._checked_type_ + sqrt_dim = _op.sqrt(_expr.const(int(ty.shape[-1]), ty.dtype)) + out = inputs[0] / sqrt_dim + print(out.astext()) + return out + + +def _mx_batch_dot(inputs, attrs): + a = ir_pass.infer_type(inputs[0]) + b = ir_pass.infer_type(inputs[1]) + + print(a.astext()) + print(b.astext()) + exit() + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -363,7 +380,8 @@ def _mx_multibox_detection(inputs, attrs): # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - + "_contrib_div_sqrt_dim": _mx_div_sqrt_dim, + "batch_dot": _mx_batch_dot, } # set identity list diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index ad67e78c5ac1..ebb7ff56007d 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -46,6 +46,21 @@ def schedule_dense(attrs, outputs, target): reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# batch_dot +@reg.register_compute("nn.batch_dot") +def compute_batch_dot(attrs, inputs, out_type, target): + """Compute definition of batch_dot""" + return [topi.nn.batch_dot(inputs[0], inputs[1])] + +@reg.register_schedule("nn.batch_dot") +def schedule_batch_dot(attrs, outputs, target): + """Schedule definition of dense""" + with target: + return topi.generic.schedule_batch_dot(outputs) + +reg.register_pattern("nn.batch_dot", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + # conv2d @reg.register_compute("nn.conv2d") def compute_conv2d(attrs, inputs, out_type, target): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index f70b0072f7db..6f1e686eabae 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -767,6 +767,30 @@ def batch_norm(data, return TupleWrapper(result, 3) +def batch_dot(x, y): + r""" + Computes dot 
product of `x` and `y` when `x` and `y` are data in batch. + + .. math:: + + \mbox{batch_dot}(x, y)[i, :, :] = \mbox{dot}(x[i, :, :], y[i, :, :]^T) + + Parameters + ---------- + x : tvm.relay.Expr + The first input. + + y : tvm.relay.Expr + The second input. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + """ + return _make.batch_dot(x, y) + + def contrib_conv2d_winograd_without_weight_transform(data, weight, tile_size, diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7ed43d0df019..33aebfca9734 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -654,5 +654,68 @@ axis to be the last item in the input shape. .set_support_level(1) .add_type_rel("BatchNorm", BatchNormRel); + +// relay.nn.batch_dot +bool BatchDotRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* x = types[0].as(); + const auto* y = types[1].as(); + if (x == nullptr || y == nullptr) return false; + if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) + << "BatchDot: batch dimension doesn't match, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + CHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) + << "BatchDot: shapes of x and y is inconsistent, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + + Array oshape = x->shape; + oshape.Set(2, y->shape[1]); + + // assign output type + reporter->Assign(types[2], TensorTypeNode::make(oshape, x->dtype)); + return true; +} + + +// Positional relay function to create dense operator used by frontend FFI. +Expr MakeBatchDot(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_dot"); + return CallNode::make(op, {x, y}, Attrs(), {}); +} + + +TVM_REGISTER_API("relay.op.nn._make.batch_dot") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeBatchDot, args, rv); + }); + + +RELAY_REGISTER_OP("nn.batch_dot") +.describe(R"code(Computes dot product of `x` and `y` when `x` and `y` are data +in batch. + +.. math:: + + batch\_dot(x, y)[i, :, :] = dot(x[i, :, :], y[i, :, :]^T) + +- **x**: `(b, m, k)` +- **y**: `(b, n, k)` +- **out**: `(b, m, n)`. + +)code" TVM_ADD_FILELINE) +.set_num_inputs(2) +.add_argument("x", "3D Tensor", "First input.") +.add_argument("y", "3D Tensor", "Second input.") +.set_support_level(1) +.add_type_rel("BatchDot", BatchDotRel); + + } // namespace relay } // namespace tvm diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 6a1662b65170..781d2937b068 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -306,6 +306,49 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +def test_batch_dot(): + # TODO!!! 
+ n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + w = relay.var("w", relay.TensorType((2, w), "float32")) + y = relay.nn.dense(x, w, units=2) + "units=2" in y.astext() + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") + + n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + wh, ww = tvm.var("wh"), tvm.var("ww") + w = relay.var("w", relay.TensorType((ww, wh), "float32")) + y = relay.nn.dense(x, w) + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32") + + n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + w = relay.var("w", relay.IncompleteType()) + y = relay.nn.dense(x, w, units=2) + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") + + x = relay.var("x", shape=(10, 5)) + w = relay.var("w", shape=(2, 5)) + z = relay.nn.dense(x, w) + + # Check result. + func = relay.Function([x, w], z) + x_data = np.random.rand(10, 5).astype('float32') + w_data = np.random.rand(2, 5).astype('float32') + ref_res = np.dot(x_data, w_data.T) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + if __name__ == "__main__": test_concatenate() @@ -319,3 +362,4 @@ def test_dense(): test_dropout() test_batch_norm() test_dense() + test_batch_dot() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 8c303e5be182..15a55eca05c6 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -410,3 +410,10 @@ def schedule_l2_normalize(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) + +@tvm.target.generic_func +def schedule_batch_dot(outs): + print('schedule_batch_dot generic') + target = tvm.target.current_target(allow_none=False) + cpp_target = cpp.TEST_create_target(target.target_name) + return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 690379135e06..4a2749465781 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,3 +18,4 @@ from .local_response_norm import * from .bitserial_conv2d import * from .l2_normalize import * +from .batch_dot import * diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index ab6dda40cc9d..41096634aca8 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -5,7 +5,7 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .util import get_fp32_len +from .util import get_fp32_len, get_max_power2_factor from .. 
import generic, tag, nn from ..util import traverse_inline, get_const_tuple @@ -112,7 +112,6 @@ def _declaration_dense_nopack(cfg, data, weight, bias=None): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") def _schedule_dense(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if "dense_pack" in op.tag: @@ -126,7 +125,6 @@ def _callback(op): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") def _schedule_dense_pack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if "dense_pack" in op.tag: @@ -138,7 +136,6 @@ def _callback(op): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") def _schedule_dense_nopack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if 'dense_nopack' in op.tag: @@ -239,3 +236,50 @@ def _default_dense_nopack_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) cfg["tile_x"] = SplitEntity([N, 1]) cfg["tile_y"] = SplitEntity([1, M]) + + +@generic.schedule_batch_dot.register(["cpu"]) +def schedule_batch_dot(outs): + """Schedule for softmax + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of softmax + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "batch_dot" in op.tag: + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + k, = s[C].op.reduce_axis + ko, ki = s[C].split(k, 16) + CC = s.rfactor(C, ki) + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 8) + x_bn = get_max_power2_factor(N, 8) + yo, yi = s[C].split(y, y_bn) + xo, xi = s[C].split(x, x_bn) + s[C].reorder(b, yo, xo, yi, xi) + bxyo = s[C].fuse(b, yo, xo) + s[C].parallel(bxyo) + s[C].fuse(yi, xi) + + s[CC].compute_at(s[C], bxyo) + _, _, y, x = s[CC].op.axis + s[CC].fuse(y, x) + s[CC].vectorize(s[CC].op.axis[0]) + s[C].pragma(bxyo, 'auto_unroll_max_step', 16) + + traverse_inline(s, outs[0].op, _callback) + + return s diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py index 678ff8e24cff..a8a19365e2ab 100644 --- a/topi/python/topi/x86/util.py +++ b/topi/python/topi/x86/util.py @@ -10,3 +10,12 @@ def get_fp32_len(): if opt == '-mcpu=skylake-avx512': fp32_vec_len = 16 return fp32_vec_len + +def get_max_power2_factor(n, max_value=None): + x = 1 + while n % 2 == 0: + if max_value is not None and max_value < x * 2: + break + x *= 2 + n /= 2 + return x diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 7adcb11c5656..b1118c582382 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -357,6 +358,12 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); +/* Ops from nn/batch_dot.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_dot") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::batch_dot(args[0], args[1]); + }); + /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -610,6 +617,9 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); +TVM_REGISTER_GENERIC_FUNC(schedule_batch_dot) 
+.set_default(WrapSchedule(topi::generic::default_schedule)); + TVM_REGISTER_GENERIC_FUNC(schedule_pool) .set_default(WrapSchedule(topi::generic::default_schedule)) .register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) From bdb10fd409a70b8457152f1b83444ddae9396a76 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 4 Feb 2019 11:22:32 -0800 Subject: [PATCH 02/14] Add relay support for batch_dot --- docs/langref/relay_op.rst | 4 +- python/tvm/relay/frontend/mxnet.py | 20 +-------- src/relay/op/nn/nn.cc | 2 +- tests/python/relay/test_op_level1.py | 45 -------------------- tests/python/relay/test_op_level10.py | 34 +++++++++++++++ topi/include/topi/nn/batch_dot.h | 50 ++++++++++++++++++++++ topi/python/topi/nn/batch_dot.py | 20 +++++++++ topi/tests/python/test_topi_batch_dot.py | 54 ++++++++++++++++++++++++ 8 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 topi/include/topi/nn/batch_dot.h create mode 100644 topi/python/topi/nn/batch_dot.py create mode 100644 topi/tests/python/test_topi_batch_dot.py diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 4be6a01758b4..2d7a4e6fcc21 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -39,7 +39,6 @@ This level enables fully connected multi-layer perceptron. tvm.relay.nn.relu tvm.relay.nn.dropout tvm.relay.nn.batch_norm - tvm.relay.nn.batch_dot tvm.relay.nn.bias_add @@ -151,6 +150,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape + tvm.relay.nn.batch_dot Level 1 Definitions @@ -172,7 +172,6 @@ Level 1 Definitions .. autofunction:: tvm.relay.nn.relu .. autofunction:: tvm.relay.nn.dropout .. autofunction:: tvm.relay.nn.batch_norm -.. autofunction:: tvm.relay.nn.batch_dot .. autofunction:: tvm.relay.nn.bias_add @@ -262,3 +261,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape +.. autofunction:: tvm.relay.nn.batch_dot diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index fb81d3eef990..e64f20964ea9 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -268,23 +268,6 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) -def _mx_div_sqrt_dim(inputs, attrs): - ty = ir_pass.infer_type(inputs[0])._checked_type_ - sqrt_dim = _op.sqrt(_expr.const(int(ty.shape[-1]), ty.dtype)) - out = inputs[0] / sqrt_dim - print(out.astext()) - return out - - -def _mx_batch_dot(inputs, attrs): - a = ir_pass.infer_type(inputs[0]) - b = ir_pass.infer_type(inputs[1]) - - print(a.astext()) - print(b.astext()) - exit() - - # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -298,6 +281,7 @@ def _mx_batch_dot(inputs, attrs): "slice_like", "zeros_like", "ones_like", + "batch_dot", ] _convert_map = { @@ -380,8 +364,6 @@ def _mx_batch_dot(inputs, attrs): # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - "_contrib_div_sqrt_dim": _mx_div_sqrt_dim, - "batch_dot": _mx_batch_dot, } # set identity list diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 33aebfca9734..434cd1af00df 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -713,7 +713,7 @@ in batch. 
.set_num_inputs(2) .add_argument("x", "3D Tensor", "First input.") .add_argument("y", "3D Tensor", "Second input.") -.set_support_level(1) +.set_support_level(10) .add_type_rel("BatchDot", BatchDotRel); diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 781d2937b068..e4f5a797beb6 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -306,50 +306,6 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) -def test_batch_dot(): - # TODO!!! - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - w = relay.var("w", relay.TensorType((2, w), "float32")) - y = relay.nn.dense(x, w, units=2) - "units=2" in y.astext() - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") - - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - wh, ww = tvm.var("wh"), tvm.var("ww") - w = relay.var("w", relay.TensorType((ww, wh), "float32")) - y = relay.nn.dense(x, w) - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32") - - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - w = relay.var("w", relay.IncompleteType()) - y = relay.nn.dense(x, w, units=2) - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") - - x = relay.var("x", shape=(10, 5)) - w = relay.var("w", shape=(2, 5)) - z = relay.nn.dense(x, w) - - # Check result. - func = relay.Function([x, w], z) - x_data = np.random.rand(10, 5).astype('float32') - w_data = np.random.rand(2, 5).astype('float32') - ref_res = np.dot(x_data, w_data.T) - - for target, ctx in ctx_list(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(x_data, w_data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - op_res2 = intrp2.evaluate(func)(x_data, w_data) - tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - - if __name__ == "__main__": test_concatenate() test_bias_add() @@ -362,4 +318,3 @@ def test_batch_dot(): test_dropout() test_batch_norm() test_dense() - test_batch_dot() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index a6e169e23a6c..84de324f2c45 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -144,8 +144,42 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) +def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): + x = relay.var("x", relay.TensorType(x_shape, dtype)) + y = relay.var("y", relay.TensorType(y_shape, dtype)) + z = relay.nn.batch_dot(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) + + func = relay.Function([x, y], z) + x_np = np.random.uniform(size=x_shape).astype(dtype) + y_np = np.random.uniform(size=y_shape).astype(dtype) + z_np = np.zeros(out_shape).astype(dtype) + for i in range(x_shape[0]): + z_np[i] = np.dot(x_np[i], y_np[i].T) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, 
target=target) + z = intrp.evaluate(func)(x_np, y_np) + tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) + +def test_batch_dot(): + b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") + x = relay.var("x", relay.TensorType((b, m, k), "float32")) + y = relay.var("y", relay.TensorType((b, n, k), "float32")) + z = relay.nn.batch_dot(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.TensorType((b, m, n), "float32") + + verify_batch_dot((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_dot((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_dot((5, 16, 32), (5, 20, 32), (5, 16, 20)) + + if __name__ == "__main__": test_collapse_sum_like() test_broadcast_to_like() test_slice_like() test_reverse_reshape() + test_batch_dot() diff --git a/topi/include/topi/nn/batch_dot.h b/topi/include/topi/nn/batch_dot.h new file mode 100644 index 000000000000..95afdf3696b6 --- /dev/null +++ b/topi/include/topi/nn/batch_dot.h @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2019 by Contributors + * \brief Batch dot op constructions + * \file nn/batch_dot.h + */ +#ifndef TOPI_NN_BATCH_DOT_H_ +#define TOPI_NN_BATCH_DOT_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Creates an operation that calculates data * weight^T + bias +* +* \param data Tensor with shape [batch, in_dim] +* \param weight Tensor with shape [out_dim, in_dim] +* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* +* \return Tensor with shape [batch, out_dim] +*/ +inline tvm::Tensor batch_dot(const tvm::Tensor& x, + const tvm::Tensor& y) { + CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; + + auto batch = x->shape[0]; + auto M = x->shape[1]; + auto K = x->shape[2]; + auto N = y->shape[1]; + + auto k = tvm::reduce_axis(Range(0, K), "k"); + auto result = tvm::compute( + { batch, M, N }, + [&](Var b, Var i, Var j) { + return tvm::sum(x(b, i, k) * y(b, j, k), { k }); + }, "tensor", "batch_dot"); + + return result; +} + +} // namespace nn +} // namespace topi + +#endif // TOPI_NN_BATCH_DOT_H_ diff --git a/topi/python/topi/nn/batch_dot.py b/topi/python/topi/nn/batch_dot.py new file mode 100644 index 000000000000..e61edd614508 --- /dev/null +++ b/topi/python/topi/nn/batch_dot.py @@ -0,0 +1,20 @@ +"""Binary Neural Network (BNN) Operators""" +from __future__ import absolute_import as _abs +import tvm +from .. 
import tag
+from ..util import get_const_tuple
+
+
+def batch_dot(x, y):
+    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_dot"
+    x_shape = get_const_tuple(x.shape)
+    y_shape = get_const_tuple(y.shape)
+    assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
+    assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
+    batch, M, K = x.shape
+    N = y.shape[1]
+    k = tvm.reduce_axis((0, K), name='k')
+
+    return tvm.compute((batch, M, N),
+                       lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k),
+                       tag='batch_dot')
diff --git a/topi/tests/python/test_topi_batch_dot.py b/topi/tests/python/test_topi_batch_dot.py
new file mode 100644
index 000000000000..d904606164f8
--- /dev/null
+++ b/topi/tests/python/test_topi_batch_dot.py
@@ -0,0 +1,54 @@
+"""Test code for batch_dot operator"""
+import numpy as np
+import tvm
+import topi
+import topi.testing
+from topi.util import get_const_tuple
+from tvm.contrib.pickle_memoize import memoize
+
+from common import get_all_backend
+
+def verify_batch_dot(batch, M, N, K):
+    x = tvm.placeholder((batch, M, K), name='x')
+    y = tvm.placeholder((batch, N, K), name='y')
+    dtype = x.dtype
+
+    # use memoize to pickle the test data for next time use
+    @memoize("topi.tests.test_topi_batch_dot")
+    def get_ref_data():
+        a_np = np.random.uniform(size=(batch, M, K)).astype(dtype)
+        b_np = np.random.uniform(size=(batch, N, K)).astype(dtype)
+        c_np = np.zeros((batch, M, N)).astype(dtype)
+        for i in range(batch):
+            c_np[i] = np.dot(a_np[i], b_np[i].T)
+        return (a_np, b_np, c_np)
+    # get the test data
+    a_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            out = topi.nn.batch_dot(x, y)
+            s = topi.generic.schedule_batch_dot([out])
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [x, y, out], device, name="dense")
+        f(a, b, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
+        check_device(device)
+
+def test_batch_dot():
+    verify_batch_dot(1, 16, 16, 32)
+    verify_batch_dot(5, 16, 16, 32)
+    verify_batch_dot(5, 16, 20, 32)
+
+
+if __name__ == "__main__":
+    test_batch_dot()

From 2ca20987a97d20af1ce4c95b024bc44b26dfd2ef Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 11 Feb 2019 14:52:34 -0800
Subject: [PATCH 03/14] Rename batch_dot to batch_matmul

---
 python/tvm/relay/frontend/mxnet.py            |  6 +++-
 python/tvm/relay/op/nn/_nn.py                 | 22 ++++++-------
 python/tvm/relay/op/nn/nn.py                  |  9 +++---
 src/relay/op/nn/nn.cc                         | 32 +++++++++----------
 tests/python/relay/test_op_level10.py         | 22 ++++++-------
 .../topi/nn/{batch_dot.h => batch_matmul.h}   | 25 +++++++--------
 topi/python/topi/generic/nn.py                |  3 +-
 topi/python/topi/nn/__init__.py               |  2 +-
 .../topi/nn/{batch_dot.py => batch_matmul.py} |  6 ++--
 topi/python/topi/testing/__init__.py          |  1 +
 topi/python/topi/x86/nn.py                    | 10 +++---
 topi/src/topi.cc                              | 10 +++---
 ...batch_dot.py => test_topi_batch_matmul.py} | 24 +++++++-------
 13 files changed, 87 insertions(+), 85 deletions(-)
 rename topi/include/topi/nn/{batch_dot.h => batch_matmul.h} (53%)
 rename topi/python/topi/nn/{batch_dot.py => batch_matmul.py} (88%)
 rename topi/tests/python/{test_topi_batch_dot.py => test_topi_batch_matmul.py} (71%)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index e64f20964ea9..26cd565c91c9 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -268,6 +268,10 @@ def _mx_multibox_detection(inputs, attrs):
     return _op.vision.nms(ret[0], ret[1], **new_attrs1)
 
 
+def _mx_batch_dot(inputs, attrs):
+    return _op.nn.batch_matmul(inputs[0], inputs[1])
+
+
 # Note: due to attribute conversion constraint
 # ops in the identity set must be attribute free
 _identity_list = [
@@ -281,7 +285,6 @@ def _mx_multibox_detection(inputs, attrs):
     "slice_like",
     "zeros_like",
     "ones_like",
-    "batch_dot",
 ]
 
 _convert_map = {
@@ -352,6 +355,7 @@ def _mx_multibox_detection(inputs, attrs):
     "expand_dims"   : _mx_expand_dims,
     "Concat"        : _mx_concat,
     "concat"        : _mx_concat,
+    "batch_dot"     : _mx_batch_dot,
     "LeakyReLU"     : _mx_leaky_relu,
     "SoftmaxOutput" : _mx_softmax_output,
     "SoftmaxActivation" : _mx_softmax_activation,
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index ebb7ff56007d..3402bbd81288 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -46,19 +46,19 @@ def schedule_dense(attrs, outputs, target):
 reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
-# batch_dot
-@reg.register_compute("nn.batch_dot")
-def compute_batch_dot(attrs, inputs, out_type, target):
-    """Compute definition of batch_dot"""
-    return [topi.nn.batch_dot(inputs[0], inputs[1])]
-
-@reg.register_schedule("nn.batch_dot")
-def schedule_batch_dot(attrs, outputs, target):
-    """Schedule definition of dense"""
+# batch_matmul
+@reg.register_compute("nn.batch_matmul")
+def compute_batch_matmul(attrs, inputs, out_type, target):
+    """Compute definition of batch_matmul"""
+    return [topi.nn.batch_matmul(inputs[0], inputs[1])]
+
+@reg.register_schedule("nn.batch_matmul")
+def schedule_batch_matmul(attrs, outputs, target):
+    """Schedule definition of batch_matmul"""
     with target:
-        return topi.generic.schedule_batch_dot(outputs)
+        return topi.generic.schedule_batch_matmul(outputs)
 
-reg.register_pattern("nn.batch_dot", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
+reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
 # conv2d
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 6f1e686eabae..ddbad9880953 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -767,13 +767,14 @@ def batch_norm(data,
     return TupleWrapper(result, 3)
 
 
-def batch_dot(x, y):
+def batch_matmul(x, y):
     r"""
-    Computes dot product of `x` and `y` when `x` and `y` are data in batch.
+    Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data
+    in batch.
 
     .. math::
 
-        \mbox{batch_dot}(x, y)[i, :, :] = \mbox{dot}(x[i, :, :], y[i, :, :]^T)
+        \mbox{batch_matmul}(x, y)[i, :, :] = \mbox{matmul}(x[i, :, :], y[i, :, :]^T)
 
     Parameters
     ----------
@@ -788,7 +789,7 @@ def batch_dot(x, y):
     result: tvm.relay.Expr
         The computed result.
     """
-    return _make.batch_dot(x, y)
+    return _make.batch_matmul(x, y)
 
 
 def contrib_conv2d_winograd_without_weight_transform(data,
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 434cd1af00df..7b997231d59f 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -655,11 +655,11 @@ axis to be the last item in the input shape.
.add_type_rel("BatchNorm", BatchNormRel); -// relay.nn.batch_dot -bool BatchDotRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { +// relay.nn.batch_matmul +bool BatchMatmulRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); const auto* x = types[0].as(); const auto* y = types[1].as(); @@ -683,27 +683,27 @@ bool BatchDotRel(const Array& types, } -// Positional relay function to create dense operator used by frontend FFI. -Expr MakeBatchDot(Expr x, - Expr y) { - static const Op& op = Op::Get("nn.batch_dot"); +// Positional relay function to create batch_matmul operator used by frontend FFI. +Expr MakeBatchMatmul(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_matmul"); return CallNode::make(op, {x, y}, Attrs(), {}); } -TVM_REGISTER_API("relay.op.nn._make.batch_dot") +TVM_REGISTER_API("relay.op.nn._make.batch_matmul") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBatchDot, args, rv); + runtime::detail::unpack_call(MakeBatchMatmul, args, rv); }); -RELAY_REGISTER_OP("nn.batch_dot") -.describe(R"code(Computes dot product of `x` and `y` when `x` and `y` are data -in batch. +RELAY_REGISTER_OP("nn.batch_matmul") +.describe(R"code(Computes matrix multiplication of `x` and `y` when `x` and `y` +are data in batch. .. math:: - batch\_dot(x, y)[i, :, :] = dot(x[i, :, :], y[i, :, :]^T) + batch\_matmul(x, y)[i, :, :] = matmul(x[i, :, :], y[i, :, :]^T) - **x**: `(b, m, k)` - **y**: `(b, n, k)` @@ -714,7 +714,7 @@ in batch. .add_argument("x", "3D Tensor", "First input.") .add_argument("y", "3D Tensor", "Second input.") .set_support_level(10) -.add_type_rel("BatchDot", BatchDotRel); +.add_type_rel("BatchMatmul", BatchMatmulRel); } // namespace relay diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 84de324f2c45..9552342f1a3a 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -4,6 +4,8 @@ import tvm from tvm import relay from tvm.relay.testing import ctx_list +import topi +import topi.testing def test_collapse_sum_like(): shape = (3, 4, 5, 6) @@ -144,19 +146,17 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) -def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): +def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): x = relay.var("x", relay.TensorType(x_shape, dtype)) y = relay.var("y", relay.TensorType(y_shape, dtype)) - z = relay.nn.batch_dot(x, y) + z = relay.nn.batch_matmul(x, y) zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) func = relay.Function([x, y], z) x_np = np.random.uniform(size=x_shape).astype(dtype) y_np = np.random.uniform(size=y_shape).astype(dtype) - z_np = np.zeros(out_shape).astype(dtype) - for i in range(x_shape[0]): - z_np[i] = np.dot(x_np[i], y_np[i].T) + z_np = topi.testing.batch_matmul(x_np, y_np) for target, ctx in ctx_list(): for kind in ["graph", "debug"]: @@ -164,17 +164,17 @@ def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): z = intrp.evaluate(func)(x_np, y_np) tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) -def test_batch_dot(): +def test_batch_matmul(): b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") x = relay.var("x", relay.TensorType((b, m, k), "float32")) y = 
relay.var("y", relay.TensorType((b, n, k), "float32")) - z = relay.nn.batch_dot(x, y) + z = relay.nn.batch_matmul(x, y) zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.TensorType((b, m, n), "float32") - verify_batch_dot((1, 16, 32), (1, 16, 32), (1, 16, 16)) - verify_batch_dot((5, 16, 32), (5, 16, 32), (5, 16, 16)) - verify_batch_dot((5, 16, 32), (5, 20, 32), (5, 16, 20)) + verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) if __name__ == "__main__": @@ -182,4 +182,4 @@ def test_batch_dot(): test_broadcast_to_like() test_slice_like() test_reverse_reshape() - test_batch_dot() + test_batch_matmul() diff --git a/topi/include/topi/nn/batch_dot.h b/topi/include/topi/nn/batch_matmul.h similarity index 53% rename from topi/include/topi/nn/batch_dot.h rename to topi/include/topi/nn/batch_matmul.h index 95afdf3696b6..b59ffe02e942 100644 --- a/topi/include/topi/nn/batch_dot.h +++ b/topi/include/topi/nn/batch_matmul.h @@ -1,10 +1,10 @@ /*! * Copyright (c) 2019 by Contributors - * \brief Batch dot op constructions - * \file nn/batch_dot.h + * \brief Batch matmul op constructions + * \file nn/batch_matmul.h */ -#ifndef TOPI_NN_BATCH_DOT_H_ -#define TOPI_NN_BATCH_DOT_H_ +#ifndef TOPI_NN_BATCH_MATMUL_H_ +#define TOPI_NN_BATCH_MATMUL_H_ #include @@ -16,16 +16,15 @@ namespace nn { using namespace tvm; /*! -* \brief Creates an operation that calculates data * weight^T + bias +* \brief Creates an operation that calculates matrix multiplication in batch. * -* \param data Tensor with shape [batch, in_dim] -* \param weight Tensor with shape [out_dim, in_dim] -* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* \param x Tensor with shape [batch, M, K] +* \param y Tensor with shape [batch, N, K] * -* \return Tensor with shape [batch, out_dim] +* \return Tensor with shape [batch, M, N] */ -inline tvm::Tensor batch_dot(const tvm::Tensor& x, - const tvm::Tensor& y) { +inline tvm::Tensor batch_matmul(const tvm::Tensor& x, + const tvm::Tensor& y) { CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; @@ -39,7 +38,7 @@ inline tvm::Tensor batch_dot(const tvm::Tensor& x, { batch, M, N }, [&](Var b, Var i, Var j) { return tvm::sum(x(b, i, k) * y(b, j, k), { k }); - }, "tensor", "batch_dot"); + }, "tensor", "batch_matmul"); return result; } @@ -47,4 +46,4 @@ inline tvm::Tensor batch_dot(const tvm::Tensor& x, } // namespace nn } // namespace topi -#endif // TOPI_NN_BATCH_DOT_H_ +#endif // TOPI_NN_BATCH_MATMUL_H_ diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 15a55eca05c6..00b742f24e64 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -412,8 +412,7 @@ def schedule_l2_normalize(outs): return cpp.generic.default_schedule(cpp_target, outs, False) @tvm.target.generic_func -def schedule_batch_dot(outs): - print('schedule_batch_dot generic') +def schedule_batch_matmul(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 4a2749465781..e23aeb003dc9 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,4 +18,4 @@ from .local_response_norm import * from .bitserial_conv2d import 
*
 from .l2_normalize import *
-from .batch_dot import *
+from .batch_matmul import *
diff --git a/topi/python/topi/nn/batch_dot.py b/topi/python/topi/nn/batch_matmul.py
similarity index 88%
rename from topi/python/topi/nn/batch_dot.py
rename to topi/python/topi/nn/batch_matmul.py
index e61edd614508..51c4a69c5dbc 100644
--- a/topi/python/topi/nn/batch_dot.py
+++ b/topi/python/topi/nn/batch_matmul.py
@@ -5,8 +5,8 @@ from ..util import get_const_tuple
 
 
-def batch_dot(x, y):
-    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_dot"
+def batch_matmul(x, y):
+    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
     x_shape = get_const_tuple(x.shape)
     y_shape = get_const_tuple(y.shape)
     assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
     assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
     batch, M, K = x.shape
     N = y.shape[1]
     k = tvm.reduce_axis((0, K), name='k')
 
     return tvm.compute((batch, M, N),
                        lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k),
-                       tag='batch_dot')
+                       tag='batch_matmul')
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index 5ea9683f72ef..e4579008667b 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -21,3 +21,4 @@
 from .l2_normalize_python import l2_normalize_python
 from .gather_nd_python import gather_nd_python
 from .strided_slice_python import strided_slice_python
+from .batch_matmul import batch_matmul
diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 41096634aca8..d5a6ca76f7ed 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -238,14 +238,14 @@ def _default_dense_nopack_config(cfg, M, N, K):
     cfg["tile_y"] = SplitEntity([1, M])
 
 
-@generic.schedule_batch_dot.register(["cpu"])
-def schedule_batch_dot(outs):
-    """Schedule for softmax
+@generic.schedule_batch_matmul.register(["cpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
 
     Parameters
     ----------
     outs: Array of Tensor
-        The computation graph description of softmax
+        The computation graph description of batch_matmul
         in the format of an array of tensors.
Returns @@ -256,7 +256,7 @@ def schedule_batch_dot(outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): - if "batch_dot" in op.tag: + if "batch_matmul" in op.tag: C = op.output(0) A, B = s[C].op.input_tensors _, M, N = get_const_tuple(C.shape) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index b1118c582382..e4ec7bf24b01 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -358,10 +358,10 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); -/* Ops from nn/batch_dot.h */ -TVM_REGISTER_GLOBAL("topi.nn.batch_dot") +/* Ops from nn/batch_matmul.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_matmul") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = nn::batch_dot(args[0], args[1]); + *rv = nn::batch_matmul(args[0], args[1]); }); /* Ops from nn/dilate.h */ @@ -617,7 +617,7 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); -TVM_REGISTER_GENERIC_FUNC(schedule_batch_dot) +TVM_REGISTER_GENERIC_FUNC(schedule_batch_matmul) .set_default(WrapSchedule(topi::generic::default_schedule)); TVM_REGISTER_GENERIC_FUNC(schedule_pool) diff --git a/topi/tests/python/test_topi_batch_dot.py b/topi/tests/python/test_topi_batch_matmul.py similarity index 71% rename from topi/tests/python/test_topi_batch_dot.py rename to topi/tests/python/test_topi_batch_matmul.py index d904606164f8..8ac3e24a3408 100644 --- a/topi/tests/python/test_topi_batch_dot.py +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -1,4 +1,4 @@ -"""Test code for batch_dot operator""" +"""Test code for batch_matmul operator""" import numpy as np import tvm import topi @@ -8,19 +8,17 @@ from common import get_all_backend -def verify_batch_dot(batch, M, N, K): +def verify_batch_matmul(batch, M, N, K): x = tvm.placeholder((batch, M, K), name='x') y = tvm.placeholder((batch, N, K), name='y') dtype = x.dtype # use memoize to pickle the test data for next time use - @memoize("topi.tests.test_topi_batch_dot") + @memoize("topi.tests.test_topi_batch_matmul") def get_ref_data(): a_np = np.random.uniform(size=(batch, M, K)).astype(dtype) b_np = np.random.uniform(size=(batch, N, K)).astype(dtype) - c_np = np.zeros((batch, M, N)).astype(dtype) - for i in range(batch): - c_np[i] = np.dot(a_np[i], b_np[i].T) + c_np = topi.testing.batch_matmul(a_np, b_np) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() @@ -32,8 +30,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.nn.batch_dot(x, y) - s = topi.generic.schedule_batch_dot([out]) + out = topi.nn.batch_matmul(x, y) + s = topi.generic.schedule_batch_matmul([out]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) @@ -44,11 +42,11 @@ def check_device(device): for device in get_all_backend(): check_device(device) -def test_batch_dot(): - verify_batch_dot(1, 16, 16, 32) - verify_batch_dot(5, 16, 16, 32) - verify_batch_dot(5, 16, 20, 32) +def test_batch_matmul(): + verify_batch_matmul(1, 16, 16, 32) + verify_batch_matmul(5, 16, 16, 32) + verify_batch_matmul(5, 16, 20, 32) if __name__ == "__main__": - test_batch_dot() + test_batch_matmul() From e20b1e8dab15a5443f55dd479cd5352585c813c4 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 11 Feb 2019 14:55:16 -0800 
Subject: [PATCH 04/14] nits --- docs/langref/relay_op.rst | 4 ++-- tests/python/relay/test_op_level10.py | 1 - topi/include/topi/nn/batch_matmul.h | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 2d7a4e6fcc21..c6702d4b4e51 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -150,7 +150,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape - tvm.relay.nn.batch_dot + tvm.relay.nn.batch_matmul Level 1 Definitions @@ -261,4 +261,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape -.. autofunction:: tvm.relay.nn.batch_dot +.. autofunction:: tvm.relay.nn.batch_matmul diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 9552342f1a3a..6d68197ff7ac 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -128,7 +128,6 @@ def verify_reverse_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reverse_reshape(x, newshape=newshape) zz = relay.ir_pass.infer_type(z) - print(zz.checked_type) assert "newshape=" in z.astext() assert zz.checked_type == relay.ty.TensorType(oshape, "float32") diff --git a/topi/include/topi/nn/batch_matmul.h b/topi/include/topi/nn/batch_matmul.h index b59ffe02e942..968e1b0c697c 100644 --- a/topi/include/topi/nn/batch_matmul.h +++ b/topi/include/topi/nn/batch_matmul.h @@ -25,8 +25,8 @@ using namespace tvm; */ inline tvm::Tensor batch_matmul(const tvm::Tensor& x, const tvm::Tensor& y) { - CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; - CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; + CHECK_EQ(x->shape.size(), 3) << "batch_matmul requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_matmul requires 3-D data"; auto batch = x->shape[0]; auto M = x->shape[1]; From 15f4b7c23acc7200d161c4c54ae5d52528901236 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 11 Feb 2019 15:32:15 -0800 Subject: [PATCH 05/14] Add missing file --- topi/python/topi/testing/batch_matmul.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 topi/python/topi/testing/batch_matmul.py diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py new file mode 100644 index 000000000000..9d3c5cc47a85 --- /dev/null +++ b/topi/python/topi/testing/batch_matmul.py @@ -0,0 +1,11 @@ +# pylint: disable=invalid-name +"""Batch matmul in python""" +import numpy as np + +def batch_matmul(x, y): + batch, M, K = x.shape + N = y.shape[1] + out = np.zeros((batch, M, N)).astype(x.dtype) + for i in range(batch): + out[i] = np.dot(x[i], y[i].T) + return out From 779b4590184eb60a88c7427ce98a2953d301d197 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 12 Feb 2019 00:40:53 +0000 Subject: [PATCH 06/14] Put batch_matmul and dense x86 schedule in separate files --- python/tvm/relay/frontend/mxnet.py | 5 + topi/python/topi/x86/batch_matmul.py | 55 ++++++ topi/python/topi/x86/dense.py | 208 ++++++++++++++++++++++ topi/python/topi/x86/nn.py | 250 +-------------------------- 4 files changed, 269 insertions(+), 249 deletions(-) create mode 100644 topi/python/topi/x86/batch_matmul.py create mode 100644 topi/python/topi/x86/dense.py diff --git a/python/tvm/relay/frontend/mxnet.py 
b/python/tvm/relay/frontend/mxnet.py
index 26cd565c91c9..afb1d386e800 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -269,6 +269,11 @@ def _mx_multibox_detection(inputs, attrs):
 
 
 def _mx_batch_dot(inputs, attrs):
+    transpose_a = attrs.get_bool("transpose_a", False)
+    transpose_b = attrs.get_bool("transpose_b", False)
+    if transpose_a is True or transpose_b is False:
+        raise RuntimeError("batch_dot: only support transpose_a=False and "
+                           "transpose_b=True")
     return _op.nn.batch_matmul(inputs[0], inputs[1])
 
 
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
new file mode 100644
index 000000000000..0723ed36da60
--- /dev/null
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -0,0 +1,55 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""x86 batch_matmul operators"""
+from __future__ import absolute_import as _abs
+import tvm
+
+from .util import get_max_power2_factor
+from .. import generic
+from ..util import traverse_inline, get_const_tuple
+
+
+@generic.schedule_batch_matmul.register(["cpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of batch_matmul
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if "batch_matmul" in op.tag:
+            C = op.output(0)
+            A, B = s[C].op.input_tensors
+            _, M, N = get_const_tuple(C.shape)
+            k, = s[C].op.reduce_axis
+            ko, ki = s[C].split(k, 16)
+            CC = s.rfactor(C, ki)
+
+            b, y, x = s[C].op.axis
+            y_bn = get_max_power2_factor(M, 8)
+            x_bn = get_max_power2_factor(N, 8)
+            yo, yi = s[C].split(y, y_bn)
+            xo, xi = s[C].split(x, x_bn)
+            s[C].reorder(b, yo, xo, yi, xi)
+            bxyo = s[C].fuse(b, yo, xo)
+            s[C].parallel(bxyo)
+            s[C].fuse(yi, xi)
+
+            s[CC].compute_at(s[C], bxyo)
+            _, _, y, x = s[CC].op.axis
+            s[CC].fuse(y, x)
+            s[CC].vectorize(s[CC].op.axis[0])
+            s[C].pragma(bxyo, 'auto_unroll_max_step', 16)
+
+    traverse_inline(s, outs[0].op, _callback)
+
+    return s
diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py
new file mode 100644
index 000000000000..3ba3909c54a4
--- /dev/null
+++ b/topi/python/topi/x86/dense.py
@@ -0,0 +1,208 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""x86 dense operators"""
+from __future__ import absolute_import as _abs
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity
+
+from .util import get_fp32_len, get_max_power2_factor
+from .. 
import generic, tag, nn +from ..util import traverse_inline, get_const_tuple + +@autotvm.register_topi_compute(nn.dense, "cpu", "direct") +def _declaration_dense(cfg, data, weight, bias=None): + batch, _ = get_const_tuple(data.shape) + + # For small batch sizes, don't pack weight into cache-friendly layout + # because of overhead in packing and limited reuse from batch dimension + # TODO(icemelon9): use a more systematic way to determine which schedule to use + if batch <= 16: + return _declaration_dense_nopack(cfg, data, weight, bias) + return _declaration_dense_pack(cfg, data, weight, bias) + + +# Declare dense compute with packing weight into cache-friendly layout +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") +def _declaration_dense_pack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", batch, num_outputs=3) + cfg.define_split("tile_x", out_dim, num_outputs=3) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, batch, out_dim, in_dim) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (out_dim // packw_bn, in_dim, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + k = tvm.reduce_axis((0, in_dim), name="k") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum( + data[y, k] * packw[x // packw_bn, k, x % packw_bn], + axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + return C + + +# Declare dense compute without packing weight +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") +def _declaration_dense_nopack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_x", out_dim, num_outputs=2) + cfg.define_split("tile_y", batch, num_outputs=2) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, batch, out_dim, in_dim) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, in_dim // vec), "k") + CC = tvm.compute((batch, out_dim, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + + return C + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") +def _schedule_dense(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + elif 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") +def _schedule_dense_pack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") +def _schedule_dense_nopack(cfg, outs): + s = 
tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_pack_template(cfg, s, C): + A, packedB = s[C].op.input_tensors + + CC = s.cache_write(C, "global") + y, x = s[C].op.axis + k, = s[CC].op.reduce_axis + + yt, yo, yi = cfg["tile_y"].apply(s, C, y) + xt, xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yt, xt, yo, xo, yi, xi) + xyt = s[C].fuse(yt, xt) + s[C].parallel(xyt) + xyo = s[C].fuse(yo, xo) + s[C].unroll(yi) + s[C].vectorize(xi) + + s[CC].compute_at(s[C], xyo) + y, x = s[CC].op.axis + ko, ki = cfg["tile_k"].apply(s, CC, k) + s[CC].reorder(ko, ki, y, x) + s[CC].vectorize(x) + s[CC].unroll(y) + s[CC].unroll(ki) + + z, y, x = s[packedB].op.axis + s[packedB].reorder(z, x, y) + s[packedB].parallel(z) + s[packedB].vectorize(y) + return s + + +def _schedule_dense_nopack_template(cfg, s, C): + y, x = s[C].op.axis + kk, = s[C].op.reduce_axis + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yo, xo, yi, xi) + xyo = s[C].fuse(yo, xo) + s[C].parallel(xyo) + s[C].unroll(kk) + + CC, = s[C].op.input_tensors + s[CC].compute_at(s[C], xyo) + z, y, x = s[CC].op.axis + k, = s[CC].op.reduce_axis + yz = s[CC].fuse(z, y) + s[CC].reorder(k, yz, x) + s[CC].unroll(yz) + s[CC].vectorize(x) + return s + + +def _default_dense_pack_config(cfg, M, N, K): + vec_width = get_fp32_len() + + tilex_ii = 1 + for bn in range(vec_width*2, 0, -1): + if N % bn == 0: + tilex_ii = bn + break + NN = N // tilex_ii + tilex_oi = 1 + while NN // tilex_oi > 4: + if (NN // tilex_oi) % 2 == 1: + break + tilex_oi *= 2 + + tiley_ii = 8 + while M % tiley_ii != 0: + tiley_ii //= 2 + MM = M // tiley_ii + tiley_oi = 1 + while MM // tiley_oi > 4: + if (MM // tiley_oi) % 2 == 1: + break + tiley_oi *= 2 + + cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) + cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) + cfg["tile_k"] = SplitEntity([K, 1]) + + +def _default_dense_nopack_config(cfg, M, N, K): + vec_width = get_fp32_len() + tilek_bn = 1 + for bn in range(vec_width*2, 0, -1): + if K % bn == 0: + tilek_bn = bn + break + cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) + cfg["tile_x"] = SplitEntity([N, 1]) + cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index d5a6ca76f7ed..3b57c0096c58 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -5,9 +5,7 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .util import get_fp32_len, get_max_power2_factor -from .. import generic, tag, nn -from ..util import traverse_inline, get_const_tuple +from .. 
import generic @generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): @@ -37,249 +35,3 @@ def schedule_softmax(outs): else: s[x].parallel(s[x].op.axis[0]) return s - - -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None): - batch, _ = get_const_tuple(data.shape) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if batch <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias) - return _declaration_dense_pack(cfg, data, weight, bias) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", batch, num_outputs=3) - cfg.define_split("tile_x", out_dim, num_outputs=3) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, batch, out_dim, in_dim) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (out_dim // packw_bn, in_dim, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - k = tvm.reduce_axis((0, in_dim), name="k") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum( - data[y, k] * packw[x // packw_bn, k, x % packw_bn], - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_x", out_dim, num_outputs=2) - cfg.define_split("tile_y", batch, num_outputs=2) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, batch, out_dim, in_dim) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, in_dim // vec), "k") - CC = tvm.compute((batch, out_dim, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") -def _schedule_dense_pack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - 
-@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack")
-def _schedule_dense_nopack(cfg, outs):
-    s = tvm.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if 'dense_nopack' in op.tag:
-            _schedule_dense_nopack_template(cfg, s, op.output(0))
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_dense_pack_template(cfg, s, C):
-    A, packedB = s[C].op.input_tensors
-
-    CC = s.cache_write(C, "global")
-    y, x = s[C].op.axis
-    k, = s[CC].op.reduce_axis
-
-    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
-    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yt, xt, yo, xo, yi, xi)
-    xyt = s[C].fuse(yt, xt)
-    s[C].parallel(xyt)
-    xyo = s[C].fuse(yo, xo)
-    s[C].unroll(yi)
-    s[C].vectorize(xi)
-
-    s[CC].compute_at(s[C], xyo)
-    y, x = s[CC].op.axis
-    ko, ki = cfg["tile_k"].apply(s, CC, k)
-    s[CC].reorder(ko, ki, y, x)
-    s[CC].vectorize(x)
-    s[CC].unroll(y)
-    s[CC].unroll(ki)
-
-    z, y, x = s[packedB].op.axis
-    s[packedB].reorder(z, x, y)
-    s[packedB].parallel(z)
-    s[packedB].vectorize(y)
-    return s
-
-
-def _schedule_dense_nopack_template(cfg, s, C):
-    y, x = s[C].op.axis
-    kk, = s[C].op.reduce_axis
-    yo, yi = cfg["tile_y"].apply(s, C, y)
-    xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yo, xo, yi, xi)
-    xyo = s[C].fuse(yo, xo)
-    s[C].parallel(xyo)
-    s[C].unroll(kk)
-
-    CC, = s[C].op.input_tensors
-    s[CC].compute_at(s[C], xyo)
-    z, y, x = s[CC].op.axis
-    k, = s[CC].op.reduce_axis
-    yz = s[CC].fuse(z, y)
-    s[CC].reorder(k, yz, x)
-    s[CC].unroll(yz)
-    s[CC].vectorize(x)
-    return s
-
-
-def _default_dense_pack_config(cfg, M, N, K):
-    vec_width = get_fp32_len()
-
-    tilex_ii = 1
-    for bn in range(vec_width*2, 0, -1):
-        if N % bn == 0:
-            tilex_ii = bn
-            break
-    NN = N // tilex_ii
-    tilex_oi = 1
-    while NN // tilex_oi > 4:
-        if (NN // tilex_oi) % 2 == 1:
-            break
-        tilex_oi *= 2
-
-    tiley_ii = 8
-    while M % tiley_ii != 0:
-        tiley_ii //= 2
-    MM = M // tiley_ii
-    tiley_oi = 1
-    while MM // tiley_oi > 4:
-        if (MM // tiley_oi) % 2 == 1:
-            break
-        tiley_oi *= 2
-
-    cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii])
-    cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii])
-    cfg["tile_k"] = SplitEntity([K, 1])
-
-
-def _default_dense_nopack_config(cfg, M, N, K):
-    vec_width = get_fp32_len()
-    tilek_bn = 1
-    for bn in range(vec_width*2, 0, -1):
-        if K % bn == 0:
-            tilek_bn = bn
-            break
-    cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn])
-    cfg["tile_x"] = SplitEntity([N, 1])
-    cfg["tile_y"] = SplitEntity([1, M])
-
-
-@generic.schedule_batch_matmul.register(["cpu"])
-def schedule_batch_matmul(outs):
-    """Schedule for batch_matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of batch_matmul
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
- """ - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "batch_matmul" in op.tag: - C = op.output(0) - A, B = s[C].op.input_tensors - _, M, N = get_const_tuple(C.shape) - k, = s[C].op.reduce_axis - ko, ki = s[C].split(k, 16) - CC = s.rfactor(C, ki) - - b, y, x = s[C].op.axis - y_bn = get_max_power2_factor(M, 8) - x_bn = get_max_power2_factor(N, 8) - yo, yi = s[C].split(y, y_bn) - xo, xi = s[C].split(x, x_bn) - s[C].reorder(b, yo, xo, yi, xi) - bxyo = s[C].fuse(b, yo, xo) - s[C].parallel(bxyo) - s[C].fuse(yi, xi) - - s[CC].compute_at(s[C], bxyo) - _, _, y, x = s[CC].op.axis - s[CC].fuse(y, x) - s[CC].vectorize(s[CC].op.axis[0]) - s[C].pragma(bxyo, 'auto_unroll_max_step', 16) - - traverse_inline(s, outs[0].op, _callback) - - return s From 2e639684d1291fd55d71365b95b10f3ad5361607 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 12 Feb 2019 01:33:04 +0000 Subject: [PATCH 07/14] Fix pylint --- topi/python/topi/nn/batch_matmul.py | 19 +++++++++++++++++-- topi/python/topi/testing/batch_matmul.py | 17 ++++++++++++++++- topi/python/topi/x86/batch_matmul.py | 3 +-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py index 51c4a69c5dbc..07e363868b05 100644 --- a/topi/python/topi/nn/batch_matmul.py +++ b/topi/python/topi/nn/batch_matmul.py @@ -1,11 +1,27 @@ """Binary Neural Network (BNN) Operators""" +# pylint: disable=invalid-name from __future__ import absolute_import as _abs import tvm -from .. import tag from ..util import get_const_tuple def batch_matmul(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.TEnsor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" x_shape = get_const_tuple(x.shape) y_shape = get_const_tuple(y.shape) @@ -14,7 +30,6 @@ def batch_matmul(x, y): batch, M, K = x.shape N = y.shape[1] k = tvm.reduce_axis((0, K), name='k') - return tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), tag='batch_matmul') diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py index 9d3c5cc47a85..a7b2f9344f29 100644 --- a/topi/python/topi/testing/batch_matmul.py +++ b/topi/python/topi/testing/batch_matmul.py @@ -3,7 +3,22 @@ import numpy as np def batch_matmul(x, y): - batch, M, K = x.shape + """batch_matmul operator implemented in numpy. 
+
+    Parameters
+    ----------
+    x : numpy.ndarray
+        3-D with shape [batch, M, K]
+
+    y : numpy.ndarray
+        3-D with shape [batch, N, K]
+
+    Returns
+    -------
+    out : numpy.ndarray
+        3-D with shape [batch, M, N]
+    """
+    batch, M, _ = x.shape
     N = y.shape[1]
     out = np.zeros((batch, M, N)).astype(x.dtype)
     for i in range(batch):
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
index 0723ed36da60..57a4361bf45f 100644
--- a/topi/python/topi/x86/batch_matmul.py
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -49,7 +49,6 @@ def _callback(op):
             s[CC].fuse(y, x)
             s[CC].vectorize(s[CC].op.axis[0])
             s[C].pragma(bxyo, 'auto_unroll_max_step', 16)
-
+
     traverse_inline(s, outs[0].op, _callback)
-
     return s

From 29fae0f2fa97641e21af3526e9c1c1cb89d9c758 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Tue, 12 Feb 2019 00:49:09 -0800
Subject: [PATCH 08/14] Remove unused import

---
 topi/python/topi/x86/dense.py | 2 +-
 topi/python/topi/x86/nn.py    | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py
index 3ba3909c54a4..33575b4c399d 100644
--- a/topi/python/topi/x86/dense.py
+++ b/topi/python/topi/x86/dense.py
@@ -5,7 +5,7 @@
 from tvm import autotvm
 from tvm.autotvm.task.space import SplitEntity
 
-from .util import get_fp32_len, get_max_power2_factor
+from .util import get_fp32_len
 from .. import generic, tag, nn
 from ..util import traverse_inline, get_const_tuple
 
diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 3b57c0096c58..73463242e96d 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -2,9 +2,6 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
-from tvm import autotvm
-from tvm.autotvm.task.space import SplitEntity
-
 from .. import generic
 
 @generic.schedule_softmax.register(["cpu"])

From 9a983f7fc189259c8d89ec163f4ad95f90fc318e Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Wed, 13 Feb 2019 03:47:48 +0000
Subject: [PATCH 09/14] Add cuda schedule for batch_matmul

---
 topi/python/topi/cuda/__init__.py     |  1 +
 topi/python/topi/cuda/batch_matmul.py | 89 +++++++++++++++++++++++++++
 topi/python/topi/util.py              | 26 ++++
 topi/python/topi/x86/batch_matmul.py  |  3 +-
 topi/python/topi/x86/util.py          |  9 ---
 5 files changed, 117 insertions(+), 11 deletions(-)
 create mode 100644 topi/python/topi/cuda/batch_matmul.py

diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 28d2eb258bea..d06b4f4ca8f1 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -14,6 +14,7 @@
 from .pooling import schedule_pool, schedule_global_pool
 from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
+from .batch_matmul import schedule_batch_matmul
 from .vision import *
 from . import ssd
 from .ssd import *
diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py
new file mode 100644
index 000000000000..a1fa256028da
--- /dev/null
+++ b/topi/python/topi/cuda/batch_matmul.py
@@ -0,0 +1,89 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""cuda batch_matmul operators"""
+from __future__ import absolute_import as _abs
+import tvm
+
+from ..
import generic
+from ..util import traverse_inline, get_const_tuple, get_max_power2_factor
+
+
+@generic.schedule_batch_matmul.register(["cuda", "gpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of batch_matmul
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _schedule(op):
+        C = op.output(0)
+        A, B = s[C].op.input_tensors
+        _, M, N = get_const_tuple(C.shape)
+        AA = s.cache_read(A, "shared", [C])
+        AL = s.cache_read(AA, "local", [C])
+        BB = s.cache_read(B, "shared", [C])
+        BL = s.cache_read(BB, "local", [C])
+        CC = s.cache_write(C, "local")
+
+        b, y, x = s[C].op.axis
+        y_bn = get_max_power2_factor(M, 64)
+        x_bn = get_max_power2_factor(N, 64)
+        by, y = s[C].split(y, y_bn)
+        bx, x = s[C].split(x, x_bn)
+        y_nthreads = min(y_bn, 8)
+        x_nthreads = min(x_bn, 8)
+        ty, yi = s[C].split(y, nparts=y_nthreads)
+        tx, xi = s[C].split(x, nparts=x_nthreads)
+        thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x")
+        thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y")
+
+        s[C].reorder(b, by, bx, ty, tx, yi, xi)
+        s[C].bind(b, tvm.thread_axis("blockIdx.z"))
+        s[C].bind(by, tvm.thread_axis("blockIdx.y"))
+        s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[C].bind(ty, thread_y)
+        s[C].bind(tx, thread_x)
+        s[C].pragma(yi, "auto_unroll_max_step", 16)
+
+        s[CC].compute_at(s[C], tx)
+        _, yi, xi = s[CC].op.axis
+        k, = s[CC].op.reduce_axis
+        ko, ki = s[CC].split(k, 8)
+        s[CC].reorder(ko, ki, yi, xi)
+        s[CC].pragma(ki, "auto_unroll_max_step", 16)
+
+        s[AA].compute_at(s[CC], ko)
+        s[AL].compute_at(s[CC], ki)
+        s[BB].compute_at(s[CC], ko)
+        s[BL].compute_at(s[CC], ki)
+        _, y, k = s[AA].op.axis
+        ty, yi = s[AA].split(y, nparts=y_nthreads)
+        tx, ki = s[AA].split(k, nparts=x_nthreads)
+        s[AA].reorder(ty, tx, yi, ki)
+        s[AA].bind(ty, thread_y)
+        s[AA].bind(tx, thread_x)
+        s[AA].pragma(yi, "auto_unroll_max_step", 16)
+
+        _, x, k = s[BB].op.axis
+        ty, xi = s[BB].split(x, nparts=y_nthreads)
+        tx, ki = s[BB].split(k, nparts=x_nthreads)
+        s[BB].bind(ty, thread_y)
+        s[BB].bind(tx, thread_x)
+        s[BB].reorder(ty, tx, xi, ki)
+        s[BB].pragma(xi, "auto_unroll_max_step", 16)
+
+    def _callback(op):
+        if "batch_matmul" in op.tag:
+            _schedule(op)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index 6d7326580f6d..d630628b4379 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -255,3 +255,29 @@ def select_array(i, j):
             return now
 
     return tvm.compute(matrix.shape, select_array, name=name)
+
+
+def get_max_power2_factor(n, max_value=None):
+    """Get the largest power-of-2 factor of n. If max_value is specified, the
+    factor is no more than max_value.
+
+    Parameters
+    ----------
+    n : int
+        The input value
+
+    max_value : int, optional
+        The max value for the factor
+
+    Returns
+    -------
+    factor : int
+        The largest power-of-2 factor.
+    """
+    x = 1
+    while n % 2 == 0:
+        if max_value is not None and max_value < x * 2:
+            break
+        x *= 2
+        n //= 2
+    return x
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
index 57a4361bf45f..37890e389366 100644
--- a/topi/python/topi/x86/batch_matmul.py
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -3,9 +3,8 @@
 from __future__ import absolute_import as _abs
 import tvm
 
-from .util import get_max_power2_factor
 from ..
import generic
-from ..util import traverse_inline, get_const_tuple
+from ..util import traverse_inline, get_const_tuple, get_max_power2_factor
 
 
 @generic.schedule_batch_matmul.register(["cpu"])
diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py
index a8a19365e2ab..678ff8e24cff 100644
--- a/topi/python/topi/x86/util.py
+++ b/topi/python/topi/x86/util.py
@@ -10,12 +10,3 @@ def get_fp32_len():
     if opt == '-mcpu=skylake-avx512':
         fp32_vec_len = 16
     return fp32_vec_len
-
-def get_max_power2_factor(n, max_value=None):
-    x = 1
-    while n % 2 == 0:
-        if max_value is not None and max_value < x * 2:
-            break
-        x *= 2
-        n /= 2
-    return x

From 33ccd4bc94922f409ee48f123871c0b5633170ba Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 18 Feb 2019 14:52:15 -0800
Subject: [PATCH 10/14] Add test case with larger batch size

---
 tests/python/relay/test_op_level10.py       | 1 +
 topi/tests/python/test_topi_batch_matmul.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 6d68197ff7ac..34285d2b18dd 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -174,6 +174,7 @@ def test_batch_matmul():
     verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16))
     verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16))
     verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20))
+    verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20))
 
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py
index 8ac3e24a3408..f699d6aa8dcb 100644
--- a/topi/tests/python/test_topi_batch_matmul.py
+++ b/topi/tests/python/test_topi_batch_matmul.py
@@ -46,6 +46,7 @@ def test_batch_matmul():
     verify_batch_matmul(1, 16, 16, 32)
     verify_batch_matmul(5, 16, 16, 32)
     verify_batch_matmul(5, 16, 20, 32)
+    verify_batch_matmul(30, 16, 20, 32)
 
 
 if __name__ == "__main__":

From 8cc095d6426bbe71028a9035b21e752726403bb1 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 18 Feb 2019 15:31:58 -0800
Subject: [PATCH 11/14] Add batch_matmul in api doc

---
 docs/api/python/topi.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 856bad198e88..4d2eeb7692c4 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -41,6 +41,7 @@ List of operators
    topi.nn.upsampling
    topi.nn.softmax
    topi.nn.dense
+   topi.nn.batch_matmul
    topi.nn.log_softmax
    topi.nn.conv2d_nchw
    topi.nn.conv2d_hwcn
@@ -134,6 +135,7 @@ topi.nn
 .. autofunction:: topi.nn.upsampling
 .. autofunction:: topi.nn.softmax
 .. autofunction:: topi.nn.dense
+.. autofunction:: topi.nn.batch_matmul
 .. autofunction:: topi.nn.log_softmax
 .. autofunction:: topi.nn.conv2d_nchw
 ..
autofunction:: topi.nn.conv2d_hwcn

From 41b1d3646e30075c0a38dc68016d09f5b86ecf9c Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 21 Feb 2019 14:12:59 -0800
Subject: [PATCH 12/14] Fix quantize pass rounding error

---
 tests/python/relay/test_pass_quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relay/test_pass_quantize.py b/tests/python/relay/test_pass_quantize.py
index 6d65d7b2d9ee..2e2389d16244 100644
--- a/tests/python/relay/test_pass_quantize.py
+++ b/tests/python/relay/test_pass_quantize.py
@@ -75,7 +75,7 @@ def make_qgraph(data, weight):
     graph = relay.create_executor('graph')
     res0 = graph.evaluate(qgraph0)(dataset[0]['data'])
     res1 = graph.evaluate(qgraph1)(dataset[0]['data'])
-    tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy())
+    tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy(), rtol=1e-3)
 
 
 if __name__ == "__main__":

From d47383320ca36ac86265031614b656c53706c687 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 28 Feb 2019 16:48:10 -0800
Subject: [PATCH 13/14] Fix pylint and minor change

---
 python/tvm/relay/frontend/mxnet.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index d2461fadec76..4112825c2f8f 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -286,12 +286,13 @@ def _mx_multibox_detection(inputs, attrs):
 def _mx_batch_dot(inputs, attrs):
     transpose_a = attrs.get_bool("transpose_a", False)
     transpose_b = attrs.get_bool("transpose_b", False)
-    if transpose_a is True or transpose_b is False:
-        raise RuntimeError("batch_dot: only support transpose_a=False and "
-                           "transpose_b=True")
+    if transpose_a is True:
+        raise RuntimeError("batch_dot: only support transpose_a=False")
+    if transpose_b is False:
+        b = _op.transpose(b, axes=[0, 2, 1])
     return _op.batch_matmul(inputs[0], inputs[1])
-
+
 def _mx_arange(inputs, attrs):
     assert len(inputs) == 0
     if attrs.get_int("repeat", 1) != 1:
@@ -302,7 +303,6 @@ def _mx_arange(inputs, attrs):
     new_attrs["step"] = attrs.get_float("step", 1)
     new_attrs["dtype"] = attrs.get_str("dtype", "float32")
     return _op.arange(**new_attrs)
-
 
 def _mx_roi_align(inputs, attrs):

From be4d7ce2a633828e9de4ef9313775d16fd026d77 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 28 Feb 2019 17:48:19 -0800
Subject: [PATCH 14/14] bug fix

---
 python/tvm/relay/frontend/mxnet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 4112825c2f8f..3d3bb8e4fd84 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -284,13 +284,15 @@ def _mx_multibox_detection(inputs, attrs):
 
 
 def _mx_batch_dot(inputs, attrs):
+    assert len(inputs) == 2
+    a, b = inputs
     transpose_a = attrs.get_bool("transpose_a", False)
     transpose_b = attrs.get_bool("transpose_b", False)
     if transpose_a is True:
         raise RuntimeError("batch_dot: only support transpose_a=False")
     if transpose_b is False:
         b = _op.transpose(b, axes=[0, 2, 1])
-    return _op.batch_matmul(inputs[0], inputs[1])
+    return _op.batch_matmul(a, b)
 
 
 def _mx_arange(inputs, attrs):
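Example usage (not part of the patch series): a minimal sketch that exercises the
new `topi.nn.batch_matmul` op end to end, in the spirit of
`topi/tests/python/test_topi_batch_matmul.py` above. The `llvm` target, the
shapes, and the float32 dtype are illustrative assumptions, not fixed by the
patches.

    import numpy as np
    import tvm
    import topi
    import topi.testing

    # batch_matmul expects x: (batch, M, K) and y: (batch, N, K); the second
    # operand is laid out transposed, so out[b] = dot(x[b], y[b].T).
    batch, M, N, K = 5, 16, 20, 32
    x = tvm.placeholder((batch, M, K), name='x')
    y = tvm.placeholder((batch, N, K), name='y')
    out = topi.nn.batch_matmul(x, y)  # shape (batch, M, N)

    # Dispatch to the CPU schedule registered by this series; a "cuda" target
    # would select the GPU schedule instead.
    with tvm.target.create('llvm'):
        s = topi.generic.schedule_batch_matmul([out])
    f = tvm.build(s, [x, y, out], 'llvm')

    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(batch, M, K)).astype('float32')
    b_np = np.random.uniform(size=(batch, N, K)).astype('float32')
    c = tvm.nd.array(np.zeros((batch, M, N), dtype='float32'), ctx)
    f(tvm.nd.array(a_np, ctx), tvm.nd.array(b_np, ctx), c)

    # Compare against the numpy reference added in topi.testing.
    tvm.testing.assert_allclose(c.asnumpy(),
                                topi.testing.batch_matmul(a_np, b_np),
                                rtol=1e-5)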