From a433b0e6736a9ae62185018f0698ddde04f4daba Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 29 Jan 2019 10:37:54 -0800 Subject: [PATCH 01/14] Add batch_dot and cpu schedule --- docs/langref/relay_op.rst | 2 + python/tvm/relay/frontend/mxnet.py | 20 ++++++++- python/tvm/relay/op/nn/_nn.py | 15 +++++++ python/tvm/relay/op/nn/nn.py | 24 +++++++++++ src/relay/op/nn/nn.cc | 63 ++++++++++++++++++++++++++++ tests/python/relay/test_op_level1.py | 44 +++++++++++++++++++ topi/python/topi/generic/nn.py | 7 ++++ topi/python/topi/nn/__init__.py | 1 + topi/python/topi/x86/nn.py | 52 +++++++++++++++++++++-- topi/python/topi/x86/util.py | 9 ++++ topi/src/topi.cc | 10 +++++ 11 files changed, 242 insertions(+), 5 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e1f38c61eb1f..4be6a01758b4 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -39,6 +39,7 @@ This level enables fully connected multi-layer perceptron. tvm.relay.nn.relu tvm.relay.nn.dropout tvm.relay.nn.batch_norm + tvm.relay.nn.batch_dot tvm.relay.nn.bias_add @@ -171,6 +172,7 @@ Level 1 Definitions .. autofunction:: tvm.relay.nn.relu .. autofunction:: tvm.relay.nn.dropout .. autofunction:: tvm.relay.nn.batch_norm +.. autofunction:: tvm.relay.nn.batch_dot .. autofunction:: tvm.relay.nn.bias_add diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index ea49a6642796..fb81d3eef990 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -268,6 +268,23 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) +def _mx_div_sqrt_dim(inputs, attrs): + ty = ir_pass.infer_type(inputs[0])._checked_type_ + sqrt_dim = _op.sqrt(_expr.const(int(ty.shape[-1]), ty.dtype)) + out = inputs[0] / sqrt_dim + print(out.astext()) + return out + + +def _mx_batch_dot(inputs, attrs): + a = ir_pass.infer_type(inputs[0]) + b = ir_pass.infer_type(inputs[1]) + + print(a.astext()) + print(b.astext()) + exit() + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -363,7 +380,8 @@ def _mx_multibox_detection(inputs, attrs): # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - + "_contrib_div_sqrt_dim": _mx_div_sqrt_dim, + "batch_dot": _mx_batch_dot, } # set identity list diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index ad67e78c5ac1..ebb7ff56007d 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -46,6 +46,21 @@ def schedule_dense(attrs, outputs, target): reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# batch_dot +@reg.register_compute("nn.batch_dot") +def compute_batch_dot(attrs, inputs, out_type, target): + """Compute definition of batch_dot""" + return [topi.nn.batch_dot(inputs[0], inputs[1])] + +@reg.register_schedule("nn.batch_dot") +def schedule_batch_dot(attrs, outputs, target): + """Schedule definition of dense""" + with target: + return topi.generic.schedule_batch_dot(outputs) + +reg.register_pattern("nn.batch_dot", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + # conv2d @reg.register_compute("nn.conv2d") def compute_conv2d(attrs, inputs, out_type, target): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index f70b0072f7db..6f1e686eabae 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -767,6 +767,30 @@ def batch_norm(data, return TupleWrapper(result, 3) +def batch_dot(x, y): + r""" + Computes dot 
product of `x` and `y` when `x` and `y` are data in batch. + + .. math:: + + \mbox{batch_dot}(x, y)[i, :, :] = \mbox{dot}(x[i, :, :], y[i, :, :]^T) + + Parameters + ---------- + x : tvm.relay.Expr + The first input. + + y : tvm.relay.Expr + The second input. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + """ + return _make.batch_dot(x, y) + + def contrib_conv2d_winograd_without_weight_transform(data, weight, tile_size, diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7ed43d0df019..33aebfca9734 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -654,5 +654,68 @@ axis to be the last item in the input shape. .set_support_level(1) .add_type_rel("BatchNorm", BatchNormRel); + +// relay.nn.batch_dot +bool BatchDotRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* x = types[0].as(); + const auto* y = types[1].as(); + if (x == nullptr || y == nullptr) return false; + if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) + << "BatchDot: batch dimension doesn't match, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + CHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) + << "BatchDot: shapes of x and y is inconsistent, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + + Array oshape = x->shape; + oshape.Set(2, y->shape[1]); + + // assign output type + reporter->Assign(types[2], TensorTypeNode::make(oshape, x->dtype)); + return true; +} + + +// Positional relay function to create dense operator used by frontend FFI. +Expr MakeBatchDot(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_dot"); + return CallNode::make(op, {x, y}, Attrs(), {}); +} + + +TVM_REGISTER_API("relay.op.nn._make.batch_dot") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeBatchDot, args, rv); + }); + + +RELAY_REGISTER_OP("nn.batch_dot") +.describe(R"code(Computes dot product of `x` and `y` when `x` and `y` are data +in batch. + +.. math:: + + batch\_dot(x, y)[i, :, :] = dot(x[i, :, :], y[i, :, :]^T) + +- **x**: `(b, m, k)` +- **y**: `(b, n, k)` +- **out**: `(b, m, n)`. + +)code" TVM_ADD_FILELINE) +.set_num_inputs(2) +.add_argument("x", "3D Tensor", "First input.") +.add_argument("y", "3D Tensor", "Second input.") +.set_support_level(1) +.add_type_rel("BatchDot", BatchDotRel); + + } // namespace relay } // namespace tvm diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 6a1662b65170..781d2937b068 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -306,6 +306,49 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) +def test_batch_dot(): + # TODO!!! 
+ n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + w = relay.var("w", relay.TensorType((2, w), "float32")) + y = relay.nn.dense(x, w, units=2) + "units=2" in y.astext() + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") + + n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + wh, ww = tvm.var("wh"), tvm.var("ww") + w = relay.var("w", relay.TensorType((ww, wh), "float32")) + y = relay.nn.dense(x, w) + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32") + + n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 + x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) + w = relay.var("w", relay.IncompleteType()) + y = relay.nn.dense(x, w, units=2) + yy = relay.ir_pass.infer_type(y) + assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") + + x = relay.var("x", shape=(10, 5)) + w = relay.var("w", shape=(2, 5)) + z = relay.nn.dense(x, w) + + # Check result. + func = relay.Function([x, w], z) + x_data = np.random.rand(10, 5).astype('float32') + w_data = np.random.rand(2, 5).astype('float32') + ref_res = np.dot(x_data, w_data.T) + + for target, ctx in ctx_list(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data, w_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + if __name__ == "__main__": test_concatenate() @@ -319,3 +362,4 @@ def test_dense(): test_dropout() test_batch_norm() test_dense() + test_batch_dot() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 8c303e5be182..15a55eca05c6 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -410,3 +410,10 @@ def schedule_l2_normalize(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) + +@tvm.target.generic_func +def schedule_batch_dot(outs): + print('schedule_batch_dot generic') + target = tvm.target.current_target(allow_none=False) + cpp_target = cpp.TEST_create_target(target.target_name) + return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 690379135e06..4a2749465781 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,3 +18,4 @@ from .local_response_norm import * from .bitserial_conv2d import * from .l2_normalize import * +from .batch_dot import * diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index ab6dda40cc9d..41096634aca8 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -5,7 +5,7 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .util import get_fp32_len +from .util import get_fp32_len, get_max_power2_factor from .. 
import generic, tag, nn from ..util import traverse_inline, get_const_tuple @@ -112,7 +112,6 @@ def _declaration_dense_nopack(cfg, data, weight, bias=None): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") def _schedule_dense(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if "dense_pack" in op.tag: @@ -126,7 +125,6 @@ def _callback(op): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") def _schedule_dense_pack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if "dense_pack" in op.tag: @@ -138,7 +136,6 @@ def _callback(op): @autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") def _schedule_dense_nopack(cfg, outs): s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] def _callback(op): if 'dense_nopack' in op.tag: @@ -239,3 +236,50 @@ def _default_dense_nopack_config(cfg, M, N, K): cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) cfg["tile_x"] = SplitEntity([N, 1]) cfg["tile_y"] = SplitEntity([1, M]) + + +@generic.schedule_batch_dot.register(["cpu"]) +def schedule_batch_dot(outs): + """Schedule for softmax + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of softmax + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. + """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "batch_dot" in op.tag: + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + k, = s[C].op.reduce_axis + ko, ki = s[C].split(k, 16) + CC = s.rfactor(C, ki) + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 8) + x_bn = get_max_power2_factor(N, 8) + yo, yi = s[C].split(y, y_bn) + xo, xi = s[C].split(x, x_bn) + s[C].reorder(b, yo, xo, yi, xi) + bxyo = s[C].fuse(b, yo, xo) + s[C].parallel(bxyo) + s[C].fuse(yi, xi) + + s[CC].compute_at(s[C], bxyo) + _, _, y, x = s[CC].op.axis + s[CC].fuse(y, x) + s[CC].vectorize(s[CC].op.axis[0]) + s[C].pragma(bxyo, 'auto_unroll_max_step', 16) + + traverse_inline(s, outs[0].op, _callback) + + return s diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py index 678ff8e24cff..a8a19365e2ab 100644 --- a/topi/python/topi/x86/util.py +++ b/topi/python/topi/x86/util.py @@ -10,3 +10,12 @@ def get_fp32_len(): if opt == '-mcpu=skylake-avx512': fp32_vec_len = 16 return fp32_vec_len + +def get_max_power2_factor(n, max_value=None): + x = 1 + while n % 2 == 0: + if max_value is not None and max_value < x * 2: + break + x *= 2 + n /= 2 + return x diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 7adcb11c5656..b1118c582382 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -357,6 +358,12 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); +/* Ops from nn/batch_dot.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_dot") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::batch_dot(args[0], args[1]); + }); + /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -610,6 +617,9 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); +TVM_REGISTER_GENERIC_FUNC(schedule_batch_dot) 
+.set_default(WrapSchedule(topi::generic::default_schedule)); + TVM_REGISTER_GENERIC_FUNC(schedule_pool) .set_default(WrapSchedule(topi::generic::default_schedule)) .register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) From bdb10fd409a70b8457152f1b83444ddae9396a76 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 4 Feb 2019 11:22:32 -0800 Subject: [PATCH 02/14] Add relay support for batch_dot --- docs/langref/relay_op.rst | 4 +- python/tvm/relay/frontend/mxnet.py | 20 +-------- src/relay/op/nn/nn.cc | 2 +- tests/python/relay/test_op_level1.py | 45 -------------------- tests/python/relay/test_op_level10.py | 34 +++++++++++++++ topi/include/topi/nn/batch_dot.h | 50 ++++++++++++++++++++++ topi/python/topi/nn/batch_dot.py | 20 +++++++++ topi/tests/python/test_topi_batch_dot.py | 54 ++++++++++++++++++++++++ 8 files changed, 162 insertions(+), 67 deletions(-) create mode 100644 topi/include/topi/nn/batch_dot.h create mode 100644 topi/python/topi/nn/batch_dot.py create mode 100644 topi/tests/python/test_topi_batch_dot.py diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 4be6a01758b4..2d7a4e6fcc21 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -39,7 +39,6 @@ This level enables fully connected multi-layer perceptron. tvm.relay.nn.relu tvm.relay.nn.dropout tvm.relay.nn.batch_norm - tvm.relay.nn.batch_dot tvm.relay.nn.bias_add @@ -151,6 +150,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape + tvm.relay.nn.batch_dot Level 1 Definitions @@ -172,7 +172,6 @@ Level 1 Definitions .. autofunction:: tvm.relay.nn.relu .. autofunction:: tvm.relay.nn.dropout .. autofunction:: tvm.relay.nn.batch_norm -.. autofunction:: tvm.relay.nn.batch_dot .. autofunction:: tvm.relay.nn.bias_add @@ -262,3 +261,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape +.. autofunction:: tvm.relay.nn.batch_dot diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index fb81d3eef990..e64f20964ea9 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -268,23 +268,6 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) -def _mx_div_sqrt_dim(inputs, attrs): - ty = ir_pass.infer_type(inputs[0])._checked_type_ - sqrt_dim = _op.sqrt(_expr.const(int(ty.shape[-1]), ty.dtype)) - out = inputs[0] / sqrt_dim - print(out.astext()) - return out - - -def _mx_batch_dot(inputs, attrs): - a = ir_pass.infer_type(inputs[0]) - b = ir_pass.infer_type(inputs[1]) - - print(a.astext()) - print(b.astext()) - exit() - - # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -298,6 +281,7 @@ def _mx_batch_dot(inputs, attrs): "slice_like", "zeros_like", "ones_like", + "batch_dot", ] _convert_map = { @@ -380,8 +364,6 @@ def _mx_batch_dot(inputs, attrs): # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - "_contrib_div_sqrt_dim": _mx_div_sqrt_dim, - "batch_dot": _mx_batch_dot, } # set identity list diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 33aebfca9734..434cd1af00df 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -713,7 +713,7 @@ in batch. 
.set_num_inputs(2) .add_argument("x", "3D Tensor", "First input.") .add_argument("y", "3D Tensor", "Second input.") -.set_support_level(1) +.set_support_level(10) .add_type_rel("BatchDot", BatchDotRel); diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 781d2937b068..e4f5a797beb6 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -306,50 +306,6 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) -def test_batch_dot(): - # TODO!!! - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - w = relay.var("w", relay.TensorType((2, w), "float32")) - y = relay.nn.dense(x, w, units=2) - "units=2" in y.astext() - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") - - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - wh, ww = tvm.var("wh"), tvm.var("ww") - w = relay.var("w", relay.TensorType((ww, wh), "float32")) - y = relay.nn.dense(x, w) - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, ww), "float32") - - n, c , h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), 2 - x = relay.var("x", relay.TensorType((n, c, h, w), "float32")) - w = relay.var("w", relay.IncompleteType()) - y = relay.nn.dense(x, w, units=2) - yy = relay.ir_pass.infer_type(y) - assert yy.checked_type == relay.TensorType((n, c, h, 2), "float32") - - x = relay.var("x", shape=(10, 5)) - w = relay.var("w", shape=(2, 5)) - z = relay.nn.dense(x, w) - - # Check result. - func = relay.Function([x, w], z) - x_data = np.random.rand(10, 5).astype('float32') - w_data = np.random.rand(2, 5).astype('float32') - ref_res = np.dot(x_data, w_data.T) - - for target, ctx in ctx_list(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - intrp2 = relay.create_executor("debug", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(x_data, w_data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - op_res2 = intrp2.evaluate(func)(x_data, w_data) - tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - - if __name__ == "__main__": test_concatenate() test_bias_add() @@ -362,4 +318,3 @@ def test_batch_dot(): test_dropout() test_batch_norm() test_dense() - test_batch_dot() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index a6e169e23a6c..84de324f2c45 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -144,8 +144,42 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) +def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): + x = relay.var("x", relay.TensorType(x_shape, dtype)) + y = relay.var("y", relay.TensorType(y_shape, dtype)) + z = relay.nn.batch_dot(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) + + func = relay.Function([x, y], z) + x_np = np.random.uniform(size=x_shape).astype(dtype) + y_np = np.random.uniform(size=y_shape).astype(dtype) + z_np = np.zeros(out_shape).astype(dtype) + for i in range(x_shape[0]): + z_np[i] = np.dot(x_np[i], y_np[i].T) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, 
target=target) + z = intrp.evaluate(func)(x_np, y_np) + tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) + +def test_batch_dot(): + b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") + x = relay.var("x", relay.TensorType((b, m, k), "float32")) + y = relay.var("y", relay.TensorType((b, n, k), "float32")) + z = relay.nn.batch_dot(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.TensorType((b, m, n), "float32") + + verify_batch_dot((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_dot((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_dot((5, 16, 32), (5, 20, 32), (5, 16, 20)) + + if __name__ == "__main__": test_collapse_sum_like() test_broadcast_to_like() test_slice_like() test_reverse_reshape() + test_batch_dot() diff --git a/topi/include/topi/nn/batch_dot.h b/topi/include/topi/nn/batch_dot.h new file mode 100644 index 000000000000..95afdf3696b6 --- /dev/null +++ b/topi/include/topi/nn/batch_dot.h @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2019 by Contributors + * \brief Batch dot op constructions + * \file nn/batch_dot.h + */ +#ifndef TOPI_NN_BATCH_DOT_H_ +#define TOPI_NN_BATCH_DOT_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Creates an operation that calculates data * weight^T + bias +* +* \param data Tensor with shape [batch, in_dim] +* \param weight Tensor with shape [out_dim, in_dim] +* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* +* \return Tensor with shape [batch, out_dim] +*/ +inline tvm::Tensor batch_dot(const tvm::Tensor& x, + const tvm::Tensor& y) { + CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; + + auto batch = x->shape[0]; + auto M = x->shape[1]; + auto K = x->shape[2]; + auto N = y->shape[1]; + + auto k = tvm::reduce_axis(Range(0, K), "k"); + auto result = tvm::compute( + { batch, M, N }, + [&](Var b, Var i, Var j) { + return tvm::sum(x(b, i, k) * y(b, j, k), { k }); + }, "tensor", "batch_dot"); + + return result; +} + +} // namespace nn +} // namespace topi + +#endif // TOPI_NN_BATCH_DOT_H_ diff --git a/topi/python/topi/nn/batch_dot.py b/topi/python/topi/nn/batch_dot.py new file mode 100644 index 000000000000..e61edd614508 --- /dev/null +++ b/topi/python/topi/nn/batch_dot.py @@ -0,0 +1,20 @@ +"""Binary Neural Network (BNN) Operators""" +from __future__ import absolute_import as _abs +import tvm +from .. 
import tag
+from ..util import get_const_tuple
+
+
+def batch_dot(x, y):
+    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_dot"
+    x_shape = get_const_tuple(x.shape)
+    y_shape = get_const_tuple(y.shape)
+    assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
+    assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
+    batch, M, K = x.shape
+    N = y.shape[1]
+    k = tvm.reduce_axis((0, K), name='k')
+
+    return tvm.compute((batch, M, N),
+                       lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k),
+                       tag='batch_dot')
diff --git a/topi/tests/python/test_topi_batch_dot.py b/topi/tests/python/test_topi_batch_dot.py
new file mode 100644
index 000000000000..d904606164f8
--- /dev/null
+++ b/topi/tests/python/test_topi_batch_dot.py
@@ -0,0 +1,54 @@
+"""Test code for batch_dot operator"""
+import numpy as np
+import tvm
+import topi
+import topi.testing
+from topi.util import get_const_tuple
+from tvm.contrib.pickle_memoize import memoize
+
+from common import get_all_backend
+
+def verify_batch_dot(batch, M, N, K):
+    x = tvm.placeholder((batch, M, K), name='x')
+    y = tvm.placeholder((batch, N, K), name='y')
+    dtype = x.dtype
+
+    # use memoize to pickle the test data for next time use
+    @memoize("topi.tests.test_topi_batch_dot")
+    def get_ref_data():
+        a_np = np.random.uniform(size=(batch, M, K)).astype(dtype)
+        b_np = np.random.uniform(size=(batch, N, K)).astype(dtype)
+        c_np = np.zeros((batch, M, N)).astype(dtype)
+        for i in range(batch):
+            c_np[i] = np.dot(a_np[i], b_np[i].T)
+        return (a_np, b_np, c_np)
+    # get the test data
+    a_np, b_np, c_np = get_ref_data()
+
+    def check_device(device):
+        ctx = tvm.context(device, 0)
+        if not ctx.exist:
+            print("Skip because %s is not enabled" % device)
+            return
+        print("Running on target: %s" % device)
+        with tvm.target.create(device):
+            out = topi.nn.batch_dot(x, y)
+            s = topi.generic.schedule_batch_dot([out])
+        a = tvm.nd.array(a_np, ctx)
+        b = tvm.nd.array(b_np, ctx)
+        c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx)
+        f = tvm.build(s, [x, y, out], device, name="dense")
+        f(a, b, c)
+        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
+
+    for device in get_all_backend():
+        check_device(device)
+
+def test_batch_dot():
+    verify_batch_dot(1, 16, 16, 32)
+    verify_batch_dot(5, 16, 16, 32)
+    verify_batch_dot(5, 16, 20, 32)
+
+
+if __name__ == "__main__":
+    test_batch_dot()

From 2ca20987a97d20af1ce4c95b024bc44b26dfd2ef Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 11 Feb 2019 14:52:34 -0800
Subject: [PATCH 03/14] Rename batch_dot to batch_matmul

---
 python/tvm/relay/frontend/mxnet.py            |  6 +++-
 python/tvm/relay/op/nn/_nn.py                 | 22 ++++++-------
 python/tvm/relay/op/nn/nn.py                  |  9 +++---
 src/relay/op/nn/nn.cc                         | 32 +++++++++----------
 tests/python/relay/test_op_level10.py         | 22 ++++++-------
 .../topi/nn/{batch_dot.h => batch_matmul.h}   | 25 +++++++--------
 topi/python/topi/generic/nn.py                |  3 +-
 topi/python/topi/nn/__init__.py               |  2 +-
 .../topi/nn/{batch_dot.py => batch_matmul.py} |  6 ++--
 topi/python/topi/testing/__init__.py          |  1 +
 topi/python/topi/x86/nn.py                    | 10 +++---
 topi/src/topi.cc                              | 10 +++---
 ...batch_dot.py => test_topi_batch_matmul.py} | 24 +++++++-------
 13 files changed, 87 insertions(+), 85 deletions(-)
 rename topi/include/topi/nn/{batch_dot.h => batch_matmul.h} (53%)
 rename topi/python/topi/nn/{batch_dot.py => batch_matmul.py} (88%)
 rename topi/tests/python/{test_topi_batch_dot.py => test_topi_batch_matmul.py} (71%)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index e64f20964ea9..26cd565c91c9 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -268,6 +268,10 @@ def _mx_multibox_detection(inputs, attrs):
     return _op.vision.nms(ret[0], ret[1], **new_attrs1)
 
 
+def _mx_batch_dot(inputs, attrs):
+    return _op.nn.batch_matmul(inputs[0], inputs[1])
+
+
 # Note: due to attribute conversion constraint
 # ops in the identity set must be attribute free
 _identity_list = [
@@ -281,7 +285,6 @@ def _mx_multibox_detection(inputs, attrs):
     "slice_like",
     "zeros_like",
     "ones_like",
-    "batch_dot",
 ]
 
 _convert_map = {
@@ -352,6 +355,7 @@ def _mx_multibox_detection(inputs, attrs):
     "expand_dims"   : _mx_expand_dims,
     "Concat"        : _mx_concat,
     "concat"        : _mx_concat,
+    "batch_dot"     : _mx_batch_dot,
     "LeakyReLU"     : _mx_leaky_relu,
     "SoftmaxOutput" : _mx_softmax_output,
     "SoftmaxActivation" : _mx_softmax_activation,
diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py
index ebb7ff56007d..3402bbd81288 100644
--- a/python/tvm/relay/op/nn/_nn.py
+++ b/python/tvm/relay/op/nn/_nn.py
@@ -46,19 +46,19 @@ def schedule_dense(attrs, outputs, target):
 reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
-# batch_dot
-@reg.register_compute("nn.batch_dot")
-def compute_batch_dot(attrs, inputs, out_type, target):
-    """Compute definition of batch_dot"""
-    return [topi.nn.batch_dot(inputs[0], inputs[1])]
-
-@reg.register_schedule("nn.batch_dot")
-def schedule_batch_dot(attrs, outputs, target):
-    """Schedule definition of dense"""
+# batch_matmul
+@reg.register_compute("nn.batch_matmul")
+def compute_batch_matmul(attrs, inputs, out_type, target):
+    """Compute definition of batch_matmul"""
+    return [topi.nn.batch_matmul(inputs[0], inputs[1])]
+
+@reg.register_schedule("nn.batch_matmul")
+def schedule_batch_matmul(attrs, outputs, target):
+    """Schedule definition of batch_matmul"""
     with target:
-        return topi.generic.schedule_batch_dot(outputs)
+        return topi.generic.schedule_batch_matmul(outputs)
 
-reg.register_pattern("nn.batch_dot", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
+reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE)
 
 
 # conv2d
diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py
index 6f1e686eabae..ddbad9880953 100644
--- a/python/tvm/relay/op/nn/nn.py
+++ b/python/tvm/relay/op/nn/nn.py
@@ -767,13 +767,14 @@ def batch_norm(data,
     return TupleWrapper(result, 3)
 
 
-def batch_dot(x, y):
+def batch_matmul(x, y):
     r"""
-    Computes dot product of `x` and `y` when `x` and `y` are data in batch.
+    Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data
+    in batch.
 
     .. math::
 
-        \mbox{batch_dot}(x, y)[i, :, :] = \mbox{dot}(x[i, :, :], y[i, :, :]^T)
+        \mbox{batch_matmul}(x, y)[i, :, :] = \mbox{matmul}(x[i, :, :], y[i, :, :]^T)
 
     Parameters
     ----------
@@ -788,7 +789,7 @@ def batch_dot(x, y):
     result: tvm.relay.Expr
         The computed result.
     """
-    return _make.batch_dot(x, y)
+    return _make.batch_matmul(x, y)
 
 
 def contrib_conv2d_winograd_without_weight_transform(data,
diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc
index 434cd1af00df..7b997231d59f 100644
--- a/src/relay/op/nn/nn.cc
+++ b/src/relay/op/nn/nn.cc
@@ -655,11 +655,11 @@ axis to be the last item in the input shape.
.add_type_rel("BatchNorm", BatchNormRel); -// relay.nn.batch_dot -bool BatchDotRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { +// relay.nn.batch_matmul +bool BatchMatmulRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); const auto* x = types[0].as(); const auto* y = types[1].as(); @@ -683,27 +683,27 @@ bool BatchDotRel(const Array& types, } -// Positional relay function to create dense operator used by frontend FFI. -Expr MakeBatchDot(Expr x, - Expr y) { - static const Op& op = Op::Get("nn.batch_dot"); +// Positional relay function to create batch_matmul operator used by frontend FFI. +Expr MakeBatchMatmul(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_matmul"); return CallNode::make(op, {x, y}, Attrs(), {}); } -TVM_REGISTER_API("relay.op.nn._make.batch_dot") +TVM_REGISTER_API("relay.op.nn._make.batch_matmul") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeBatchDot, args, rv); + runtime::detail::unpack_call(MakeBatchMatmul, args, rv); }); -RELAY_REGISTER_OP("nn.batch_dot") -.describe(R"code(Computes dot product of `x` and `y` when `x` and `y` are data -in batch. +RELAY_REGISTER_OP("nn.batch_matmul") +.describe(R"code(Computes matrix multiplication of `x` and `y` when `x` and `y` +are data in batch. .. math:: - batch\_dot(x, y)[i, :, :] = dot(x[i, :, :], y[i, :, :]^T) + batch\_matmul(x, y)[i, :, :] = matmul(x[i, :, :], y[i, :, :]^T) - **x**: `(b, m, k)` - **y**: `(b, n, k)` @@ -714,7 +714,7 @@ in batch. .add_argument("x", "3D Tensor", "First input.") .add_argument("y", "3D Tensor", "Second input.") .set_support_level(10) -.add_type_rel("BatchDot", BatchDotRel); +.add_type_rel("BatchMatmul", BatchMatmulRel); } // namespace relay diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 84de324f2c45..9552342f1a3a 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -4,6 +4,8 @@ import tvm from tvm import relay from tvm.relay.testing import ctx_list +import topi +import topi.testing def test_collapse_sum_like(): shape = (3, 4, 5, 6) @@ -144,19 +146,17 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) -def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): +def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): x = relay.var("x", relay.TensorType(x_shape, dtype)) y = relay.var("y", relay.TensorType(y_shape, dtype)) - z = relay.nn.batch_dot(x, y) + z = relay.nn.batch_matmul(x, y) zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) func = relay.Function([x, y], z) x_np = np.random.uniform(size=x_shape).astype(dtype) y_np = np.random.uniform(size=y_shape).astype(dtype) - z_np = np.zeros(out_shape).astype(dtype) - for i in range(x_shape[0]): - z_np[i] = np.dot(x_np[i], y_np[i].T) + z_np = topi.testing.batch_matmul(x_np, y_np) for target, ctx in ctx_list(): for kind in ["graph", "debug"]: @@ -164,17 +164,17 @@ def verify_batch_dot(x_shape, y_shape, out_shape, dtype="float32"): z = intrp.evaluate(func)(x_np, y_np) tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) -def test_batch_dot(): +def test_batch_matmul(): b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") x = relay.var("x", relay.TensorType((b, m, k), "float32")) y = 
relay.var("y", relay.TensorType((b, n, k), "float32")) - z = relay.nn.batch_dot(x, y) + z = relay.nn.batch_matmul(x, y) zz = relay.ir_pass.infer_type(z) assert zz.checked_type == relay.TensorType((b, m, n), "float32") - verify_batch_dot((1, 16, 32), (1, 16, 32), (1, 16, 16)) - verify_batch_dot((5, 16, 32), (5, 16, 32), (5, 16, 16)) - verify_batch_dot((5, 16, 32), (5, 20, 32), (5, 16, 20)) + verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) if __name__ == "__main__": @@ -182,4 +182,4 @@ def test_batch_dot(): test_broadcast_to_like() test_slice_like() test_reverse_reshape() - test_batch_dot() + test_batch_matmul() diff --git a/topi/include/topi/nn/batch_dot.h b/topi/include/topi/nn/batch_matmul.h similarity index 53% rename from topi/include/topi/nn/batch_dot.h rename to topi/include/topi/nn/batch_matmul.h index 95afdf3696b6..b59ffe02e942 100644 --- a/topi/include/topi/nn/batch_dot.h +++ b/topi/include/topi/nn/batch_matmul.h @@ -1,10 +1,10 @@ /*! * Copyright (c) 2019 by Contributors - * \brief Batch dot op constructions - * \file nn/batch_dot.h + * \brief Batch matmul op constructions + * \file nn/batch_matmul.h */ -#ifndef TOPI_NN_BATCH_DOT_H_ -#define TOPI_NN_BATCH_DOT_H_ +#ifndef TOPI_NN_BATCH_MATMUL_H_ +#define TOPI_NN_BATCH_MATMUL_H_ #include @@ -16,16 +16,15 @@ namespace nn { using namespace tvm; /*! -* \brief Creates an operation that calculates data * weight^T + bias +* \brief Creates an operation that calculates matrix multiplication in batch. * -* \param data Tensor with shape [batch, in_dim] -* \param weight Tensor with shape [out_dim, in_dim] -* \param bias Tensor with shape [out_dim]. Optional; to omit bias, pass Tensor() +* \param x Tensor with shape [batch, M, K] +* \param y Tensor with shape [batch, N, K] * -* \return Tensor with shape [batch, out_dim] +* \return Tensor with shape [batch, M, N] */ -inline tvm::Tensor batch_dot(const tvm::Tensor& x, - const tvm::Tensor& y) { +inline tvm::Tensor batch_matmul(const tvm::Tensor& x, + const tvm::Tensor& y) { CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; @@ -39,7 +38,7 @@ inline tvm::Tensor batch_dot(const tvm::Tensor& x, { batch, M, N }, [&](Var b, Var i, Var j) { return tvm::sum(x(b, i, k) * y(b, j, k), { k }); - }, "tensor", "batch_dot"); + }, "tensor", "batch_matmul"); return result; } @@ -47,4 +46,4 @@ inline tvm::Tensor batch_dot(const tvm::Tensor& x, } // namespace nn } // namespace topi -#endif // TOPI_NN_BATCH_DOT_H_ +#endif // TOPI_NN_BATCH_MATMUL_H_ diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 15a55eca05c6..00b742f24e64 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -412,8 +412,7 @@ def schedule_l2_normalize(outs): return cpp.generic.default_schedule(cpp_target, outs, False) @tvm.target.generic_func -def schedule_batch_dot(outs): - print('schedule_batch_dot generic') +def schedule_batch_matmul(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index 4a2749465781..e23aeb003dc9 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -18,4 +18,4 @@ from .local_response_norm import * from .bitserial_conv2d import 
*
 from .l2_normalize import *
-from .batch_dot import *
+from .batch_matmul import *
diff --git a/topi/python/topi/nn/batch_dot.py b/topi/python/topi/nn/batch_matmul.py
similarity index 88%
rename from topi/python/topi/nn/batch_dot.py
rename to topi/python/topi/nn/batch_matmul.py
index e61edd614508..51c4a69c5dbc 100644
--- a/topi/python/topi/nn/batch_dot.py
+++ b/topi/python/topi/nn/batch_matmul.py
@@ -5,8 +5,8 @@ from ..util import get_const_tuple
 
 
-def batch_dot(x, y):
-    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_dot"
+def batch_matmul(x, y):
+    assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul"
     x_shape = get_const_tuple(x.shape)
     y_shape = get_const_tuple(y.shape)
     assert x_shape[0] == y_shape[0], "batch dimension doesn't match"
     assert x_shape[2] == y_shape[2], "shapes of x and y are inconsistent"
     batch, M, K = x.shape
     N = y.shape[1]
     k = tvm.reduce_axis((0, K), name='k')
 
     return tvm.compute((batch, M, N),
                        lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k),
-                       tag='batch_dot')
+                       tag='batch_matmul')
diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py
index 5ea9683f72ef..e4579008667b 100644
--- a/topi/python/topi/testing/__init__.py
+++ b/topi/python/topi/testing/__init__.py
@@ -21,3 +21,4 @@
 from .l2_normalize_python import l2_normalize_python
 from .gather_nd_python import gather_nd_python
 from .strided_slice_python import strided_slice_python
+from .batch_matmul import batch_matmul
diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 41096634aca8..d5a6ca76f7ed 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -238,14 +238,14 @@ def _default_dense_nopack_config(cfg, M, N, K):
     cfg["tile_y"] = SplitEntity([1, M])
 
 
-@generic.schedule_batch_dot.register(["cpu"])
-def schedule_batch_dot(outs):
-    """Schedule for softmax
+@generic.schedule_batch_matmul.register(["cpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
 
     Parameters
     ----------
     outs: Array of Tensor
-        The computation graph description of softmax
+        The computation graph description of batch_matmul
         in the format of an array of tensors.
Returns @@ -256,7 +256,7 @@ def schedule_batch_dot(outs): s = tvm.create_schedule([x.op for x in outs]) def _callback(op): - if "batch_dot" in op.tag: + if "batch_matmul" in op.tag: C = op.output(0) A, B = s[C].op.input_tensors _, M, N = get_const_tuple(C.shape) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index b1118c582382..e4ec7bf24b01 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include #include @@ -358,10 +358,10 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); -/* Ops from nn/batch_dot.h */ -TVM_REGISTER_GLOBAL("topi.nn.batch_dot") +/* Ops from nn/batch_matmul.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_matmul") .set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = nn::batch_dot(args[0], args[1]); + *rv = nn::batch_matmul(args[0], args[1]); }); /* Ops from nn/dilate.h */ @@ -617,7 +617,7 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); -TVM_REGISTER_GENERIC_FUNC(schedule_batch_dot) +TVM_REGISTER_GENERIC_FUNC(schedule_batch_matmul) .set_default(WrapSchedule(topi::generic::default_schedule)); TVM_REGISTER_GENERIC_FUNC(schedule_pool) diff --git a/topi/tests/python/test_topi_batch_dot.py b/topi/tests/python/test_topi_batch_matmul.py similarity index 71% rename from topi/tests/python/test_topi_batch_dot.py rename to topi/tests/python/test_topi_batch_matmul.py index d904606164f8..8ac3e24a3408 100644 --- a/topi/tests/python/test_topi_batch_dot.py +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -1,4 +1,4 @@ -"""Test code for batch_dot operator""" +"""Test code for batch_matmul operator""" import numpy as np import tvm import topi @@ -8,19 +8,17 @@ from common import get_all_backend -def verify_batch_dot(batch, M, N, K): +def verify_batch_matmul(batch, M, N, K): x = tvm.placeholder((batch, M, K), name='x') y = tvm.placeholder((batch, N, K), name='y') dtype = x.dtype # use memoize to pickle the test data for next time use - @memoize("topi.tests.test_topi_batch_dot") + @memoize("topi.tests.test_topi_batch_matmul") def get_ref_data(): a_np = np.random.uniform(size=(batch, M, K)).astype(dtype) b_np = np.random.uniform(size=(batch, N, K)).astype(dtype) - c_np = np.zeros((batch, M, N)).astype(dtype) - for i in range(batch): - c_np[i] = np.dot(a_np[i], b_np[i].T) + c_np = topi.testing.batch_matmul(a_np, b_np) return (a_np, b_np, c_np) # get the test data a_np, b_np, c_np = get_ref_data() @@ -32,8 +30,8 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.create(device): - out = topi.nn.batch_dot(x, y) - s = topi.generic.schedule_batch_dot([out]) + out = topi.nn.batch_matmul(x, y) + s = topi.generic.schedule_batch_matmul([out]) a = tvm.nd.array(a_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) @@ -44,11 +42,11 @@ def check_device(device): for device in get_all_backend(): check_device(device) -def test_batch_dot(): - verify_batch_dot(1, 16, 16, 32) - verify_batch_dot(5, 16, 16, 32) - verify_batch_dot(5, 16, 20, 32) +def test_batch_matmul(): + verify_batch_matmul(1, 16, 16, 32) + verify_batch_matmul(5, 16, 16, 32) + verify_batch_matmul(5, 16, 20, 32) if __name__ == "__main__": - test_batch_dot() + test_batch_matmul() From e20b1e8dab15a5443f55dd479cd5352585c813c4 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 11 Feb 2019 14:55:16 -0800 
Subject: [PATCH 04/14] nits --- docs/langref/relay_op.rst | 4 ++-- tests/python/relay/test_op_level10.py | 1 - topi/include/topi/nn/batch_matmul.h | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 2d7a4e6fcc21..c6702d4b4e51 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -150,7 +150,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape - tvm.relay.nn.batch_dot + tvm.relay.nn.batch_matmul Level 1 Definitions @@ -261,4 +261,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape -.. autofunction:: tvm.relay.nn.batch_dot +.. autofunction:: tvm.relay.nn.batch_matmul diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 9552342f1a3a..6d68197ff7ac 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -128,7 +128,6 @@ def verify_reverse_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reverse_reshape(x, newshape=newshape) zz = relay.ir_pass.infer_type(z) - print(zz.checked_type) assert "newshape=" in z.astext() assert zz.checked_type == relay.ty.TensorType(oshape, "float32") diff --git a/topi/include/topi/nn/batch_matmul.h b/topi/include/topi/nn/batch_matmul.h index b59ffe02e942..968e1b0c697c 100644 --- a/topi/include/topi/nn/batch_matmul.h +++ b/topi/include/topi/nn/batch_matmul.h @@ -25,8 +25,8 @@ using namespace tvm; */ inline tvm::Tensor batch_matmul(const tvm::Tensor& x, const tvm::Tensor& y) { - CHECK_EQ(x->shape.size(), 3) << "batch_dot requires 3-D data"; - CHECK_EQ(y->shape.size(), 3) << "batch_dot requires 3-D data"; + CHECK_EQ(x->shape.size(), 3) << "batch_matmul requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_matmul requires 3-D data"; auto batch = x->shape[0]; auto M = x->shape[1]; From 15f4b7c23acc7200d161c4c54ae5d52528901236 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Mon, 11 Feb 2019 15:32:15 -0800 Subject: [PATCH 05/14] Add missing file --- topi/python/topi/testing/batch_matmul.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 topi/python/topi/testing/batch_matmul.py diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py new file mode 100644 index 000000000000..9d3c5cc47a85 --- /dev/null +++ b/topi/python/topi/testing/batch_matmul.py @@ -0,0 +1,11 @@ +# pylint: disable=invalid-name +"""Batch matmul in python""" +import numpy as np + +def batch_matmul(x, y): + batch, M, K = x.shape + N = y.shape[1] + out = np.zeros((batch, M, N)).astype(x.dtype) + for i in range(batch): + out[i] = np.dot(x[i], y[i].T) + return out From 779b4590184eb60a88c7427ce98a2953d301d197 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 12 Feb 2019 00:40:53 +0000 Subject: [PATCH 06/14] Put batch_matmul and dense x86 schedule in separate files --- python/tvm/relay/frontend/mxnet.py | 5 + topi/python/topi/x86/batch_matmul.py | 55 ++++++ topi/python/topi/x86/dense.py | 208 ++++++++++++++++++++++ topi/python/topi/x86/nn.py | 250 +-------------------------- 4 files changed, 269 insertions(+), 249 deletions(-) create mode 100644 topi/python/topi/x86/batch_matmul.py create mode 100644 topi/python/topi/x86/dense.py diff --git a/python/tvm/relay/frontend/mxnet.py 
b/python/tvm/relay/frontend/mxnet.py
index 26cd565c91c9..afb1d386e800 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -269,6 +269,11 @@ def _mx_multibox_detection(inputs, attrs):
 
 
 def _mx_batch_dot(inputs, attrs):
+    transpose_a = attrs.get_bool("transpose_a", False)
+    transpose_b = attrs.get_bool("transpose_b", False)
+    if transpose_a is True or transpose_b is False:
+        raise RuntimeError("batch_dot: only support transpose_a=False and "
+                           "transpose_b=True")
     return _op.nn.batch_matmul(inputs[0], inputs[1])
 
 
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
new file mode 100644
index 000000000000..0723ed36da60
--- /dev/null
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -0,0 +1,55 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""x86 batch_matmul operators"""
+from __future__ import absolute_import as _abs
+import tvm
+
+from .util import get_max_power2_factor
+from .. import generic
+from ..util import traverse_inline, get_const_tuple
+
+
+@generic.schedule_batch_matmul.register(["cpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of batch_matmul
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if "batch_matmul" in op.tag:
+            C = op.output(0)
+            A, B = s[C].op.input_tensors
+            _, M, N = get_const_tuple(C.shape)
+            k, = s[C].op.reduce_axis
+            ko, ki = s[C].split(k, 16)
+            CC = s.rfactor(C, ki)
+
+            b, y, x = s[C].op.axis
+            y_bn = get_max_power2_factor(M, 8)
+            x_bn = get_max_power2_factor(N, 8)
+            yo, yi = s[C].split(y, y_bn)
+            xo, xi = s[C].split(x, x_bn)
+            s[C].reorder(b, yo, xo, yi, xi)
+            bxyo = s[C].fuse(b, yo, xo)
+            s[C].parallel(bxyo)
+            s[C].fuse(yi, xi)
+
+            s[CC].compute_at(s[C], bxyo)
+            _, _, y, x = s[CC].op.axis
+            s[CC].fuse(y, x)
+            s[CC].vectorize(s[CC].op.axis[0])
+            s[C].pragma(bxyo, 'auto_unroll_max_step', 16)
+
+    traverse_inline(s, outs[0].op, _callback)
+
+    return s
diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py
new file mode 100644
index 000000000000..3ba3909c54a4
--- /dev/null
+++ b/topi/python/topi/x86/dense.py
@@ -0,0 +1,208 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""x86 dense operators"""
+from __future__ import absolute_import as _abs
+import tvm
+from tvm import autotvm
+from tvm.autotvm.task.space import SplitEntity
+
+from .util import get_fp32_len, get_max_power2_factor
+from .. 
import generic, tag, nn +from ..util import traverse_inline, get_const_tuple + +@autotvm.register_topi_compute(nn.dense, "cpu", "direct") +def _declaration_dense(cfg, data, weight, bias=None): + batch, _ = get_const_tuple(data.shape) + + # For small batch sizes, don't pack weight into cache-friendly layout + # because of overhead in packing and limited reuse from batch dimension + # TODO(icemelon9): use a more systematic way to determine which schedule to use + if batch <= 16: + return _declaration_dense_nopack(cfg, data, weight, bias) + return _declaration_dense_pack(cfg, data, weight, bias) + + +# Declare dense compute with packing weight into cache-friendly layout +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") +def _declaration_dense_pack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", batch, num_outputs=3) + cfg.define_split("tile_x", out_dim, num_outputs=3) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, batch, out_dim, in_dim) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (out_dim // packw_bn, in_dim, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + k = tvm.reduce_axis((0, in_dim), name="k") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum( + data[y, k] * packw[x // packw_bn, k, x % packw_bn], + axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + return C + + +# Declare dense compute without packing weight +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") +def _declaration_dense_nopack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_x", out_dim, num_outputs=2) + cfg.define_split("tile_y", batch, num_outputs=2) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, batch, out_dim, in_dim) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, in_dim // vec), "k") + CC = tvm.compute((batch, out_dim, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + + return C + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") +def _schedule_dense(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + elif 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") +def _schedule_dense_pack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") +def _schedule_dense_nopack(cfg, outs): + s = 
tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_pack_template(cfg, s, C): + A, packedB = s[C].op.input_tensors + + CC = s.cache_write(C, "global") + y, x = s[C].op.axis + k, = s[CC].op.reduce_axis + + yt, yo, yi = cfg["tile_y"].apply(s, C, y) + xt, xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yt, xt, yo, xo, yi, xi) + xyt = s[C].fuse(yt, xt) + s[C].parallel(xyt) + xyo = s[C].fuse(yo, xo) + s[C].unroll(yi) + s[C].vectorize(xi) + + s[CC].compute_at(s[C], xyo) + y, x = s[CC].op.axis + ko, ki = cfg["tile_k"].apply(s, CC, k) + s[CC].reorder(ko, ki, y, x) + s[CC].vectorize(x) + s[CC].unroll(y) + s[CC].unroll(ki) + + z, y, x = s[packedB].op.axis + s[packedB].reorder(z, x, y) + s[packedB].parallel(z) + s[packedB].vectorize(y) + return s + + +def _schedule_dense_nopack_template(cfg, s, C): + y, x = s[C].op.axis + kk, = s[C].op.reduce_axis + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yo, xo, yi, xi) + xyo = s[C].fuse(yo, xo) + s[C].parallel(xyo) + s[C].unroll(kk) + + CC, = s[C].op.input_tensors + s[CC].compute_at(s[C], xyo) + z, y, x = s[CC].op.axis + k, = s[CC].op.reduce_axis + yz = s[CC].fuse(z, y) + s[CC].reorder(k, yz, x) + s[CC].unroll(yz) + s[CC].vectorize(x) + return s + + +def _default_dense_pack_config(cfg, M, N, K): + vec_width = get_fp32_len() + + tilex_ii = 1 + for bn in range(vec_width*2, 0, -1): + if N % bn == 0: + tilex_ii = bn + break + NN = N // tilex_ii + tilex_oi = 1 + while NN // tilex_oi > 4: + if (NN // tilex_oi) % 2 == 1: + break + tilex_oi *= 2 + + tiley_ii = 8 + while M % tiley_ii != 0: + tiley_ii //= 2 + MM = M // tiley_ii + tiley_oi = 1 + while MM // tiley_oi > 4: + if (MM // tiley_oi) % 2 == 1: + break + tiley_oi *= 2 + + cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) + cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) + cfg["tile_k"] = SplitEntity([K, 1]) + + +def _default_dense_nopack_config(cfg, M, N, K): + vec_width = get_fp32_len() + tilek_bn = 1 + for bn in range(vec_width*2, 0, -1): + if K % bn == 0: + tilek_bn = bn + break + cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) + cfg["tile_x"] = SplitEntity([N, 1]) + cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py index d5a6ca76f7ed..3b57c0096c58 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -5,9 +5,7 @@ from tvm import autotvm from tvm.autotvm.task.space import SplitEntity -from .util import get_fp32_len, get_max_power2_factor -from .. import generic, tag, nn -from ..util import traverse_inline, get_const_tuple +from .. 
import generic @generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): @@ -37,249 +35,3 @@ def schedule_softmax(outs): else: s[x].parallel(s[x].op.axis[0]) return s - - -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None): - batch, _ = get_const_tuple(data.shape) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if batch <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias) - return _declaration_dense_pack(cfg, data, weight, bias) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", batch, num_outputs=3) - cfg.define_split("tile_x", out_dim, num_outputs=3) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, batch, out_dim, in_dim) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (out_dim // packw_bn, in_dim, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - k = tvm.reduce_axis((0, in_dim), name="k") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum( - data[y, k] * packw[x // packw_bn, k, x % packw_bn], - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_x", out_dim, num_outputs=2) - cfg.define_split("tile_y", batch, num_outputs=2) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, batch, out_dim, in_dim) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, in_dim // vec), "k") - CC = tvm.compute((batch, out_dim, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") -def _schedule_dense_pack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - 
-@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack")
-def _schedule_dense_nopack(cfg, outs):
-    s = tvm.create_schedule([x.op for x in outs])
-
-    def _callback(op):
-        if 'dense_nopack' in op.tag:
-            _schedule_dense_nopack_template(cfg, s, op.output(0))
-    traverse_inline(s, outs[0].op, _callback)
-    return s
-
-
-def _schedule_dense_pack_template(cfg, s, C):
-    A, packedB = s[C].op.input_tensors
-
-    CC = s.cache_write(C, "global")
-    y, x = s[C].op.axis
-    k, = s[CC].op.reduce_axis
-
-    yt, yo, yi = cfg["tile_y"].apply(s, C, y)
-    xt, xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yt, xt, yo, xo, yi, xi)
-    xyt = s[C].fuse(yt, xt)
-    s[C].parallel(xyt)
-    xyo = s[C].fuse(yo, xo)
-    s[C].unroll(yi)
-    s[C].vectorize(xi)
-
-    s[CC].compute_at(s[C], xyo)
-    y, x = s[CC].op.axis
-    ko, ki = cfg["tile_k"].apply(s, CC, k)
-    s[CC].reorder(ko, ki, y, x)
-    s[CC].vectorize(x)
-    s[CC].unroll(y)
-    s[CC].unroll(ki)
-
-    z, y, x = s[packedB].op.axis
-    s[packedB].reorder(z, x, y)
-    s[packedB].parallel(z)
-    s[packedB].vectorize(y)
-    return s
-
-
-def _schedule_dense_nopack_template(cfg, s, C):
-    y, x = s[C].op.axis
-    kk, = s[C].op.reduce_axis
-    yo, yi = cfg["tile_y"].apply(s, C, y)
-    xo, xi = cfg["tile_x"].apply(s, C, x)
-    s[C].reorder(yo, xo, yi, xi)
-    xyo = s[C].fuse(yo, xo)
-    s[C].parallel(xyo)
-    s[C].unroll(kk)
-
-    CC, = s[C].op.input_tensors
-    s[CC].compute_at(s[C], xyo)
-    z, y, x = s[CC].op.axis
-    k, = s[CC].op.reduce_axis
-    yz = s[CC].fuse(z, y)
-    s[CC].reorder(k, yz, x)
-    s[CC].unroll(yz)
-    s[CC].vectorize(x)
-    return s
-
-
-def _default_dense_pack_config(cfg, M, N, K):
-    vec_width = get_fp32_len()
-
-    tilex_ii = 1
-    for bn in range(vec_width*2, 0, -1):
-        if N % bn == 0:
-            tilex_ii = bn
-            break
-    NN = N // tilex_ii
-    tilex_oi = 1
-    while NN // tilex_oi > 4:
-        if (NN // tilex_oi) % 2 == 1:
-            break
-        tilex_oi *= 2
-
-    tiley_ii = 8
-    while M % tiley_ii != 0:
-        tiley_ii //= 2
-    MM = M // tiley_ii
-    tiley_oi = 1
-    while MM // tiley_oi > 4:
-        if (MM // tiley_oi) % 2 == 1:
-            break
-        tiley_oi *= 2
-
-    cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii])
-    cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii])
-    cfg["tile_k"] = SplitEntity([K, 1])
-
-
-def _default_dense_nopack_config(cfg, M, N, K):
-    vec_width = get_fp32_len()
-    tilek_bn = 1
-    for bn in range(vec_width*2, 0, -1):
-        if K % bn == 0:
-            tilek_bn = bn
-            break
-    cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn])
-    cfg["tile_x"] = SplitEntity([N, 1])
-    cfg["tile_y"] = SplitEntity([1, M])
-
-
-@generic.schedule_batch_matmul.register(["cpu"])
-def schedule_batch_matmul(outs):
-    """Schedule for batch_matmul
-
-    Parameters
-    ----------
-    outs: Array of Tensor
-        The computation graph description of batch_matmul
-        in the format of an array of tensors.
-
-    Returns
-    -------
-    sch: Schedule
-        The computation schedule for the op.
- """ - s = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "batch_matmul" in op.tag: - C = op.output(0) - A, B = s[C].op.input_tensors - _, M, N = get_const_tuple(C.shape) - k, = s[C].op.reduce_axis - ko, ki = s[C].split(k, 16) - CC = s.rfactor(C, ki) - - b, y, x = s[C].op.axis - y_bn = get_max_power2_factor(M, 8) - x_bn = get_max_power2_factor(N, 8) - yo, yi = s[C].split(y, y_bn) - xo, xi = s[C].split(x, x_bn) - s[C].reorder(b, yo, xo, yi, xi) - bxyo = s[C].fuse(b, yo, xo) - s[C].parallel(bxyo) - s[C].fuse(yi, xi) - - s[CC].compute_at(s[C], bxyo) - _, _, y, x = s[CC].op.axis - s[CC].fuse(y, x) - s[CC].vectorize(s[CC].op.axis[0]) - s[C].pragma(bxyo, 'auto_unroll_max_step', 16) - - traverse_inline(s, outs[0].op, _callback) - - return s From 2e639684d1291fd55d71365b95b10f3ad5361607 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 12 Feb 2019 01:33:04 +0000 Subject: [PATCH 07/14] Fix pylint --- topi/python/topi/nn/batch_matmul.py | 19 +++++++++++++++++-- topi/python/topi/testing/batch_matmul.py | 17 ++++++++++++++++- topi/python/topi/x86/batch_matmul.py | 3 +-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py index 51c4a69c5dbc..07e363868b05 100644 --- a/topi/python/topi/nn/batch_matmul.py +++ b/topi/python/topi/nn/batch_matmul.py @@ -1,11 +1,27 @@ """Binary Neural Network (BNN) Operators""" +# pylint: disable=invalid-name from __future__ import absolute_import as _abs import tvm -from .. import tag from ..util import get_const_tuple def batch_matmul(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.TEnsor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" x_shape = get_const_tuple(x.shape) y_shape = get_const_tuple(y.shape) @@ -14,7 +30,6 @@ def batch_matmul(x, y): batch, M, K = x.shape N = y.shape[1] k = tvm.reduce_axis((0, K), name='k') - return tvm.compute((batch, M, N), lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), tag='batch_matmul') diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py index 9d3c5cc47a85..a7b2f9344f29 100644 --- a/topi/python/topi/testing/batch_matmul.py +++ b/topi/python/topi/testing/batch_matmul.py @@ -3,7 +3,22 @@ import numpy as np def batch_matmul(x, y): - batch, M, K = x.shape + """batch_matmul operator implemented in numpy. 
+
+    Parameters
+    ----------
+    x : numpy.ndarray
+        3-D with shape [batch, M, K]
+
+    y : numpy.ndarray
+        3-D with shape [batch, N, K]
+
+    Returns
+    -------
+    out : numpy.ndarray
+        3-D with shape [batch, M, N]
+    """
+    batch, M, _ = x.shape
     N = y.shape[1]
     out = np.zeros((batch, M, N)).astype(x.dtype)
     for i in range(batch):
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
index 0723ed36da60..57a4361bf45f 100644
--- a/topi/python/topi/x86/batch_matmul.py
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -49,7 +49,6 @@ def _callback(op):
             s[CC].fuse(y, x)
             s[CC].vectorize(s[CC].op.axis[0])
             s[C].pragma(bxyo, 'auto_unroll_max_step', 16)
-
+
     traverse_inline(s, outs[0].op, _callback)
-
     return s

From 29fae0f2fa97641e21af3526e9c1c1cb89d9c758 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Tue, 12 Feb 2019 00:49:09 -0800
Subject: [PATCH 08/14] Remove unused import

---
 topi/python/topi/x86/dense.py | 2 +-
 topi/python/topi/x86/nn.py    | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py
index 3ba3909c54a4..33575b4c399d 100644
--- a/topi/python/topi/x86/dense.py
+++ b/topi/python/topi/x86/dense.py
@@ -5,7 +5,7 @@
 from tvm import autotvm
 from tvm.autotvm.task.space import SplitEntity
 
-from .util import get_fp32_len, get_max_power2_factor
+from .util import get_fp32_len
 from .. import generic, tag, nn
 from ..util import traverse_inline, get_const_tuple
 
diff --git a/topi/python/topi/x86/nn.py b/topi/python/topi/x86/nn.py
index 3b57c0096c58..73463242e96d 100644
--- a/topi/python/topi/x86/nn.py
+++ b/topi/python/topi/x86/nn.py
@@ -2,9 +2,6 @@
 """x86 nn operators"""
 from __future__ import absolute_import as _abs
 import tvm
-from tvm import autotvm
-from tvm.autotvm.task.space import SplitEntity
-
 from .. import generic
 
 @generic.schedule_softmax.register(["cpu"])

From 9a983f7fc189259c8d89ec163f4ad95f90fc318e Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Wed, 13 Feb 2019 03:47:48 +0000
Subject: [PATCH 09/14] Add cuda schedule for batch_matmul

---
 topi/python/topi/cuda/__init__.py     |  1 +
 topi/python/topi/cuda/batch_matmul.py | 89 +++++++++++++++++++++++++++
 topi/python/topi/util.py              | 26 ++++
 topi/python/topi/x86/batch_matmul.py  |  3 +-
 topi/python/topi/x86/util.py          |  9 ---
 5 files changed, 117 insertions(+), 11 deletions(-)
 create mode 100644 topi/python/topi/cuda/batch_matmul.py

diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py
index 28d2eb258bea..d06b4f4ca8f1 100644
--- a/topi/python/topi/cuda/__init__.py
+++ b/topi/python/topi/cuda/__init__.py
@@ -14,6 +14,7 @@
 from .pooling import schedule_pool, schedule_global_pool
 from .extern import schedule_extern
 from .nn import schedule_lrn, schedule_l2_normalize
+from .batch_matmul import schedule_batch_matmul
 from .vision import *
 from . import ssd
 from .ssd import *
diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py
new file mode 100644
index 000000000000..a1fa256028da
--- /dev/null
+++ b/topi/python/topi/cuda/batch_matmul.py
@@ -0,0 +1,89 @@
+# pylint: disable=invalid-name,too-many-locals,unused-variable
+"""cuda batch_matmul operators"""
+from __future__ import absolute_import as _abs
+import tvm
+
+from ..
import generic
+from ..util import traverse_inline, get_const_tuple, get_max_power2_factor
+
+
+@generic.schedule_batch_matmul.register(["cuda", "gpu"])
+def schedule_batch_matmul(outs):
+    """Schedule for batch_matmul
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+        The computation graph description of batch_matmul
+        in the format of an array of tensors.
+
+    Returns
+    -------
+    s: Schedule
+        The computation schedule for the op.
+    """
+    s = tvm.create_schedule([x.op for x in outs])
+
+    def _schedule(op):
+        C = op.output(0)
+        A, B = s[C].op.input_tensors
+        _, M, N = get_const_tuple(C.shape)
+        AA = s.cache_read(A, "shared", [C])
+        AL = s.cache_read(AA, "local", [C])
+        BB = s.cache_read(B, "shared", [C])
+        BL = s.cache_read(BB, "local", [C])
+        CC = s.cache_write(C, "local")
+
+        b, y, x = s[C].op.axis
+        y_bn = get_max_power2_factor(M, 64)
+        x_bn = get_max_power2_factor(N, 64)
+        by, y = s[C].split(y, y_bn)
+        bx, x = s[C].split(x, x_bn)
+        y_nthreads = min(y_bn, 8)
+        x_nthreads = min(x_bn, 8)
+        ty, yi = s[C].split(y, nparts=y_nthreads)
+        tx, xi = s[C].split(x, nparts=x_nthreads)
+        thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x")
+        thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y")
+
+        s[C].reorder(b, by, bx, ty, tx, yi, xi)
+        s[C].bind(b, tvm.thread_axis("blockIdx.z"))
+        s[C].bind(by, tvm.thread_axis("blockIdx.y"))
+        s[C].bind(bx, tvm.thread_axis("blockIdx.x"))
+        s[C].bind(ty, thread_y)
+        s[C].bind(tx, thread_x)
+        s[C].pragma(yi, "auto_unroll_max_step", 16)
+
+        s[CC].compute_at(s[C], tx)
+        _, yi, xi = s[CC].op.axis
+        k, = s[CC].op.reduce_axis
+        ko, ki = s[CC].split(k, 8)
+        s[CC].reorder(ko, ki, yi, xi)
+        s[CC].pragma(ki, "auto_unroll_max_step", 16)
+
+        s[AA].compute_at(s[CC], ko)
+        s[AL].compute_at(s[CC], ki)
+        s[BB].compute_at(s[CC], ko)
+        s[BL].compute_at(s[CC], ki)
+        _, y, k = s[AA].op.axis
+        ty, yi = s[AA].split(y, nparts=y_nthreads)
+        tx, ki = s[AA].split(k, nparts=x_nthreads)
+        s[AA].reorder(ty, tx, yi, ki)
+        s[AA].bind(ty, thread_y)
+        s[AA].bind(tx, thread_x)
+        s[AA].pragma(yi, "auto_unroll_max_step", 16)
+
+        _, x, k = s[BB].op.axis
+        ty, xi = s[BB].split(x, nparts=y_nthreads)
+        tx, ki = s[BB].split(k, nparts=x_nthreads)
+        s[BB].bind(ty, thread_y)
+        s[BB].bind(tx, thread_x)
+        s[BB].reorder(ty, tx, xi, ki)
+        s[BB].pragma(xi, "auto_unroll_max_step", 16)
+
+    def _callback(op):
+        if "batch_matmul" in op.tag:
+            _schedule(op)
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py
index 6d7326580f6d..d630628b4379 100644
--- a/topi/python/topi/util.py
+++ b/topi/python/topi/util.py
@@ -255,3 +255,29 @@ def select_array(i, j):
             return now
 
     return tvm.compute(matrix.shape, select_array, name=name)
+
+
+def get_max_power2_factor(n, max_value=None):
+    """Get the largest power-of-2 factor of n. If max_value is specified, the
+    factor is no more than max_value.
+
+    Parameters
+    ----------
+    n : int
+        The input value
+
+    max_value : int, optional
+        The max value for the factor
+
+    Returns
+    -------
+    factor : int
+        The largest power-of-2 factor.
+    """
+    x = 1
+    while n % 2 == 0:
+        if max_value is not None and max_value < x * 2:
+            break
+        x *= 2
+        n //= 2
+    return x
diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py
index 57a4361bf45f..37890e389366 100644
--- a/topi/python/topi/x86/batch_matmul.py
+++ b/topi/python/topi/x86/batch_matmul.py
@@ -3,9 +3,8 @@
 from __future__ import absolute_import as _abs
 import tvm
 
-from .util import get_max_power2_factor
 from ..
import generic
-from ..util import traverse_inline, get_const_tuple
+from ..util import traverse_inline, get_const_tuple, get_max_power2_factor
 
 
 @generic.schedule_batch_matmul.register(["cpu"])
diff --git a/topi/python/topi/x86/util.py b/topi/python/topi/x86/util.py
index a8a19365e2ab..678ff8e24cff 100644
--- a/topi/python/topi/x86/util.py
+++ b/topi/python/topi/x86/util.py
@@ -10,12 +10,3 @@ def get_fp32_len():
     if opt == '-mcpu=skylake-avx512':
         fp32_vec_len = 16
     return fp32_vec_len
-
-def get_max_power2_factor(n, max_value=None):
-    x = 1
-    while n % 2 == 0:
-        if max_value is not None and max_value < x * 2:
-            break
-        x *= 2
-        n /= 2
-    return x

From 33ccd4bc94922f409ee48f123871c0b5633170ba Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 18 Feb 2019 14:52:15 -0800
Subject: [PATCH 10/14] Add test case with larger batch size

---
 tests/python/relay/test_op_level10.py       | 1 +
 topi/tests/python/test_topi_batch_matmul.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py
index 6d68197ff7ac..34285d2b18dd 100644
--- a/tests/python/relay/test_op_level10.py
+++ b/tests/python/relay/test_op_level10.py
@@ -174,6 +174,7 @@ def test_batch_matmul():
     verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16))
     verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16))
     verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20))
+    verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20))
 
 
 if __name__ == "__main__":
diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py
index 8ac3e24a3408..f699d6aa8dcb 100644
--- a/topi/tests/python/test_topi_batch_matmul.py
+++ b/topi/tests/python/test_topi_batch_matmul.py
@@ -46,6 +46,7 @@ def test_batch_matmul():
     verify_batch_matmul(1, 16, 16, 32)
     verify_batch_matmul(5, 16, 16, 32)
     verify_batch_matmul(5, 16, 20, 32)
+    verify_batch_matmul(30, 16, 20, 32)
 
 
 if __name__ == "__main__":

From 8cc095d6426bbe71028a9035b21e752726403bb1 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Mon, 18 Feb 2019 15:31:58 -0800
Subject: [PATCH 11/14] Add batch_matmul in api doc

---
 docs/api/python/topi.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst
index 856bad198e88..4d2eeb7692c4 100644
--- a/docs/api/python/topi.rst
+++ b/docs/api/python/topi.rst
@@ -41,6 +41,7 @@ List of operators
    topi.nn.upsampling
    topi.nn.softmax
    topi.nn.dense
+   topi.nn.batch_matmul
    topi.nn.log_softmax
    topi.nn.conv2d_nchw
    topi.nn.conv2d_hwcn
@@ -134,6 +135,7 @@ topi.nn
 .. autofunction:: topi.nn.upsampling
 .. autofunction:: topi.nn.softmax
 .. autofunction:: topi.nn.dense
+.. autofunction:: topi.nn.batch_matmul
 .. autofunction:: topi.nn.log_softmax
 .. autofunction:: topi.nn.conv2d_nchw
 ..
autofunction:: topi.nn.conv2d_hwcn

From 41b1d3646e30075c0a38dc68016d09f5b86ecf9c Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 21 Feb 2019 14:12:59 -0800
Subject: [PATCH 12/14] Fix quantize pass rounding error

---
 tests/python/relay/test_pass_quantize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/relay/test_pass_quantize.py b/tests/python/relay/test_pass_quantize.py
index 6d65d7b2d9ee..2e2389d16244 100644
--- a/tests/python/relay/test_pass_quantize.py
+++ b/tests/python/relay/test_pass_quantize.py
@@ -75,7 +75,7 @@ def make_qgraph(data, weight):
     graph = relay.create_executor('graph')
     res0 = graph.evaluate(qgraph0)(dataset[0]['data'])
     res1 = graph.evaluate(qgraph1)(dataset[0]['data'])
-    tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy())
+    tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy(), rtol=1e-3)
 
 
 if __name__ == "__main__":

From d47383320ca36ac86265031614b656c53706c687 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 28 Feb 2019 16:48:10 -0800
Subject: [PATCH 13/14] Fix pylint and minor change

---
 python/tvm/relay/frontend/mxnet.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index d2461fadec76..4112825c2f8f 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -286,12 +286,13 @@ def _mx_multibox_detection(inputs, attrs):
 def _mx_batch_dot(inputs, attrs):
     transpose_a = attrs.get_bool("transpose_a", False)
     transpose_b = attrs.get_bool("transpose_b", False)
-    if transpose_a is True or transpose_b is False:
-        raise RuntimeError("batch_dot: only support transpose_a=False and "
-                           "transpose_b=True")
+    if transpose_a is True:
+        raise RuntimeError("batch_dot: only support transpose_a=False")
+    if transpose_b is False:
+        b = _op.transpose(b, axes=[0, 2, 1])
     return _op.batch_matmul(inputs[0], inputs[1])
-
+
 def _mx_arange(inputs, attrs):
     assert len(inputs) == 0
     if attrs.get_int("repeat", 1) != 1:
@@ -302,7 +303,6 @@ def _mx_arange(inputs, attrs):
     new_attrs["step"] = attrs.get_float("step", 1)
     new_attrs["dtype"] = attrs.get_str("dtype", "float32")
     return _op.arange(**new_attrs)
-
 
 def _mx_roi_align(inputs, attrs):

From be4d7ce2a633828e9de4ef9313775d16fd026d77 Mon Sep 17 00:00:00 2001
From: Haichen Shen
Date: Thu, 28 Feb 2019 17:48:19 -0800
Subject: [PATCH 14/14] bug fix

---
 python/tvm/relay/frontend/mxnet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py
index 4112825c2f8f..3d3bb8e4fd84 100644
--- a/python/tvm/relay/frontend/mxnet.py
+++ b/python/tvm/relay/frontend/mxnet.py
@@ -284,13 +284,15 @@ def _mx_multibox_detection(inputs, attrs):
 
 
 def _mx_batch_dot(inputs, attrs):
+    assert len(inputs) == 2
+    a, b = inputs
     transpose_a = attrs.get_bool("transpose_a", False)
     transpose_b = attrs.get_bool("transpose_b", False)
     if transpose_a is True:
         raise RuntimeError("batch_dot: only support transpose_a=False")
     if transpose_b is False:
         b = _op.transpose(b, axes=[0, 2, 1])
-    return _op.batch_matmul(inputs[0], inputs[1])
+    return _op.batch_matmul(a, b)
 
 
 def _mx_arange(inputs, attrs):
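Example usage (not part of the patch series): a minimal sketch that exercises the
new `topi.nn.batch_matmul` op end to end, in the spirit of
`topi/tests/python/test_topi_batch_matmul.py` above. The `llvm` target, the
shapes, and the float32 dtype are illustrative assumptions, not fixed by the
patches.

    import numpy as np
    import tvm
    import topi
    import topi.testing

    # batch_matmul expects x: (batch, M, K) and y: (batch, N, K); the second
    # operand is laid out transposed, so out[b] = dot(x[b], y[b].T).
    batch, M, N, K = 5, 16, 20, 32
    x = tvm.placeholder((batch, M, K), name='x')
    y = tvm.placeholder((batch, N, K), name='y')
    out = topi.nn.batch_matmul(x, y)  # shape (batch, M, N)

    # Dispatch to the CPU schedule registered by this series; a "cuda" target
    # would select the GPU schedule instead.
    with tvm.target.create('llvm'):
        s = topi.generic.schedule_batch_matmul([out])
    f = tvm.build(s, [x, y, out], 'llvm')

    ctx = tvm.cpu(0)
    a_np = np.random.uniform(size=(batch, M, K)).astype('float32')
    b_np = np.random.uniform(size=(batch, N, K)).astype('float32')
    c = tvm.nd.array(np.zeros((batch, M, N), dtype='float32'), ctx)
    f(tvm.nd.array(a_np, ctx), tvm.nd.array(b_np, ctx), c)

    # Compare against the numpy reference added in topi.testing.
    tvm.testing.assert_allclose(c.asnumpy(),
                                topi.testing.batch_matmul(a_np, b_np),
                                rtol=1e-5)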