From b2228c0f17698e99cf95bdeeca587ad52e6b5223 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Mon, 4 Oct 2021 04:55:32 -0500 Subject: [PATCH 01/11] [ONNX] Add MatMulInteger16 contrib op --- python/tvm/relay/frontend/onnx.py | 83 ++++++++++++++++++++++ tests/python/frontend/onnx/test_forward.py | 43 ++++++++++- 2 files changed, 125 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 86cb178d0875..7c5b9461c4f9 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -873,6 +873,88 @@ def flatten_to_nd(x, x_shape, nd=3): return _op.nn.dense(inputs[0], input_1_t) +class MatMulInteger16(OnnxOpConverter): + """Operator converter for MatMulInteger16 from Microsoft onnxruntime contrib opset.""" + + @classmethod + def _impl_v10(cls, inputs, attr, params): + assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs)) + a_shape = shape_of(inputs[0]) + a_rank = infer_shape(a_shape)[0] + b_shape = shape_of(inputs[1]) + b_rank = infer_shape(b_shape)[0] + a_dtype = infer_type(inputs[0]).checked_type.dtype + b_dtype = infer_type(inputs[1]).checked_type.dtype + # Check input data types + assert a_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for first input" + assert b_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for second input" + out_dtype = "int32" + # Set output data type as uint32 when both inputs are uint16 + if a_dtype == "uint16" and b_dtype == "uint16": + out_dtype = "uint32" + if a_rank > 2 or b_rank > 2: + def flatten_to_nd(x, x_shape, nd=3): + ndims = infer_shape(x_shape)[0] + if ndims == nd: + return x + newshape = _op.concatenate( + [ + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), + ], + 0, + ) + out = _op.reshape(x, fold_constant(newshape)) + return out + + b_type = infer_type(inputs[1]) + # Convert to dense if the second matrix is 2d and non-dynamic + if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): + a = flatten_to_nd(inputs[0], a_shape, 2) + b = _op.transpose(inputs[1]) + output = _op.nn.dense(a, b, out_dtype=out_dtype) + else: + # Convert a and b into 3 dimensional tensors. + a = flatten_to_nd(inputs[0], a_shape, 3) + b = flatten_to_nd(inputs[1], b_shape, 3) + # Perform a NN batch matmul. + output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False) + # Determine the output batch dimension. + if a_rank > b_rank: + out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) + elif a_rank < b_rank: + out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) + # If its unclear how broadcasting should be applied, the output + # shape is determined by choosing the maximum value from each input. + else: + out_batch = _op.concatenate( + [ + _op.maximum( + _op.strided_slice(a_shape, [i], [i + 1]), + _op.strided_slice(b_shape, [i], [i + 1]), + ) + for i in range(a_rank - 2) + ], + 0, + ) + # Reshape output to original dimensions. 
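+            # final_shape = out_batch + (M,) + (N,) for a: (..., M, K), b: (..., K, N);
+            # e.g. a (2, 4, 3) and b (1, 3, 4) give (2,) + (4,) + (4,) = (2, 4, 4).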
+ final_shape = _op.concatenate( + [ + out_batch, + _op.strided_slice( + a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] + ), + _op.strided_slice( + b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] + ), + ], + 0, + ) + return _op.reshape(output, fold_constant(final_shape)) + # Use relay matmul + return _op.nn.matmul(inputs[0], inputs[1], out_dtype=out_dtype) + + class Mod(OnnxOpConverter): """Operator converter for Mod.""" @@ -4144,6 +4226,7 @@ def _get_convert_map(opset): "Softsign": Softsign.get_converter(opset), "Gemm": Gemm.get_converter(opset), "MatMul": MatMul.get_converter(opset), + "MatMulInteger16": MatMulInteger16.get_converter(opset), "Mod": Mod.get_converter(opset), "Xor": Renamer("logical_xor"), # defs/nn diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 69bb44e360ff..6d777eb017b8 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1281,6 +1281,47 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, convert_config=None): convert_config={"use_nt_batch_matmul": False}, ) +@tvm.testing.parametrize_targets +def test_matmulinteger16(target, dev): + def verify_matmulinteger16(a_shape, b_shape, out_shape): + a_dtype = "int16" + b_dtype = "int16" + low = -10 + high = 10 + + a_proto = TensorProto.INT16 + b_proto = TensorProto.INT16 + out_proto = TensorProto.INT32 + a_array = np.random.randint(low, high, size=a_shape).astype(a_dtype) + b_array = np.random.randint(low, high, size=b_shape).astype(b_dtype) + + mul_node = helper.make_node("MatMulInteger16", + ["a", "b"], + ["out"], + domain="com.microsoft") + + graph = helper.make_graph( + [mul_node], + "matmuli16_test", + inputs=[ + helper.make_tensor_value_info("a", a_proto, list(a_shape)), + helper.make_tensor_value_info("b", b_proto, list(b_shape)), + ], + outputs=[helper.make_tensor_value_info("out", out_proto, list(out_shape))], + ) + + model = helper.make_model(graph, producer_name="matmuli16_test") + verify_with_ort_with_inputs(model, [a_array, b_array], target=target, dev=dev) + + # Working tests + verify_matmulinteger16((2, 4, 3), (1, 3, 4), (2, 4, 4)) + verify_matmulinteger16((1, 4, 3), (2, 3, 4), (2, 4, 4)) + verify_matmulinteger16((4, 3), (3, 4), (4, 4)) + verify_matmulinteger16((5, 7), (7, 8), (5, 8)) + verify_matmulinteger16((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4)) + verify_matmulinteger16((2, 4, 3), (3, 4), (2, 4, 4)) + verify_matmulinteger16((2, 3, 4, 3), (3, 4), (2, 3, 4, 4)) + def verify_simple_dynamic_model(a_shape, b_shape, target, dev): def verify_model(model, a_shape, b_shape): @@ -4932,7 +4973,6 @@ def verify_eyelike(indata): "test_if_seq", "test_loop11", "test_loop13_seq", - "test_matmulinteger", "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_with_argmax_2d_precomputed_pads", @@ -5801,6 +5841,7 @@ def repeat(N, D): test_onehot() test_gemm() test_matmul() + test_matmulinteger16() test_gather() test_gatherelements() test_gather_nd() From d6503bf4b27d3db08ad1b05d021c1d4f55ed3cad Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Mon, 4 Oct 2021 05:09:32 -0500 Subject: [PATCH 02/11] Fix formatting errors --- python/tvm/relay/frontend/onnx.py | 3 ++- tests/python/frontend/onnx/test_forward.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 7c5b9461c4f9..7e4d033b5725 100644 --- a/python/tvm/relay/frontend/onnx.py +++ 
b/python/tvm/relay/frontend/onnx.py @@ -891,8 +891,9 @@ def _impl_v10(cls, inputs, attr, params): out_dtype = "int32" # Set output data type as uint32 when both inputs are uint16 if a_dtype == "uint16" and b_dtype == "uint16": - out_dtype = "uint32" + out_dtype = "uint32" if a_rank > 2 or b_rank > 2: + def flatten_to_nd(x, x_shape, nd=3): ndims = infer_shape(x_shape)[0] if ndims == nd: diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 6d777eb017b8..703e69e981a2 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1281,6 +1281,7 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, convert_config=None): convert_config={"use_nt_batch_matmul": False}, ) + @tvm.testing.parametrize_targets def test_matmulinteger16(target, dev): def verify_matmulinteger16(a_shape, b_shape, out_shape): @@ -1295,10 +1296,7 @@ def verify_matmulinteger16(a_shape, b_shape, out_shape): a_array = np.random.randint(low, high, size=a_shape).astype(a_dtype) b_array = np.random.randint(low, high, size=b_shape).astype(b_dtype) - mul_node = helper.make_node("MatMulInteger16", - ["a", "b"], - ["out"], - domain="com.microsoft") + mul_node = helper.make_node("MatMulInteger16", ["a", "b"], ["out"], domain="com.microsoft") graph = helper.make_graph( [mul_node], From 77c1be7c97fa3253292942b66259f6501145ecce Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Mon, 4 Oct 2021 09:06:49 -0500 Subject: [PATCH 03/11] Remove a code comment and do not set default value of nd --- python/tvm/relay/frontend/onnx.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 7e4d033b5725..549b54790624 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -889,12 +889,11 @@ def _impl_v10(cls, inputs, attr, params): assert a_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for first input" assert b_dtype in ("int16", "uint16"), "MatMulInteger16: invalid dtype for second input" out_dtype = "int32" - # Set output data type as uint32 when both inputs are uint16 if a_dtype == "uint16" and b_dtype == "uint16": out_dtype = "uint32" if a_rank > 2 or b_rank > 2: - def flatten_to_nd(x, x_shape, nd=3): + def flatten_to_nd(x, x_shape, nd): ndims = infer_shape(x_shape)[0] if ndims == nd: return x From ccbe433937c0661414a5e62e1f6321058057e158 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Mon, 4 Oct 2021 11:06:45 -0500 Subject: [PATCH 04/11] Move flatten_to_nd function outside matmul to be used across multiple functions --- python/tvm/relay/frontend/onnx.py | 45 ++++++++-------------- tests/python/frontend/onnx/test_forward.py | 1 + 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 549b54790624..9e39d3fe1105 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -212,6 +212,21 @@ def get_scalar(x, params, dtype="float32"): return _op.cast(x, dtype) +def flatten_to_nd(x, x_shape, nd=3): + ndims = infer_shape(x_shape)[0] + if ndims == nd: + return x + newshape = _op.concatenate( + [ + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), + ], + 0, + ) + out = _op.reshape(x, fold_constant(newshape)) + return out + + class OnnxOpConverter(object): """A helper class for holding onnx op converters.""" @@ -803,21 +818,6 @@ 
def _impl_v1(cls, inputs, attr, params): b_rank = infer_shape(b_shape)[0] # When performing a batch matmul, we need to properly handle N-dim shapes. if a_rank > 2 or b_rank > 2: - - def flatten_to_nd(x, x_shape, nd=3): - ndims = infer_shape(x_shape)[0] - if ndims == nd: - return x - newshape = _op.concatenate( - [ - _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), - _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), - ], - 0, - ) - out = _op.reshape(x, fold_constant(newshape)) - return out - b_type = infer_type(inputs[1]) # Convert to dense if the second matrix is 2d and non-dynamic if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): @@ -892,21 +892,6 @@ def _impl_v10(cls, inputs, attr, params): if a_dtype == "uint16" and b_dtype == "uint16": out_dtype = "uint32" if a_rank > 2 or b_rank > 2: - - def flatten_to_nd(x, x_shape, nd): - ndims = infer_shape(x_shape)[0] - if ndims == nd: - return x - newshape = _op.concatenate( - [ - _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), - _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), - ], - 0, - ) - out = _op.reshape(x, fold_constant(newshape)) - return out - b_type = infer_type(inputs[1]) # Convert to dense if the second matrix is 2d and non-dynamic if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 703e69e981a2..d1e987a30a8f 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4971,6 +4971,7 @@ def verify_eyelike(indata): "test_if_seq", "test_loop11", "test_loop13_seq", + "test_matmulinteger", "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_with_argmax_2d_precomputed_pads", From 69f245351b435df5f124a23462fe674578c8a7ee Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Mon, 4 Oct 2021 23:44:08 -0500 Subject: [PATCH 05/11] Add function docstring and describe the tests --- python/tvm/relay/frontend/onnx.py | 1 + tests/python/frontend/onnx/test_forward.py | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 9e39d3fe1105..c1e282a5fb52 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -213,6 +213,7 @@ def get_scalar(x, params, dtype="float32"): def flatten_to_nd(x, x_shape, nd=3): + """Helper to flatten multi dimensional arrays to specific dimension""" ndims = infer_shape(x_shape)[0] if ndims == nd: return x diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index d1e987a30a8f..b018e558ca34 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1311,13 +1311,15 @@ def verify_matmulinteger16(a_shape, b_shape, out_shape): model = helper.make_model(graph, producer_name="matmuli16_test") verify_with_ort_with_inputs(model, [a_array, b_array], target=target, dev=dev) - # Working tests - verify_matmulinteger16((2, 4, 3), (1, 3, 4), (2, 4, 4)) - verify_matmulinteger16((1, 4, 3), (2, 3, 4), (2, 4, 4)) + # 2D computation to verify matmul op verify_matmulinteger16((4, 3), (3, 4), (4, 4)) verify_matmulinteger16((5, 7), (7, 8), (5, 8)) - verify_matmulinteger16((2, 3, 4, 3), (2, 3, 3, 4), (2, 3, 4, 4)) - verify_matmulinteger16((2, 4, 3), (3, 4), (2, 4, 4)) + # Verify 3D matmul using batch_matmul op + verify_matmulinteger16((2, 4, 3), (1, 3, 4), (2, 4, 4)) + 
verify_matmulinteger16((1, 4, 3), (2, 3, 4), (2, 4, 4)) + # Test implicit broadcasting + verify_matmulinteger16((2, 3, 5, 3), (2, 3, 3, 5), (2, 3, 5, 5)) + verify_matmulinteger16((2, 7, 3), (3, 7), (2, 7, 7)) verify_matmulinteger16((2, 3, 4, 3), (3, 4), (2, 3, 4, 4)) From 1aa71eb072ec945b62802cc6faa023d0739aabf9 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Thu, 7 Oct 2021 12:44:16 -0500 Subject: [PATCH 06/11] Use max/min value of int16 as high/low while generating input vectors --- tests/python/frontend/onnx/test_forward.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index b018e558ca34..c2fa08c2d0f9 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -1287,8 +1287,8 @@ def test_matmulinteger16(target, dev): def verify_matmulinteger16(a_shape, b_shape, out_shape): a_dtype = "int16" b_dtype = "int16" - low = -10 - high = 10 + low = np.iinfo(np.int16).min + high = np.iinfo(np.int16).max a_proto = TensorProto.INT16 b_proto = TensorProto.INT16 From b2f7437c9252054cc180e3858c2fd4ad26094b16 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Thu, 18 Nov 2021 01:41:54 -0600 Subject: [PATCH 07/11] Converge MatMul and MatMulInteger16 ops into a single op using output dtype --- python/tvm/relay/frontend/onnx.py | 198 +++++++++++------------------- 1 file changed, 72 insertions(+), 126 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index c1e282a5fb52..9b99dbed4cb7 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -212,20 +212,75 @@ def get_scalar(x, params, dtype="float32"): return _op.cast(x, dtype) -def flatten_to_nd(x, x_shape, nd=3): - """Helper to flatten multi dimensional arrays to specific dimension""" - ndims = infer_shape(x_shape)[0] - if ndims == nd: - return x - newshape = _op.concatenate( - [ - _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), - _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), - ], - 0, - ) - out = _op.reshape(x, fold_constant(newshape)) - return out +def matmul_out_dtype(inputs, out_dtype): + """Common function to handle MatMul and MatMulInteger16""" + a_shape = shape_of(inputs[0]) + a_rank = infer_shape(a_shape)[0] + b_shape = shape_of(inputs[1]) + b_rank = infer_shape(b_shape)[0] + if a_rank > 2 or b_rank > 2: + def flatten_to_nd(x, x_shape, nd=3): + ndims = infer_shape(x_shape)[0] + if ndims == nd: + return x + newshape = _op.concatenate( + [ + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), + ], + 0, + ) + out = _op.reshape(x, fold_constant(newshape)) + return out + + b_type = infer_type(inputs[1]) + # Convert to dense if the second matrix is 2d and non-dynamic + if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): + a = flatten_to_nd(inputs[0], a_shape, 2) + b = _op.transpose(inputs[1]) + output = _op.nn.dense(a, b, out_dtype=out_dtype) + else: + # Convert a and b into 3 dimensional tensors. + a = flatten_to_nd(inputs[0], a_shape, 3) + b = flatten_to_nd(inputs[1], b_shape, 3) + # Perform a NN batch matmul. + output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False) + # Determine the output batch dimension. 
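+        # The higher-rank input supplies the batch dimensions; on a rank tie,
+        # each batch dimension is the elementwise max of the two input shapes.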
+ if a_rank > b_rank: + out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) + elif a_rank < b_rank: + out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) + # If its unclear how broadcasting should be applied, the output + # shape is determined by choosing the maximum value from each input. + else: + out_batch = _op.concatenate( + [ + _op.maximum( + _op.strided_slice(a_shape, [i], [i + 1]), + _op.strided_slice(b_shape, [i], [i + 1]), + ) + for i in range(a_rank - 2) + ], + 0, + ) + # Reshape output to original dimensions. + final_shape = _op.concatenate( + [ + out_batch, + _op.strided_slice( + a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] + ), + _op.strided_slice( + b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] + ), + ], + 0, + ) + return _op.reshape(output, fold_constant(final_shape)) + # Otherwise a simple dense op will get the job done. + input_1_t = _op.transpose(inputs[1], axes=(1, 0)) + return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype) + class OnnxOpConverter(object): @@ -813,65 +868,7 @@ class MatMul(OnnxOpConverter): def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs)) # Need to check input shape as batch matmul must be supported. - a_shape = shape_of(inputs[0]) - a_rank = infer_shape(a_shape)[0] - b_shape = shape_of(inputs[1]) - b_rank = infer_shape(b_shape)[0] - # When performing a batch matmul, we need to properly handle N-dim shapes. - if a_rank > 2 or b_rank > 2: - b_type = infer_type(inputs[1]) - # Convert to dense if the second matrix is 2d and non-dynamic - if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): - a = flatten_to_nd(inputs[0], a_shape, 2) - b = _op.transpose(inputs[1]) - output = _op.nn.dense(a, b) - else: - # Convert a and b into 3 dimensional tensors. - a = flatten_to_nd(inputs[0], a_shape, 3) - b = flatten_to_nd(inputs[1], b_shape, 3) - if ONNX_DEFAULT_CONFIGS["use_nt_batch_matmul"]: - # Transpose matrix dimensions of b. - b = _op.transpose(b, [0, 2, 1]) - # Perform a NT batch matmul. - output = _op.nn.batch_matmul(a, b) - else: - # Perform a NN batch matmul. - output = _op.nn.batch_matmul(a, b, transpose_b=False) - # Determine the output batch dimension. - if a_rank > b_rank: - out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) - elif a_rank < b_rank: - out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) - # If its unclear how broadcasting should be applied, the output - # shape is determined by choosing the maximum value from each input. - else: - out_batch = _op.concatenate( - [ - _op.maximum( - _op.strided_slice(a_shape, [i], [i + 1]), - _op.strided_slice(b_shape, [i], [i + 1]), - ) - for i in range(a_rank - 2) - ], - 0, - ) - # Reshape output to original dimensions. - final_shape = _op.concatenate( - [ - out_batch, - _op.strided_slice( - a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] - ), - _op.strided_slice( - b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] - ), - ], - 0, - ) - return _op.reshape(output, fold_constant(final_shape)) - # Otherwise a simple dense op will get the job done. 
- input_1_t = _op.transpose(inputs[1], axes=(1, 0)) - return _op.nn.dense(inputs[0], input_1_t) + return matmul_out_dtype(inputs, out_dtype=infer_type(inputs[0]).checked_type.dtype) class MatMulInteger16(OnnxOpConverter): @@ -879,11 +876,7 @@ class MatMulInteger16(OnnxOpConverter): @classmethod def _impl_v10(cls, inputs, attr, params): - assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs)) - a_shape = shape_of(inputs[0]) - a_rank = infer_shape(a_shape)[0] - b_shape = shape_of(inputs[1]) - b_rank = infer_shape(b_shape)[0] + assert len(inputs) == 2, "MatMulInteger16 op take 2 inputs, {} given".format(len(inputs)) a_dtype = infer_type(inputs[0]).checked_type.dtype b_dtype = infer_type(inputs[1]).checked_type.dtype # Check input data types @@ -892,54 +885,7 @@ def _impl_v10(cls, inputs, attr, params): out_dtype = "int32" if a_dtype == "uint16" and b_dtype == "uint16": out_dtype = "uint32" - if a_rank > 2 or b_rank > 2: - b_type = infer_type(inputs[1]) - # Convert to dense if the second matrix is 2d and non-dynamic - if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): - a = flatten_to_nd(inputs[0], a_shape, 2) - b = _op.transpose(inputs[1]) - output = _op.nn.dense(a, b, out_dtype=out_dtype) - else: - # Convert a and b into 3 dimensional tensors. - a = flatten_to_nd(inputs[0], a_shape, 3) - b = flatten_to_nd(inputs[1], b_shape, 3) - # Perform a NN batch matmul. - output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False) - # Determine the output batch dimension. - if a_rank > b_rank: - out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) - elif a_rank < b_rank: - out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) - # If its unclear how broadcasting should be applied, the output - # shape is determined by choosing the maximum value from each input. - else: - out_batch = _op.concatenate( - [ - _op.maximum( - _op.strided_slice(a_shape, [i], [i + 1]), - _op.strided_slice(b_shape, [i], [i + 1]), - ) - for i in range(a_rank - 2) - ], - 0, - ) - # Reshape output to original dimensions. 
- final_shape = _op.concatenate( - [ - out_batch, - _op.strided_slice( - a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] - ), - _op.strided_slice( - b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] - ), - ], - 0, - ) - return _op.reshape(output, fold_constant(final_shape)) - # Use relay matmul - return _op.nn.matmul(inputs[0], inputs[1], out_dtype=out_dtype) - + return matmul_out_dtype(inputs, out_dtype) class Mod(OnnxOpConverter): """Operator converter for Mod.""" From 446fccea95f094468dd4d6d30a3b23983e8a0f36 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Thu, 18 Nov 2021 01:49:11 -0600 Subject: [PATCH 08/11] Fix indentation issues --- python/tvm/relay/frontend/onnx.py | 126 +++++++++++++++--------------- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 9b99dbed4cb7..65481cd509c5 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -213,73 +213,73 @@ def get_scalar(x, params, dtype="float32"): def matmul_out_dtype(inputs, out_dtype): - """Common function to handle MatMul and MatMulInteger16""" - a_shape = shape_of(inputs[0]) - a_rank = infer_shape(a_shape)[0] - b_shape = shape_of(inputs[1]) - b_rank = infer_shape(b_shape)[0] - if a_rank > 2 or b_rank > 2: - def flatten_to_nd(x, x_shape, nd=3): - ndims = infer_shape(x_shape)[0] - if ndims == nd: - return x - newshape = _op.concatenate( - [ - _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), - _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), - ], - 0, - ) - out = _op.reshape(x, fold_constant(newshape)) - return out - - b_type = infer_type(inputs[1]) - # Convert to dense if the second matrix is 2d and non-dynamic - if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): - a = flatten_to_nd(inputs[0], a_shape, 2) - b = _op.transpose(inputs[1]) - output = _op.nn.dense(a, b, out_dtype=out_dtype) - else: - # Convert a and b into 3 dimensional tensors. - a = flatten_to_nd(inputs[0], a_shape, 3) - b = flatten_to_nd(inputs[1], b_shape, 3) - # Perform a NN batch matmul. - output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False) - # Determine the output batch dimension. - if a_rank > b_rank: - out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) - elif a_rank < b_rank: - out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) - # If its unclear how broadcasting should be applied, the output - # shape is determined by choosing the maximum value from each input. - else: - out_batch = _op.concatenate( + """Common function to handle MatMul and MatMulInteger16""" + a_shape = shape_of(inputs[0]) + a_rank = infer_shape(a_shape)[0] + b_shape = shape_of(inputs[1]) + b_rank = infer_shape(b_shape)[0] + if a_rank > 2 or b_rank > 2: + def flatten_to_nd(x, x_shape, nd=3): + ndims = infer_shape(x_shape)[0] + if ndims == nd: + return x + newshape = _op.concatenate( [ - _op.maximum( - _op.strided_slice(a_shape, [i], [i + 1]), - _op.strided_slice(b_shape, [i], [i + 1]), - ) - for i in range(a_rank - 2) + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), ], 0, ) - # Reshape output to original dimensions. 
- final_shape = _op.concatenate( - [ - out_batch, - _op.strided_slice( - a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] - ), - _op.strided_slice( - b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] - ), - ], - 0, - ) - return _op.reshape(output, fold_constant(final_shape)) - # Otherwise a simple dense op will get the job done. - input_1_t = _op.transpose(inputs[1], axes=(1, 0)) - return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype) + out = _op.reshape(x, fold_constant(newshape)) + return out + + b_type = infer_type(inputs[1]) + # Convert to dense if the second matrix is 2d and non-dynamic + if b_rank == 2 and not _ty.is_dynamic(b_type.checked_type): + a = flatten_to_nd(inputs[0], a_shape, 2) + b = _op.transpose(inputs[1]) + output = _op.nn.dense(a, b, out_dtype=out_dtype) + else: + # Convert a and b into 3 dimensional tensors. + a = flatten_to_nd(inputs[0], a_shape, 3) + b = flatten_to_nd(inputs[1], b_shape, 3) + # Perform a NN batch matmul. + output = _op.nn.batch_matmul(a, b, out_dtype=out_dtype, transpose_b=False) + # Determine the output batch dimension. + if a_rank > b_rank: + out_batch = _op.strided_slice(a_shape, [0], [a_rank - 2]) + elif a_rank < b_rank: + out_batch = _op.strided_slice(b_shape, [0], [b_rank - 2]) + # If its unclear how broadcasting should be applied, the output + # shape is determined by choosing the maximum value from each input. + else: + out_batch = _op.concatenate( + [ + _op.maximum( + _op.strided_slice(a_shape, [i], [i + 1]), + _op.strided_slice(b_shape, [i], [i + 1]), + ) + for i in range(a_rank - 2) + ], + 0, + ) + # Reshape output to original dimensions. + final_shape = _op.concatenate( + [ + out_batch, + _op.strided_slice( + a_shape, [infer_shape(a_shape)[0] - 2], [infer_shape(a_shape)[0] - 1] + ), + _op.strided_slice( + b_shape, [infer_shape(b_shape)[0] - 1], [infer_shape(b_shape)[0]] + ), + ], + 0, + ) + return _op.reshape(output, fold_constant(final_shape)) + # Otherwise a simple dense op will get the job done. 
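+    # relay's nn.dense expects the second operand as (N, K), hence the transpose.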
+ input_1_t = _op.transpose(inputs[1], axes=(1, 0)) + return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype) From 0324148eef6000ddfdb4bb775cd48ce77e166937 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Thu, 18 Nov 2021 04:23:34 -0600 Subject: [PATCH 09/11] Formatting changes --- python/tvm/relay/frontend/onnx.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 65481cd509c5..a1eeafee1553 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -219,19 +219,20 @@ def matmul_out_dtype(inputs, out_dtype): b_shape = shape_of(inputs[1]) b_rank = infer_shape(b_shape)[0] if a_rank > 2 or b_rank > 2: + def flatten_to_nd(x, x_shape, nd=3): - ndims = infer_shape(x_shape)[0] - if ndims == nd: - return x - newshape = _op.concatenate( - [ - _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), - _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), - ], - 0, - ) - out = _op.reshape(x, fold_constant(newshape)) - return out + ndims = infer_shape(x_shape)[0] + if ndims == nd: + return x + newshape = _op.concatenate( + [ + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - nd + 1], [ndims]), + ], + 0, + ) + out = _op.reshape(x, fold_constant(newshape)) + return out b_type = infer_type(inputs[1]) # Convert to dense if the second matrix is 2d and non-dynamic @@ -282,7 +283,6 @@ def flatten_to_nd(x, x_shape, nd=3): return _op.nn.dense(inputs[0], input_1_t, out_dtype=out_dtype) - class OnnxOpConverter(object): """A helper class for holding onnx op converters.""" @@ -887,6 +887,7 @@ def _impl_v10(cls, inputs, attr, params): out_dtype = "uint32" return matmul_out_dtype(inputs, out_dtype) + class Mod(OnnxOpConverter): """Operator converter for Mod.""" From d5d7b307636f9ffa41bb87b5350ff06834509339 Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Sun, 21 Nov 2021 01:36:31 -0600 Subject: [PATCH 10/11] Fix CUDA batchmatmul strategy to allow mixed precision --- python/tvm/relay/op/strategy/cuda.py | 2 +- tests/python/frontend/onnx/test_forward.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index da7cbd5cec10..f17903806b27 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -839,7 +839,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): ) else: strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul), + wrap_compute_batch_matmul(topi.cuda.batch_matmul, need_out_dtype=True), wrap_topi_schedule(topi.cuda.schedule_batch_matmul), name="batch_matmul.cuda", plevel=10, diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index c2fa08c2d0f9..dcbd662ec219 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -4973,7 +4973,6 @@ def verify_eyelike(indata): "test_if_seq", "test_loop11", "test_loop13_seq", - "test_matmulinteger", "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_with_argmax_2d_precomputed_pads", From c0b868f92047c4ca957d5649a52b8dd957f9f3cf Mon Sep 17 00:00:00 2001 From: Abhikrant Sharma Date: Sun, 21 Nov 2021 08:09:51 -0600 Subject: [PATCH 11/11] Add test_matmulinteger to unsupported_onnx_tests --- tests/python/frontend/onnx/test_forward.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index ef1d9f4ab3e4..01b570cdc245 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -5009,6 +5009,7 @@ def verify_eyelike(indata): "test_loop11", "test_loop13_seq", "test_lstm_batchwise", + "test_matmulinteger", "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_with_argmax_2d_precomputed_pads",
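---

Usage sketch (illustrative only, not part of the patch series): the snippet
below mirrors the new test to exercise the converter end to end — build a
com.microsoft MatMulInteger16 node with onnx.helper, import it through
relay.frontend.from_onnx, run it, and check the int32 accumulation against
NumPy. The shapes, graph names, and the graph-executor/LLVM target are
assumptions made for this example, not anything the patches prescribe.

    import numpy as np
    from onnx import TensorProto, helper

    import tvm
    from tvm import relay

    a_shape, b_shape, out_shape = (4, 3), (3, 4), (4, 4)
    node = helper.make_node("MatMulInteger16", ["a", "b"], ["out"], domain="com.microsoft")
    graph = helper.make_graph(
        [node],
        "matmuli16_example",
        inputs=[
            helper.make_tensor_value_info("a", TensorProto.INT16, list(a_shape)),
            helper.make_tensor_value_info("b", TensorProto.INT16, list(b_shape)),
        ],
        outputs=[helper.make_tensor_value_info("out", TensorProto.INT32, list(out_shape))],
    )
    model = helper.make_model(graph, producer_name="matmuli16_example")

    # int16 x int16 accumulates into int32; uint16 x uint16 would give uint32.
    mod, params = relay.frontend.from_onnx(model, shape={"a": a_shape, "b": b_shape})

    a = np.random.randint(-10, 10, size=a_shape).astype("int16")
    b = np.random.randint(-10, 10, size=b_shape).astype("int16")
    out = relay.create_executor("graph", mod=mod, device=tvm.cpu(), target="llvm").evaluate()(a, b)
    np.testing.assert_array_equal(out.numpy(), a.astype("int32") @ b.astype("int32"))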