From 7e3bfbda34ceeea179ae63545950050d89d06b45 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 6 Oct 2020 14:53:46 +0100 Subject: [PATCH] Add dot product support for quantized convolution. (#6445) * Add dot product support for quantized convolution. We added two new intrinsics in: topi/arm_cpu/tensor_intrin.py, namely - mmla4x4: compute a matrix multiplication between tile A(4,4) and tile B(4,4) - mmla16x4: compute a matrix multiplication between tile A(rows,4) and tile B(4,16) Then we used those intrinsics in two separate strategies. We added the strategies in topi/arm_cpu/conv2d_int8.py and implemented the schedules in topi/arm_cpu/conv2d_gemm.py. In particular: - schedule_conv2d_gemm, when accelerated, packs matrix A, compute GEMM, and unpack the resulting matrix. This uses the mmla4x4 intrinsic - schedule_conv2d_gemm_hybrid doesn't do any packing on A and C which are in native form. This uses the mmla16x4 intrinsic Please note that for the limitations of `tensorize` we need to pad matrix A in both cases (when dimensions are not multiple of the tiling shape) Change-Id: Id0d818d84ffc458c6dad7983fd350a0f3d5db395 * Add back nhwc_spatial_pack strategy as default Change-Id: I8b1826a7ae1d742956296e8d157da19955a4942c * Fix linting through Black Change-Id: Ic74ef5461a90bca9f4d4980a214137e384d5f923 * Fix python linting Change-Id: I5fb8a2ae4467a87bd3470f6b3753c074f9b7cc78 * Addressing review comments Change-Id: I284b1f2c121051e672f548d6c6ee2a3267854e31 * Fix black linting issues Change-Id: I1813b0226b536aedee0dce9eeeba27aa2d95518b * Fixing failing test and adding tests for dot-product compilation Change-Id: Ic040722abd5538fccb85af4de922394c939e7000 * Fixing linting and review comments Change-Id: If09e3baa514c85dc78d3c27c2ac2fa2e01773d89 * Fixing black linting and address comments Change-Id: I857b28b6f9b23307d8c1eebc509de6ad2783c756 * Address review comments Change-Id: I63d1a639d4a72abeb33148fd2868cd356ef84122 --- python/tvm/relay/op/strategy/arm_cpu.py | 44 ++- python/tvm/topi/arm_cpu/arm_utils.py | 100 ++++++ python/tvm/topi/arm_cpu/conv2d_alter_op.py | 105 ++++-- python/tvm/topi/arm_cpu/conv2d_gemm.py | 283 ++++++++++++----- python/tvm/topi/arm_cpu/conv2d_int8.py | 120 ++++++- python/tvm/topi/arm_cpu/tensor_intrin.py | 298 +++++++++++++++++- .../python/relay/test_pass_alter_op_layout.py | 2 +- .../topi/python/test_topi_conv2d_int8.py | 105 +++--- 8 files changed, 874 insertions(+), 183 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/arm_utils.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 54624ce35b55..6759a54d0b80 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -135,20 +135,29 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_direct_simd.micro_dev", ) elif kernel_layout == "HWIO": - is_aarch64 = "aarch64" in str(isa.target) - + is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available() + if has_dot_prod and data.dtype in ["int8", "uint8"]: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native.arm_cpu", + ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - 
name="conv2d_NHWC_quantized.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved.arm_cpu", + ) + if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): + # TODO(@giuseros) + # This strategy errors out for quantized data types when tuning. + # Let's use this only for non-aarch64 or non-quantized cases + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu", ) - - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu", - ) else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -328,11 +337,18 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform + native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(native_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native_without_transform.arm_cpu", + ) + strategy.add_implementation( + wrap_compute_conv2d_gemm(interleaved_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu", ) else: raise RuntimeError( diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py new file mode 100644 index 000000000000..7e0f566b96f4 --- /dev/null +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Arm target utility functions""" + +import re +import tvm + + +def get_arch_version(target_mattr): + """Parse the LLVM target -mattr, and return + the architecture version in a decimal representation + (e.g., if -mattr=v8.4a, return 8.4) + """ + + arch_version = 8.0 + m = re.compile(r"\+v(.*)\.(.*)a") + for attr in target_mattr: + match_obj = m.match(attr) + if match_obj: + major = int(match_obj.group(1)) + minor = int(match_obj.group(2)) + decimal = 10 + if minor >= 10: + decimal = 100 + arch_version = major + float(minor) / decimal + + return arch_version + + +def is_dotprod_available(): + """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + target = tvm.target.Target.current(allow_none=False) + arch_version = get_arch_version(target.mattr) + return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr) + + +def is_aarch64_arm(): + """ Checks whether we are compiling for an AArch64 target. """ + target = tvm.target.Target.current(allow_none=False) + return "aarch64" in target.attrs.get("mtriple", "") + + +def get_tiling_B_interleaved_t(interleave_A): + """Compute the tiling information for matrix B', where B' + is the transposed and interleaved version of matrix B in C=A*B. + + The tiling information is chosen to maximize register usage during the + tile computation. + + Please refer to: + - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + In order to have more information + + Parameters + ---------- + interleave_A: bool + determines if A is expected to be interleaved + + Returns + ---------- + tile_rows_B: the output tile rows of B' + tile_cols_B: the output tile columns of B' + """ + if is_dotprod_available(): + # The number of tile rows of B' vary depending on the + # strategy: + # * If we are interleaving A, then we select 12 columns from B'(i.e., + # 12 rows from B). + # * If we are not interleaving A, then we select 16 columns from B'(i.e., + # 16 rows from B). + tile_rows_B = 12 if interleave_A else 16 + + # Dot product instruction groups 2 (u)int16x8 vectors in + # groups of 4 and compute the dot product among those groups + # This means that the number of columns in a tile of B' (i.e., the + # rows of the original matrix B) need to be 4. + tile_cols_B = 4 + else: + # If dot product is not available, A must be interleaved. In this case + # we load 4 rows of B' (i.e., 4 columns of B). 
Each of them will contain 16 elements + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index 7bf7e42237d2..a64bc413e0c6 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,10 +27,64 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config +from .arm_utils import get_tiling_B_interleaved_t logger = logging.getLogger("topi") +def interleave_transpose_weights(inputs, data, kernel, interleave_A): + """Transform the weight matrix by reshaping, interleaving and transposing it + + Parameters + ---------- + inputs : tvm.relay.Expr + Grouped input symbols + data : + Input shape and dtype + kernel : + Input shape and dtype + interleave_A: indicates if we expect matrix A to be interleaved + + Returns + ---------- + new_kernel : tvm.te.placeholder + A placeholder with the new shape + new_kernel_expr : tvm.relay.Expr + The relay expression of the weights + """ + assert ( + data.dtype == "int8" + and kernel.dtype == "int8" + or data.dtype == "uint8" + and kernel.dtype == "uint8" + ) + + KH, KW, IC, OC = get_const_tuple(kernel.shape) + K = KH * KW * IC + N = OC + + # Get tiling information for the interleaved transposed version of B + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) + + pad_K = 0 + pad_N = 0 + + if N % tile_rows_B != 0: + pad_N = tile_rows_B - (N % tile_rows_B) + if K % tile_cols_B != 0: + pad_K = tile_cols_B - (K % tile_cols_B) + + N_padded = N + pad_N + K_padded = K + pad_K + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform( + inputs[1], tile_rows_B, tile_cols_B + ) + new_kernel = te.placeholder( + (N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype + ) + return new_kernel, new_kernel_expr + + @conv2d_alter_layout.register(["arm_cpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) @@ -279,36 +333,13 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": - assert ( - data.dtype == "int8" - and kernel.dtype == "int8" - or data.dtype == "uint8" - and kernel.dtype == "uint8" - ) + if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - K = KH * KW * IC - N = OC - - tile_rows = 4 - tile_cols = 16 - pad_K = 0 - pad_N = 0 - - if N % tile_rows != 0: - pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) - - N_padded = N + pad_N - K_padded = K + pad_K - kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_rows, tile_cols) - new_kernel = te.placeholder( - (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols), kernel.dtype + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( + inputs, data, kernel, interleave_A=True ) - - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" new_workload = autotvm.task.args_to_workload( [data, new_kernel, strides, padding, dilation, 
out_dtype, (KH, KW), OC], new_workload_name, @@ -316,7 +347,21 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_gemm_without_weight_transform( - inputs[0], kernel_expr, **new_attrs + inputs[0], new_kernel_expr, **new_attrs + ) + if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( + inputs, data, kernel, interleave_A=False + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs ) - return None diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 7f73cc828fa7..b40fb89b5d33 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,18 +23,52 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl +from .tensor_intrin import ( + gemm_quantized, + gemm_quantized_impl, + gemm_acc_4x4_int8_int8_int32, + gemm_acc_nx16_int8_int8_int32, +) +from .arm_utils import is_aarch64_arm, is_dotprod_available -def is_aarch64_arm(): - """ Checks whether we are compiling for an AArch64 target. """ - target = tvm.target.Target.current(allow_none=False) - return "aarch64" in target.attrs.get("mtriple", "") +def configure_knobs(cfg, M, K): + """ Configure auto-tuning knobs for the interleaved strategy """ + + x, y = cfg.axis(M // 4), cfg.axis(K // 16) + cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) + + outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) + cfg.define_annotate( + "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + ) + + # Fallback configuration + if cfg.is_fallback: + cfg["reorder_gemm"] = ReorderEntity([0, 1]) + cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) + + if not is_dotprod_available(): + cfg.define_knob("gemm_quantized_unroll", [True, False]) + cfg.define_knob("gemm_quantized_interleave", [True, False]) + + if cfg.is_fallback: + cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) + cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) # Compute function def compute_conv2d_gemm_without_weight_transform( - cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels + cfg, + data, + B_interleaved_t, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, ): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" @@ -42,8 +76,7 @@ def compute_conv2d_gemm_without_weight_transform( KH, KW = get_const_tuple(kernel_size) OC = get_const_int(output_channels) - - K_AREA = KH * KW + kernel_area = KH * KW if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -67,18 +100,14 @@ def compute_conv2d_gemm_without_weight_transform( else: data_pad = data - # --- Im2col + # Im2col M = OH * OW - K = IC * K_AREA + K = IC * kernel_area N 
= OC A_shape = (batches, M, K) - if K_AREA == 1: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], - name="data_flatten", - ) + if kernel_area == 1: + A = tvm.topi.reshape(data_pad, A_shape) else: A = te.compute( A_shape, @@ -90,82 +119,117 @@ def compute_conv2d_gemm_without_weight_transform( ], name="data_im2col", ) - N_transformed = B_interleaved_t.shape[0] - # --- Pad if necessary - idxm = tvm.tir.indexmod + # Pad if necessary + N_transformed = B_interleaved_t.shape[0] + tile_rows_B = B_interleaved_t.shape[2] + tile_cols_B = B_interleaved_t.shape[3] + + # Select the tiling strategy for A. + # The tiling information is chosen to maximize register usage during + # the tile computation. + # + # Please refer to: + # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + # In order to have more information + # + if is_dotprod_available() and interleave_A: + # If dot product has been enabled, and we are interleaving A + # tile size should be 8x4 + tile_rows_A = 8 + tile_cols_A = 4 + else: + # If either there is no dot product or if we are using a native strategy + # tile size should be 4x16 + tile_rows_A = 4 + tile_cols_A = 16 - pad_m = 0 - pad_k = 0 + pad_M = 0 + pad_K = 0 - if M % 4 != 0: - pad_m = 4 - (M % 4) + if M % tile_rows_A != 0: + pad_M = tile_rows_A - (M % tile_rows_A) - if K % 16 != 0: - pad_k = 16 - (K % 16) + if K % tile_cols_A != 0: + pad_K = tile_cols_A - (K % tile_cols_A) - M_padded = M + pad_m - K_padded = K + pad_k + M_padded = M + pad_M + K_padded = K + pad_K + N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) - pad_after = (0, pad_m, pad_k) + pad_after = (0, pad_M, pad_K) - if pad_m != 0 or pad_k != 0: + if pad_M != 0 or pad_K != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") - # --- GEMM: A*B' + idxm = tvm.tir.indexmod k = te.reduce_axis((0, K_padded), "k") - A_interleaved = te.compute( - (batches, M_padded // 4, K_padded // 16, 4, 16), - lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], - name="A_interleaved", - ) + if interleave_A: + # Configuration space + configure_knobs(cfg, M_padded, K_padded) - C_interleaved = te.compute( - (batches, M_padded // 4, N_transformed, 4, 4), - lambda b, x, y, w, z: te.sum( - A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) - * B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), - axis=k, - ), - name="C_interleaved", - ) + # Pack the input data + A_interleaved = te.compute( + (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), + lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], + name="A_interleaved", + ) + # Execute GEMM + C_interleaved = te.compute( + (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), + lambda b, x, y, w, z: te.sum( + A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") + * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), + axis=k, + ), + name="C_interleaved", + ) + # Unpack the result + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, x // tile_rows_A, y // tile_rows_B, idxm(x, tile_rows_A), idxm(y, tile_rows_B) + ].astype(out_dtype), + name="C", + ) + zero = tvm.tir.const(0) + else: + # No need to pack/unpack, execute GEMM directly + C = te.compute( + (batches, M_padded, N_padded), + lambda b, x, y: te.sum( + A[b, x, k].astype("int32") + 
* B_interleaved_t[ + y // tile_rows_B, k // tile_cols_B, idxm(y, tile_rows_B), idxm(k, tile_cols_B) + ].astype("int32"), + axis=k, + ), + name="C", + ) - # --- Unpack C - C = te.compute( - (batches, M, N), - lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], - name="C", - ) + # We need to ensure that infer bound pass does not remove the padding + # which is necessary for the tensorizations to work. So we need to + # add a dummy reference to the padding area of the result + zero = ( + tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + ) - # --- Produce the conv output + # Reshape the result into a convolution output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name="conv2d_gemm_output") - - # Configuration space - x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16) - cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) - - outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate( - "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + out = te.compute( + out_shape, + lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name="conv2d_gemm_output", ) - cfg.define_knob("gemm_quantized_unroll", [True, False]) - cfg.define_knob("gemm_quantized_interleave", [True, False]) - - # Fallback configuration - if cfg.is_fallback: - cfg["reorder_gemm"] = ReorderEntity([0, 1]) - cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) - cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) return out -# Schedules -def schedule_conv2d_gemm(cfg, s, out, final_out): - """Create schedule for tensors""" +def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): + """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] A_interleaved = C_interleaved.op.input_tensors[0] @@ -193,7 +257,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) - s[C_interleaved].reorder(yi, xi) + b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) s[A_interleaved].compute_at(s[C_interleaved], b_outer_gemm_fused) @@ -204,9 +268,33 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): in_type = A_interleaved.dtype out_type = C.dtype - if is_aarch64_arm() and out_type == "int32": + + k = C_interleaved.op.reduce_axis[0] + _, M, N = C.shape + if is_dotprod_available(): + gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( + xi, yi, x_factor=8, y_factor=4 + ) + k_outer, k_inner = s[C_interleaved].split(k, 4) + xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4) + s[C_interleaved].reorder( + b_outer_gemm_fused, + inner_gemm, + xi_outer, + yi_outer, + k_outer, + xi_inner_outer, + xi_inner_inner, + yi_inner, + k_inner, + ) + s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) + s[C_interleaved].unroll(xi_inner_outer) + + elif is_aarch64_arm(): + s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - _, M, N = C.shape assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" unroll = cfg["gemm_quantized_unroll"].val interleave = 
cfg["gemm_quantized_interleave"].val @@ -225,3 +313,48 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[C].compute_at(s[out], inner) s[out].vectorize(inner) return s + + +def schedule_conv2d_gemm_native(cfg, s, out, final_out): + """ Schedule the conv2d_gemm hybrid strategy """ + C = out.op.input_tensors[0] + A = C.op.input_tensors[0] + in_type = A.dtype + + # Computation + b, x, y = C.op.axis + (k,) = C.op.reduce_axis + k_outer, k_inner = s[C].split(k, 16) + x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) + s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) + gemm_acc = gemm_acc_nx16_int8_int8_int32(in_type, rows=1) + s[C].unroll(x_inner) + s[C].tensorize(y_inner, gemm_acc) + s[C].parallel(x_outer) + + # Input transform + if A.op.name == "A_padded": + padding_A = True + data_im2col = A.op.input_tensors[0] + else: + padding_A = False + data_im2col = A + + b, m, n = data_im2col.op.axis + if data_im2col.op.name == "data_im2col": + n_outer, n_inner = s[data_im2col].split(n, 16) + s[data_im2col].unroll(n_outer) + s[data_im2col].vectorize(n_inner) + s[data_im2col].parallel(m) + elif padding_A: + s[data_im2col].compute_inline() + s[A].compute_at(s[C], x_inner) + else: + s[data_im2col].compute_at(s[C], x_inner) + + # Output transform + if out != final_out: + n, h, w, c = out.op.axis + _, inner = s[out].split(c, 4) + s[out].vectorize(inner) + return s diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 307f9e102acf..43fe80178bd3 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,7 +24,12 @@ from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm +from .conv2d_gemm import ( + compute_conv2d_gemm_without_weight_transform, + schedule_conv2d_gemm_interleaved, + schedule_conv2d_gemm_native, +) +from .arm_utils import get_tiling_B_interleaved_t def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @@ -115,30 +120,46 @@ def traverse(op): return s -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): +def _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, interleave_A +): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows = 4 - tile_cols = 16 - kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols) + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) + + kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) return compute_conv2d_gemm_without_weight_transform( - cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC + cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC, interleave_A ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None +def _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size=None, + output_channels=None, + interleave_A=False, ): - """Compute for conv2d_NHWC_quantized without weight transform.""" return 
compute_conv2d_gemm_without_weight_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): +def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): """Create schedule for tensors""" s = te.create_schedule([x.op for x in outs]) # Vectorize the output and then inline all the rest @@ -153,12 +174,79 @@ def _callback(op): """Traverse operators from computation graph""" if op.name == "conv2d_gemm_output": conv_out = op.output(0) - schedule_conv2d_gemm(cfg, s, conv_out, out) + if interleave_A: + schedule_conv2d_gemm_interleaved(cfg, s, conv_out, out) + else: + schedule_conv2d_gemm_native(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: C = conv_out.op.input_tensors[0] - s[C].compute_at(s[out], inner) + if interleave_A: + s[C].compute_at(s[out], inner) traverse_inline(s, outs[0].op, _callback) return s + + +# Interleaved schedules: those schedule will interleave the input data. The +# weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved""" + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, True + ) + + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True + ) + + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved.arm_cpu") +def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + + +# Native schedules: those schedule won't interleave A (which is left in its native form). 
+# The weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu") +def compute_conv2d_NHWC_quantized_native(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for native compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, False + ) + + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_native_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): + """ Interface for compute_conv2d_NHWC_quantized_native_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + False, + ) + + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu") +def schedule_conv2d_NHWC_quantized_native(cfg, outs): + """ Interface for native schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index e87bdc47d0b0..73cfacb62079 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,12 +417,10 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") - C = te.compute( (te.var("m"), te.var("n")), lambda x, y: te.sum( - A[k // 16, x, idxm(k, 16)].astype(out_type) - * B[k // 16, y, idxm(k, 16)].astype(out_type), + A[k // 16, x, idxm(k, 16)].astype("int32") * B[k // 16, y, idxm(k, 16)].astype("int32"), axis=k, ), name="C", @@ -445,7 +443,7 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): ) c_buffer = tvm.tir.decl_buffer( - C.shape, dtype=out_type, name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] + C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) def _intrin_func(ins, outs): @@ -589,6 +587,298 @@ def _instr(index): ) +def select_word(vec, lane, dtype_vec): + """ + Utility function used to select a int8x4 word within a int8x16 vector + and replicate 4 times. + The pseudo-code for this operation is: + + v = [x0, ..., x15] + vsub(lane) = v[4*lane:4*lane+3] + replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)] + + Note that 0<=lane<4 + + Parameters + ---------- + vec: tvm.tir.Expr + int8x16 vector expression + lane: int + vector lane we want to replicate + dtype_vec: str + vector data type (e.g., int8x16) + + Returns + ---------- + output: tvm.tir.Expr + replicated vector + """ + # Reinterpret vec_a as 4 int32 words + vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) + # Broadcast the lane-th word + vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) + # Convert back to uint8x16 + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled) + return vec_int8_broadcast + + +def gemm_acc_4x4_int8_int8_int32(dtype): + """ + Int8 4x4 matrix multiplication and accumulation using sdot/udot + instructions. This function takes two arrays of int8 datatype + -- A[4][4] and B[4][4] and produces a 4x4 matrix + which is equal to A*B. + + The pseudo code is as follows. + + .. 
code-block:: c + + void gemm_acc_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 C[4][4]){ + for (int i = 0; i < 4; i++){ + for (int j = 0; i < 4; i++){ + for (int k = 0; k < 4; k++){ + C[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * The tiling strategy is picked to maximize register usage. + + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + # This needs to be a variable number of "rows" since TVM + # "thinks" I only need to compute one row because of + # padding + A = te.placeholder((te.var("rows"), 4), dtype, name="A") + B = te.placeholder((4, 4), dtype, name="B") + dtype_vec = dtype + "x16" + + k = te.reduce_axis((0, 4), name="k") + C = te.compute( + (te.var("rows"), 4), + lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) + + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, 4): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) + return ib.get() + # Load all the elements of tile A. + # vec_a = [a, b, c, d, + # e, f, g, h, + # l, m, n, o, + # p, q, r, s]; + vec_a = ins[0].vload([0, 0], dtype_vec) + + # Replicate 4 times the i-th row of A. For instance, + # vec_a[0] = [a, b, c, d, + # a, b, c, d, + # a, b, c, d, + # a, b, c, d,]; + vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + + # Load all the elements of B. Remember that B + # is transposed: + # vec_b = [0, 4, 8, 12, + # 1, 5, 9, 13, + # 2, 6, 10, 14, + # 3, 7, 11, 15,]; + vec_b = ins[1].vload([0, 0], dtype_vec) + + # Execute the dot product + for i in range(0, 4): + vec_c = outs[0].vload([i, 0], "int32x4") + # Compute the product between the i-th row of A + # and all the rows of B. Remember that sdot/udot + # subdive the input vectors in 16 elements + # and then take the dot product among each group. + # The result is stored in a int32x4 register + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*4+c*8+d*12, + # a*1+b*5+c*9+d*13, + # a*2+b*6+c*10+d*14, + # a*3+b*7+c*11+d*15] + vdot = tvm.tir.call_llvm_intrin( + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], + ) + + # Store the result + ib.emit(outs[0].vstore([i, 0], vdot)) + + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + + +def gemm_acc_nx16_int8_int8_int32(dtype, rows): + """ + Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[n][4] and + B[4][16] and produces a rowsx16 matrix which is equal to A*B + The pseudo code is as follows. + + .. 
code-block:: c + + void mmla_nx16_int8_int8_int32(int8 A[n][16], int8 B[4][16][4], int32 output[n][16]){ + for (int i = 0; i < n; i++){ + for (int j = 0; i < 16; i++){ + for (int k = 0; k < 16; k++){ + out[i][j] += A[i][k] * B[k//4][j][k%4] + } + } + } + } + + Notes: + * The rows of matrix B are transposed + * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 + we need 4 tiles of B to compute a single row of the output. The first 4 values of + k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on + * The tiling strategy is picked to maximize register usage. + + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + rows: int + Number of of the output rows "n" + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + A = te.placeholder((rows, 16), dtype, name="A") + B = te.placeholder((4, 16, 4), dtype, name="B") + dtype_vec = dtype + "x16" + idxm = tvm.tir.indexmod + k = te.reduce_axis((0, 16), name="k") + C = te.compute( + (rows, 16), + lambda i, j: te.sum( + A[i, k].astype("int32") * B[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + ), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + B.shape, + dtype, + name="bb_buffer", + offset_factor=1, + strides=[te.var("sb0"), te.var("sb1"), 1], + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) + + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, rows): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) + return ib.get() + # Iterate on the number of rows of the output + for k in range(0, rows): + # Load 16 elements of A + # vec_a = [a, b, c, d, e, f, g, h, l, m, n, o, p, q, r, s]; + vec_a = ins[0].vload([k, 0], dtype_vec) + + # Iterate over each of the 4 rowsx4 tiles of the output + for j in range(0, 4): + # Accumulate over each of the 4 (16x4) tiles contained in B + for i in range(0, 4): + # Replicate a single 4-element group of A (A[k, i:i+4]) + vec_aa = select_word(vec_a, i, dtype_vec) + + # Load 4 rows (each rows with 4 elements) from B (B[i:i+4, j:j+4]) + # vec_b = [0, 16, 32, 48, + # 1, 17, 33, 49, + # 2, 18, 34, 50, + # 3, 19, 35, 51,]; + vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + + # Accumulate in the correct part of the output + vec_c = outs[0].vload([k, 4 * j], "int32x4") + + # Compute the dot product between the rowsx4 tile + # from A and the 4x4 tile from B + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*16+c*32+d*48, + # a*1+b*17+c*33+d*49, + # a*2+b*18+c*34+d*50, + # a*3+b*19+c*35+d*51] + vdot = tvm.tir.call_llvm_intrin( + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa, + ) + ib.emit(outs[0].vstore([k, 4 * j], vdot)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics diff --git 
a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 4d508403570f..9df838bfd717 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1105,7 +1105,7 @@ def _query_inside(self, target, workload): def update(self, target, workload, cfg): key = (str(target), workload) assert workload[2][1] == expected_workload_shape - assert workload[0] == "conv2d_NHWC_quantized_without_transform.arm_cpu" + assert workload[0] == "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" self.memory[key] = cfg def alter_conv2d(attrs, inputs, tinfos, out_type): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 238517e5ed75..b2f9835c3d66 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -56,48 +56,67 @@ def compile_conv2d_NHWC_gemm_int8_arm( W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter,), name="bias", dtype="int8") dtype = "int32" - device = "llvm --device arm_cpu --mtriple aarch64-linux-gnu" - - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): - assert is_aarch64_arm(), "AArch64 target not recognized" - - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) + devices = [ + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + ] + + for device_tuple in devices: + device = device_tuple[0] + compute = device_tuple[1] + schedule = device_tuple[2] + + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) - - if add_bias: - tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: - func = tvm.build( - s, - [A, W, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + tvm.build( + s, + [A, W, bias, C], + device, + 
name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) def verify_conv2d_NHWC_gemm_int8( @@ -155,14 +174,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( + C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) + s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx)